From 35e040c59c3e52c7583f97efd63a7dc795831255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:46:23 +0000 Subject: [PATCH 01/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20T2T-ViT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From dfce126ce662969faeb2f176c6006af0aec13016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:46:56 +0000 Subject: [PATCH 02/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 66f0272ce86deb381238bc7e3c10648aa0970cc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:06 +0000 Subject: [PATCH 03/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?= =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 276054e344e499285a1ffa1fdb344aa9c14a5dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:14 +0000 Subject: [PATCH 04/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 3f8712405b7f0c5eb2238c95dd996f77a158633f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:34 +0000 Subject: [PATCH 05/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/Dockerfile | 5 + .../contrib/cv/classification/T2T-ViT/LICENSE | 201 ++++ .../cv/classification/T2T-ViT/README.md | 165 ++++ .../cv/classification/T2T-ViT/README_raw.md | 202 ++++ .../cv/classification/T2T-ViT/docker_start.sh | 25 + .../contrib/cv/classification/T2T-ViT/main.py | 912 ++++++++++++++++++ .../cv/classification/T2T-ViT/metrics.py | 41 + .../classification/T2T-ViT/modelzoo_level.txt | 3 + 
.../classification/T2T-ViT/npu_fused_adamw.py | 255 +++++ .../classification/T2T-ViT/requirements.txt | 4 + 10 files changed, 1813 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/LICENSE create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README.md create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/metrics.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile new file mode 100644 index 0000000000..7e712fe1a1 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME + +COPY requirements.txt . +RUN pip3.7 install -r requirements.txt \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE new file mode 100644 index 0000000000..753842b672 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README.md b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
new file mode 100644
index 0000000000..9fce4d9354
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
@@ -0,0 +1,165 @@
+# T2T-ViT for PyTorch
+
+- [Overview](概述.md)
+- [Preparing the Training Environment](准备训练环境.md)
+- [Starting Training](开始训练.md)
+- [Training Results](训练结果展示.md)
+- [Release Notes](版本说明.md)
+
+
+
+# Overview
+
+## Summary
+
+T2T-ViT addresses image classification by capturing the local structure of images. ViT splits an image into tokens and models them globally with stacked Transformers, but this loses locality, even though local information such as edges, lines, and textures is important for visual understanding. In addition, the attention backbone of ViT contains redundancy, its features have limited richness, and the model is difficult to train. The paper therefore proposes a Tokens-to-Token (T2T) module that models the local structure of an image, together with a more efficient Transformer backbone design that enriches intermediate features and reduces redundancy. With these changes, a vision Transformer pretrained purely on ImageNet outperforms the CNN-based ResNet architecture, and the design ideas have had a positive influence on later work on vision Transformers.
+
+- Reference implementation:
+
+  ```
+  url=https://github.com/yitu-opensource/T2T-ViT.git
+  commit_id=0f63dc9558f4d192de926504dbddfa1b3f5db6ca
+  ```
+
+- Implementation adapted for Ascend AI Processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/classification
+  ```
+
+- Obtain the code through Git as follows:
+
+  ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the model code path; if this is the only model in the repo, no switch is needed
+  ```
+
+- Alternatively, click "Download Now" to download the source package.
+
+# Preparing the Training Environment
+
+## Preparing the Environment
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Dependency          | Version                                                      |
+  | ------------------- | ------------------------------------------------------------ |
+  | Firmware and driver | [5.1.RC2](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN                | [5.1.RC2](https://www.hiascend.com/software/cann/commercial?version=5.1.RC2) |
+  | PyTorch             | [1.5.0](https://gitee.com/ascend/pytorch/tree/master/)       |
+
+
+- Environment setup guidance.
+
+  See "[Preparing the PyTorch Framework Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)".
+
+- Install the dependencies.
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+   Download the original ImageNet2012 dataset yourself, upload it to any path on the server, and extract it. The dataset directory structure is as follows:
+   ```
+   ├── ImageNet2012
+         ├──train
+              ├──class1
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──class2
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──...
+         ├──val
+              ├──class1
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──class2
+                    │──image1
+                    │──image2
+                    │   ...
+   ```
+
+
+   > **Note:**
+   > The training scripts for this dataset serve only as a reference example.
+
+# Starting Training
+
+## Training the Model
+
+1. Enter the root directory of the extracted source package.
+
+   ```
+   cd /${model_folder_name}
+   ```
+
+2. 
Run the training script.
+
+   The model supports single-machine 1-card training and single-machine 8-card training.
+
+   - Single-machine 1-card training
+
+     Start 1-card training.
+
+     ```
+     bash ./test/train_full_1p.sh --data_path=/data/xxx/
+     ```
+
+   - Single-machine 8-card training
+
+     Start 8-card training.
+
+     ```
+     bash ./test/train_full_8p.sh --data_path=/data/xxx/
+     ```
+
+   Set --data\_path to the dataset path.
+
+   The training script arguments are described below.
+
+   ```
+   Common arguments:
+   --data_path                         // dataset path
+   ```
+
+
+
+Log output paths:
+
+    test/output/device_id/train_${device_id}.log                  # training detail log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_perf.log            # 8p training performance result log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_acc.log             # 8p training accuracy result log
+
+# Training Results
+
+**Table 2** Training results
+
+| Name   | Acc@1 | FPS  | Epochs | AMP_Type |
+| ------ | ----- | ---- | ------ | -------- |
+| GPU-1p | -     | -    | 1      | O1       |
+| GPU-8p | -     | -    | 300    | O1       |
+| NPU-1p | -     | -    | 1      | O1       |
+| NPU-8p | -     | -    | 300    | O1       |
+
+# Release Notes
+
+## Changes
+
+2022.11.22: First release.
+
+## Known Issues
+
+None.
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
new file mode 100644
index 0000000000..d4c537021e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
@@ -0,0 +1,202 @@
+# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, [ICCV 2021](https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.html)
+
+### Update:
+2021/03/11: updated results. Our T2T-ViT-14 with 21.5M parameters now reaches 81.5% top-1 accuracy at 224x224 image resolution, and 83.3% top-1 accuracy at 384x384 resolution.
+
+2021/02/21: T2T-ViT can be trained stably with '--amp' (Automatic Mixed Precision) on most common GPUs: 1080Ti, 2080Ti, TITAN V, V100. On some specific GPUs such as the Tesla T4, 'amp' can cause NaN loss when training T2T-ViT. If you get NaN loss in training, you can disable amp by removing '--amp' from the [training scripts](https://github.com/yitu-opensource/T2T-ViT#train).
+
+2021/01/28: release codes and upload most of the pretrained models of T2T-ViT.
+
+
+## Reference
+If you find this repo useful, please consider citing:
+```
+@InProceedings{Yuan_2021_ICCV,
+    author    = {Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Jiang, Zi-Hang and Tay, Francis E.H. and Feng, Jiashi and Yan, Shuicheng},
+    title     = {Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet},
+    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+    month     = {October},
+    year      = {2021},
+    pages     = {558-567}
+}
+```
+
+Our code is based on the [official imagenet example](https://github.com/pytorch/examples/tree/master/imagenet) by [PyTorch](https://pytorch.org/) and [pytorch-image-models](https://github.com/rwightman/pytorch-image-models) by [Ross Wightman](https://github.com/rwightman).
+
+
+## 1. Requirements
+
+[timm](https://github.com/rwightman/pytorch-image-models): pip install timm==0.3.4
+
+torch>=1.4.0
+
+torchvision>=0.5.0
+
+pyyaml
+
+Data preparation: ImageNet with the following folder structure; you can extract ImageNet with this [script](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4).
+
+```
+│imagenet/
+├──train/
+│  ├── n01440764
+│  │   ├── n01440764_10026.JPEG
+│  │   ├── n01440764_10027.JPEG
+│  │   ├── ......
+│  ├── ......
+├──val/
+│  ├── n01440764
+│  │   ├── ILSVRC2012_val_00000293.JPEG
+│  │   ├── ILSVRC2012_val_00002138.JPEG
+│  │   ├── ......
+│  ├── ......
+```
+
+## 2. T2T-ViT Models
+
+
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | Performer | 81.5 | 21.5M | 4.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar)|
+| T2T-ViT-19 | Performer | 81.9 | 39.2M | 8.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.9_T2T_ViT_19.pth.tar)|
+| T2T-ViT-24 | Performer | 82.3 | 64.1M | 13.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.3_T2T_ViT_24.pth.tar)|
+| T2T-ViT-14, 384 | Performer | 83.3 | 21.7M |  | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/83.3_T2T_ViT_14.pth.tar)|
+| T2T-ViT-24, Token Labeling | Performer | 84.2 | 65M |  | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/84.2_T2T_ViT_24.pth.tar)|
+| T2T-ViT_t-14 | Transformer | 81.7 | 21.5M | 6.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.7_T2T_ViTt_14.pth.tar) |
+| T2T-ViT_t-19 | Transformer | 82.4 | 39.2M | 9.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.4_T2T_ViTt_19.pth.tar) |
+| T2T-ViT_t-24 | Transformer | 82.6 | 64.1M | 15.0G| [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.6_T2T_ViTt_24.pth.tar) |
+
+'T2T-ViT-14, 384' means we train T2T-ViT-14 with an image size of 384 x 384.
+
+'T2T-ViT-24, Token Labeling' means we train T2T-ViT-24 with [Token Labeling](https://github.com/zihangJiang/TokenLabeling). 
+
+The three lite variants of T2T-ViT (compared with MobileNets):
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-7 | Performer | 71.7 | 4.3M | 1.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar)|
+| T2T-ViT-10 | Performer | 75.2 | 5.9M | 1.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar)|
+| T2T-ViT-12 | Performer | 76.5 | 6.9M | 1.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar) |
+
+
+### Usage
+How to use our pretrained T2T-ViT:
+```
+from models.t2t_vit import *
+from utils import load_for_transfer_learning
+
+# create model
+model = t2t_vit_14()
+
+# load the pretrained weights
+# change num_classes based on your dataset; this also works for other image sizes,
+# as the position embedding is interpolated to fit the new resolution
+load_for_transfer_learning(model, '/path/to/pretrained/weights', use_ema=True, strict=False, num_classes=1000)
+```
+
+
+## 3. Validation
+
+Test the T2T-ViT-14 (with Performer in the T2T module):
+
+Download the [T2T-ViT-14](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar) checkpoint, then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 -b 100 --eval_checkpoint path/to/checkpoint
+```
+The results look like:
+
+```
+Test: [   0/499]  Time: 2.083 (2.083)  Loss:  0.3578 (0.3578)  Acc@1: 96.0000 (96.0000)  Acc@5: 99.0000 (99.0000)
+Test: [  50/499]  Time: 0.166 (0.202)  Loss:  0.5823 (0.6404)  Acc@1: 85.0000 (86.1569)  Acc@5: 99.0000 (97.5098)
+...
+Test: [ 499/499]  Time: 0.272 (0.172)  Loss:  1.3983 (0.8261)  Acc@1: 62.0000 (81.5000)  Acc@5: 93.0000 (95.6660)
+Top-1 accuracy of the model is: 81.5%
+
+```
+
+Test the three lite variants T2T-ViT-7, T2T-ViT-10, and T2T-ViT-12 (with Performer in the T2T module):
+
+Download the [T2T-ViT-7](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar), [T2T-ViT-10](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar) or [T2T-ViT-12](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar) checkpoint, then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_7 -b 100 --eval_checkpoint path/to/checkpoint
+```
+
+Test the model T2T-ViT-14, 384 with 83.3% top-1 accuracy:
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 --img-size 384 -b 100 --eval_checkpoint path/to/T2T-ViT-14-384
+```
+
+
+## 4. Train
+
+Train the three lite variants T2T-ViT-7, T2T-ViT-10 and T2T-ViT-12 (with Performer in the T2T module):
+
+If only 4 GPUs are available:
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_7 -b 128 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+The top-1 accuracy with 4 GPUs is slightly lower than with 8 GPUs (by around 0.1%-0.3%).
+
+If 8 GPUs are available:
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_7 -b 64 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+
+Train T2T-ViT-14 and T2T-ViT_t-14 (run on 4 or 8 GPUs):
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_14 -b 128 --lr 1e-3 --weight-decay .05 --amp --img-size 224
+```
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_14 -b 64 --lr 5e-4 --weight-decay .05 --amp --img-size 224
+```
+If you want to train our T2T-ViT on images at 384x384 resolution, use '--img-size 384'.
+
+
+Train T2T-ViT-19, T2T-ViT-24 or T2T-ViT_t-19, T2T-ViT_t-24:
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_19 -b 64 --lr 5e-4 --weight-decay .065 --amp --img-size 224
+```
+
+## 5. Transfer T2T-ViT to CIFAR10/CIFAR100
+
+| Model | ImageNet | CIFAR10 | CIFAR100 | #params |
+| :--- | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | 81.5 | [98.3](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_14_98.3.pth) | [88.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cirfar100_t2t-vit-14_88.4.pth) | 21.5M |
+| T2T-ViT-19 | 81.9 | [98.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_19_98.4.pth) | [89.0](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar100_t2t-vit-19_89.0.pth) | 39.2M |
+
+We resize CIFAR10/100 images to 224x224 and fine-tune our pretrained T2T-ViT-14/19 on CIFAR10/100 by running:
+
+```
+CUDA_VISIBLE_DEVICES=0,1 python transfer_learning.py --lr 0.05 --b 64 --num-classes 10 --img-size 224 --transfer-learning True --transfer-model /path/to/pretrained/T2T-ViT-19
+```
+
+## 6. Visualization
+
+To visualize the image features of ResNet50, open and run the [visualization_resnet.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_resnet.ipynb) file in Jupyter Notebook or JupyterLab; some results are shown below:
+
+*(ResNet50 feature visualization images omitted)*
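+
+Since those images are not reproduced here, the following is a minimal sketch of the kind of feature-map visualization the notebook performs (it assumes torchvision's pretrained resnet50 and an arbitrary `example.jpg`; it is not code from this repo):
+
+```
+import torch
+from PIL import Image
+from torchvision import models, transforms
+import matplotlib.pyplot as plt
+
+# pretrained ResNet50 in eval mode
+resnet = models.resnet50(pretrained=True).eval()
+
+preprocess = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(224),
+    transforms.ToTensor(),
+])
+img = preprocess(Image.open('example.jpg').convert('RGB')).unsqueeze(0)
+
+# capture the feature maps produced by the first residual stage
+feats = {}
+resnet.layer1.register_forward_hook(lambda m, i, o: feats.update(layer1=o.detach()))
+with torch.no_grad():
+    resnet(img)
+
+# show the first 16 channels as heat maps
+fmap = feats['layer1'][0]
+fig, axes = plt.subplots(4, 4, figsize=(8, 8))
+for ax, channel in zip(axes.flat, fmap[:16]):
+    ax.imshow(channel, cmap='viridis')
+    ax.axis('off')
+plt.show()
+```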
+
+To visualize the image features of ViT, open and run the [visualization_vit.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_vit.ipynb) file in Jupyter Notebook or JupyterLab; some results are shown below:
+
+*(ViT feature visualization images omitted)*
+
+To visualize attention maps, refer to this [file](https://github.com/jeonsworld/ViT-pytorch/blob/main/visualize_attention_map.ipynb). A simple example that visualizes the attention maps in attention blocks 4 and 5:
+
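+As a rough illustration of what such an example looks like (a minimal sketch: it assumes a timm-style ViT in which each block's `attn.attn_drop` is applied right after the softmax, so a forward hook there sees the post-softmax attention; block indices and names are illustrative):
+
+```
+import torch
+
+attn_maps = []
+
+def grab_attention(module, inputs, output):
+    # output is the (batch, heads, tokens, tokens) attention matrix of this block
+    attn_maps.append(output.detach())
+
+# hook attention blocks 4 and 5
+handles = [model.blocks[i].attn.attn_drop.register_forward_hook(grab_attention)
+           for i in (4, 5)]
+
+with torch.no_grad():
+    model(img)  # img: a preprocessed (1, 3, 224, 224) input
+
+for h in handles:
+    h.remove()
+
+# average over heads and look at how the class token attends to the image patches
+cls_attn = attn_maps[0][0].mean(0)[0, 1:]  # (num_patches,)
+attn_grid = cls_attn.reshape(14, 14)       # 14x14 patch grid for a 224x224 input
+```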
+
+*(attention map visualization images omitted)*
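+
+Relatedly, the "works for different image sizes" note in the Usage section above relies on resizing the pretrained position embedding to the new token grid. A minimal sketch of that idea (it assumes a `[cls]`-token-plus-square-grid layout; the helper name is ours, not an API of this repo):
+
+```
+import torch
+import torch.nn.functional as F
+
+def interpolate_pos_embed(pos_embed, new_grid):
+    """Resize a (1, 1 + H*W, C) position embedding to a new (H', W') grid."""
+    cls_tok, grid_tok = pos_embed[:, :1], pos_embed[:, 1:]
+    n, c = grid_tok.shape[1], grid_tok.shape[2]
+    side = int(n ** 0.5)
+    grid_tok = grid_tok.reshape(1, side, side, c).permute(0, 3, 1, 2)
+    grid_tok = F.interpolate(grid_tok, size=new_grid, mode='bicubic', align_corners=False)
+    grid_tok = grid_tok.permute(0, 2, 3, 1).reshape(1, new_grid[0] * new_grid[1], c)
+    return torch.cat([cls_tok, grid_tok], dim=1)
+
+# e.g. adapt a 14x14 grid (224x224 input) to a 24x24 grid (384x384 input):
+# model.pos_embed = torch.nn.Parameter(interpolate_pos_embed(model.pos_embed, (24, 24)))
+```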
+ + diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh new file mode 100644 index 0000000000..46ce9a02ec --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +docker_image=$1 +data_dir=$2 +model_dir=$3 + +docker run -it --ipc=host \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm --device=/dev/hisi_hdc \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \ + -v ${model_dir}:${model_dir} \ + -v ${data_dir}:${data_dir} \ + -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \ + -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \ + -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \ + /bin/bash \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py new file mode 100644 index 0000000000..1b3ceea782 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py @@ -0,0 +1,912 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""
+T2T-ViT training and evaluating script
+This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/)
+It was started from an early version of the PyTorch ImageNet example
+(https://github.com/pytorch/examples/tree/master/imagenet)
+"""
+import argparse
+import time
+import yaml
+import os
+import logging
+from collections import OrderedDict, defaultdict  # defaultdict is needed by Lookahead below
+from contextlib import suppress
+from datetime import datetime
+import models
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torchvision.utils
+from torch.optim.optimizer import Optimizer
+from torch.nn.parallel import DistributedDataParallel as NativeDDP
+
+from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model
+from timm.utils import *
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
+from timm.optim import create_optimizer
+from timm.scheduler import create_scheduler
+from timm.utils import ApexScaler, NativeScaler
+
+from data.myloader import create_loader  # shadows timm's create_loader imported above
+from npu_fused_adamw import NpuFusedAdamW
+
+torch.backends.cudnn.benchmark = True
+_logger = logging.getLogger('train')
+
+# The first arg parser parses out only the --config argument, this argument is used to
+# load a yaml file containing key-values that override the defaults for the main parser below
+config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False)
+parser.add_argument('-c', '--config', default='', type=str, metavar='FILE',
+                    help='YAML config file specifying default arguments')
+
+parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating')
+
+# Dataset / Model parameters
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL',
+                    help='Name of model to train (default: "T2t_vit_14")')
+parser.add_argument('--pretrained', action='store_true', default=False,
+                    help='Start with pretrained version of specified network (if avail)')
+parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH',
+                    help='Initialize model from this checkpoint (default: none)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='Resume full model and optimizer state from checkpoint (default: none)')
+parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH',
+                    help='path to eval checkpoint (default: none)')
+parser.add_argument('--no-resume-opt', action='store_true', default=False,
+                    help='prevent resume of optimizer state when resuming model')
+parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
+                    help='number of label classes (default: 1000)')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--img-size', type=int, default=224, metavar='N',
+                    help='Image patch size (default: 224)')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop percent (for validation only)')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+                    help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+                    help='Image resize interpolation type (overrides model)')
+parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
+                    help='input batch size for training (default: 64)')
+parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
+                    help='ratio of validation batch size to training batch size (default: 1)')
+
+# Optimizer parameters
+parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
+                    help='Optimizer (default: "adamw")')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON',
+                    help='Optimizer Epsilon (default: None, use opt default)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
+                    help='Optimizer Betas (default: None, use opt default)')
+parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+                    help='Optimizer momentum (default: 0.9)')
+parser.add_argument('--weight-decay', type=float, default=0.05,
+                    help='weight decay (default: 0.05)')
+parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
+                    help='Clip gradient norm (default: None, no clipping)')
+
+# Learning rate schedule parameters
+parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
+                    help='LR scheduler (default: "cosine")')
+parser.add_argument('--lr', type=float, default=5e-4, metavar='LR',
+                    help='learning rate (default: 5e-4)')
+parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
+                    help='learning rate noise on/off epoch percentages')
+parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
+                    help='learning rate noise limit percent (default: 0.67)')
+parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
+                    help='learning rate noise std-dev (default: 1.0)')
+parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
+                    help='learning rate cycle len multiplier (default: 1.0)')
+parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
+                    help='learning rate cycle limit')
+parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR',
+                    help='warmup learning rate (default: 1e-6)')
+parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',
+                    help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')
+parser.add_argument('--epochs', type=int, default=300, metavar='N',
+                    help='number of epochs to train (default: 300)')
+parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
+                    help='epoch interval to decay LR')
+parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N',
+                    help='epochs to warmup LR, if scheduler supports')
+parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
+                    help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
+parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
+                    help='patience epochs for Plateau LR scheduler (default: 10)')
+parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
+                    help='LR decay rate (default: 0.1)')
+
+# Augmentation & regularization parameters
+parser.add_argument('--no-aug', action='store_true', default=False,
+                    help='Disable all training augmentation, override other train aug args')
+parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT',
+                    help='Random resize scale (default: 0.08 1.0)')
+parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO',
+                    help='Random resize aspect ratio (default: 0.75 1.33)')
+parser.add_argument('--hflip', type=float, default=0.5,
+                    help='Horizontal flip training aug probability')
+parser.add_argument('--vflip', type=float, default=0.,
+                    help='Vertical flip training aug probability')
+parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT',
+                    help='Color jitter factor (default: 0.4)')
+parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
+                    help='Use AutoAugment policy, e.g. "v0" or "original" (default: "rand-m9-mstd0.5-inc1")')
+parser.add_argument('--aug-splits', type=int, default=0,
+                    help='Number of augmentation splits (default: 0, valid: 0 or >=2)')
+parser.add_argument('--jsd', action='store_true', default=False,
+                    help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
+parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
+                    help='Random erase prob (default: 0.25)')
+parser.add_argument('--remode', type=str, default='pixel',
+                    help='Random erase mode (default: "pixel")')
+parser.add_argument('--recount', type=int, default=1,
+                    help='Random erase count (default: 1)')
+parser.add_argument('--resplit', action='store_true', default=False,
+                    help='Do not random erase first (clean) augmentation split')
+parser.add_argument('--mixup', type=float, default=0.8,
+                    help='mixup alpha, mixup enabled if > 0. (default: 0.8)')
+parser.add_argument('--cutmix', type=float, default=1.0,
+                    help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)')
+parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
+                    help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+parser.add_argument('--mixup-prob', type=float, default=1.0,
+                    help='Probability of performing mixup or cutmix when either/both is enabled')
+parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
+                    help='Probability of switching to cutmix when both mixup and cutmix enabled')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+                    help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
+parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N',
+                    help='Turn off mixup after this epoch, disabled if 0 (default: 0)')
+parser.add_argument('--smoothing', type=float, default=0.1,
+                    help='Label smoothing (default: 0.1)')
+parser.add_argument('--train-interpolation', type=str, default='random',
+                    help='Training interpolation (random, bilinear, bicubic; default: "random")')
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
+                    help='Dropout rate (default: 0.0)')
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
+                    help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
+parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT',
+                    help='Drop path rate (default: 0.1)')
+parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
+                    help='Drop block rate (default: None)')
+
+# Batch norm parameters (only works with gen_efficientnet based models currently)
+parser.add_argument('--bn-tf', action='store_true', default=False,
+                    help='Use Tensorflow BatchNorm defaults for models that support it (default: False)')
+parser.add_argument('--bn-momentum', type=float, default=None,
+                    help='BatchNorm momentum override (if not None)')
+parser.add_argument('--bn-eps', type=float, default=None,
+                    help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--sync-bn', action='store_true',
+                    help='Enable NVIDIA Apex or Torch synchronized BatchNorm.')
+parser.add_argument('--dist-bn', type=str, default='',
+                    help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")')
+parser.add_argument('--split-bn', action='store_true',
+                    help='Enable separate BN layers per augmentation split.')
+
+# Model Exponential Moving Average
+parser.add_argument('--model-ema', action='store_true', default=True,
+                    help='Enable tracking moving average of model weights')
+parser.add_argument('--model-ema-force-cpu', action='store_true', default=False,
+                    help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.99996,
+                    help='decay factor for model weights moving average (default: 0.99996)')
+
+# Misc
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+                    help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+                    help='how many batches to wait before logging training status')
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
+                    help='how many batches to wait before writing recovery checkpoint')
+parser.add_argument('-j', '--workers', type=int, default=8, metavar='N',
+                    help='how many training processes to use; also sets the world size for distributed training (default: 8)')
+parser.add_argument('--num-gpu', type=int, default=1,
+                    help='Number of GPUs to use')
+parser.add_argument('--save-images', action='store_true', default=False,
+                    help='save images of input batches every log interval for debugging')
+parser.add_argument('--amp', action='store_true', default=False,
+                    help='use NVIDIA Apex AMP or Native AMP for mixed precision training')
+parser.add_argument('--apex-amp', action='store_true', default=False,
+                    help='Use NVIDIA Apex AMP mixed precision')
+parser.add_argument('--native-amp', action='store_true', default=False,
+                    help='Use Native Torch AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+                    help='disable fast prefetcher')
+parser.add_argument('--output', default='', type=str, metavar='PATH',
+                    help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "top1")')
+parser.add_argument('--tta', type=int, default=0, metavar='N',
+                    help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)')
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False,
+                    help='use the multi-epochs-loader to save time at the beginning of every epoch')
+
+parser.add_argument("--addr", default="127.0.0.1", type=str)
+parser.add_argument("--performance", action='store_true', default=False,
+                    help='measure training performance only (train a single epoch)')
+
+has_apex = True
+
+import apex
+from apex import amp
+from apex.parallel import DistributedDataParallel as ApexDDP
+from apex.parallel import convert_syncbn_model
+
+def optimizer_kwargs(cfg):
+    """ cfg/argparse to kwargs helper
+    Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn.
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + +def add_weight_decay(model, weight_decay=1e-5, skip_list=()): + """Add weight decay + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + no_decay.append(param) + else: + decay.append(param) + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + +class Lookahead(Optimizer): + def __init__(self, base_optimizer, alpha=0.5, k=6): + # NOTE super().__init__() not called on purpose + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) + self._base_optimizer = base_optimizer + self.param_groups = base_optimizer.param_groups + self.defaults = base_optimizer.defaults + self.defaults.update(defaults) + self.state = defaultdict(dict) + # manually add our defaults to the param groups + for name, default in defaults.items(): + for group in self._base_optimizer.param_groups: + group.setdefault(name, default) + + @torch.no_grad() + def update_slow(self, group): + for fast_p in group["params"]: + if fast_p.grad is None: + continue + param_state = self._base_optimizer.state[fast_p] + if 'lookahead_slow_buff' not in param_state: + param_state['lookahead_slow_buff'] = torch.empty_like(fast_p) + param_state['lookahead_slow_buff'].copy_(fast_p) + slow = param_state['lookahead_slow_buff'] + slow.add_(fast_p - slow, alpha=group['lookahead_alpha']) + fast_p.copy_(slow) + + def sync_lookahead(self): + for group in self._base_optimizer.param_groups: + self.update_slow(group) + + @torch.no_grad() + def step(self, closure=None): + loss = self._base_optimizer.step(closure) + for group in self._base_optimizer.param_groups: + group['lookahead_step'] += 1 + if group['lookahead_step'] % group['lookahead_k'] == 0: + self.update_slow(group) + return loss + + def state_dict(self): + return self._base_optimizer.state_dict() + + def load_state_dict(self, state_dict): + self._base_optimizer.load_state_dict(state_dict) + self.param_groups = self._base_optimizer.param_groups + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + **kwargs): + """ Create an optimizer. + Only support npu fused AdamW and npu fused SGD + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + if weight_decay and filter_bias_and_bn: + skip = {} + if hasattr(model_or_params, 'no_weight_decay'): + skip = model_or_params.no_weight_decay() + parameters = add_weight_decay(model_or_params, weight_decay, skip) + weight_decay = 0. 
+        else:
+            parameters = model_or_params.parameters()
+    else:
+        # iterable of parameters or param groups passed in
+        parameters = model_or_params
+
+    opt_lower = opt.lower()
+    opt_split = opt_lower.split('_')
+    opt_lower = opt_split[-1]
+    # if 'fused' in opt_lower:
+    #     assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'
+
+    opt_args = dict(weight_decay=weight_decay, **kwargs)
+    if lr is not None:
+        opt_args.setdefault('lr', lr)
+
+    # basic SGD & related
+    if opt_lower == 'sgd' or opt_lower == 'nesterov':
+        # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons
+        opt_args.pop('eps', None)
+        # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+        optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+    elif opt_lower == 'momentum':
+        opt_args.pop('eps', None)
+        # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+        optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+    elif opt_lower == 'adamw':
+        # optimizer = optim.AdamW(parameters, **opt_args)
+        optimizer = NpuFusedAdamW(parameters, **opt_args)
+    else:
+        raise ValueError('Invalid optimizer: ' + opt_lower)
+
+    if len(opt_split) > 1:
+        if opt_split[0] == 'lookahead':
+            optimizer = Lookahead(optimizer)
+
+    return optimizer
+
+
+
+def _parse_args():
+    # Do we have a config file to parse?
+    args_config, remaining = config_parser.parse_known_args()
+    if args_config.config:
+        with open(args_config.config, 'r') as f:
+            cfg = yaml.safe_load(f)
+            parser.set_defaults(**cfg)
+
+    # The main arg parser parses the rest of the args, the usual
+    # defaults will have been overridden if config file specified.
+    args = parser.parse_args(remaining)
+
+    # Cache the args as a text string to save them in the output dir later
+    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+    return args, args_text
+
+
+def main():
+    setup_default_logging()
+    args, args_text = _parse_args()
+
+    os.environ['MASTER_ADDR'] = args.addr  # ip or '127.0.0.1'
+    os.environ['MASTER_PORT'] = '29500'  # any free port (valid TCP ports are below 65536)
+
+    args.prefetcher = not args.no_prefetcher
+    args.distributed = (args.workers > 1)
+
+    torch.npu.set_device(args.local_rank)
+    args.world_size = 1
+    args.rank = args.local_rank  # global rank
+    if args.distributed:
+        torch.npu.set_device(args.local_rank)
+        args.world_size = args.workers
+        torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size)
+        args.world_size = torch.distributed.get_world_size()
+    assert args.rank >= 0
+
+    if args.distributed:
+        _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
+                     % (args.rank, args.world_size))
+    else:
+        _logger.info('Training with a single process on %d GPUs.'
% args.num_gpu) + + torch.manual_seed(args.seed + args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + + if args.local_rank == 0 or args.workers == 1: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1)) + + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + use_amp = None + args.apex_amp = True + use_amp = 'apex' + + model.npu() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + optimizer = create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=True, + ) + # optimizer = create_optimizer(args, model) + + amp_autocast = suppress # do nothing + loss_scaler = None + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True) + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume=args.resume) + + if args.distributed: + if args.sync_bn: + assert not args.split_bn + try: + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + except Exception as e: + _logger.error('Failed to enable Synchronized BatchNorm. 
Install Apex or Torch >= 1.1') + + model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.performance: + num_epochs = 1 + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + train_dir = os.path.join(args.data, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + dataset_train = Dataset(train_dir) + + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + # re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + eval_dir = os.path.join(args.data, 'val') + if not os.path.isdir(eval_dir): + eval_dir = os.path.join(args.data, 'validation') + if not os.path.isdir(eval_dir): + _logger.error('Validation folder does not exist at: {}'.format(eval_dir)) + exit(1) + dataset_eval = Dataset(eval_dir) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + if args.jsd: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu() + elif mixup_active: + # smoothing is handled with mixup target transform + train_loss_fn = SoftTargetCrossEntropy().npu() + elif args.smoothing: + train_loss_fn = 
LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu() + else: + train_loss_fn = nn.CrossEntropyLoss().npu() + validate_loss_fn = nn.CrossEntropyLoss().npu() + + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + + if args.eval_checkpoint: # evaluate the model + load_checkpoint(model, args.eval_checkpoint, args.model_ema) + val_metrics = validate(model, loader_eval, validate_loss_fn, args) + print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%") + return + + saver = None + output_dir = '' + if args.local_rank == 0: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: # train the model + for epoch in range(start_epoch, num_epochs): + if args.distributed: + loader_train.sampler.set_epoch(epoch) + + + train_metrics = train_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0 or args.workers == 1: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * 
len(loader) + epoch_fps = [] + prof_list = [] + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.npu(), target.npu() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + if batch_idx in prof_list: + with torch.autograd.profiler.profile(use_npu=True) as prof: + output = model(input) + loss = loss_fn(output, target) + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4))) + sys.exit() + + else: + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + + torch.npu.synchronize() + if model_ema is not None: + model_ema.update(model) + num_updates += 1 + + batch_time_m.update(time.time() - end) + + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0 or args.workers == 1: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. 
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + epoch_fps.append(input.shape[0] * args.workers / (time.time() - end)) + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:]))) + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.npu() + target = target.npu() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py new file mode 100644 index 0000000000..401aa8e586 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py @@ -0,0 +1,41 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class AverageMeter:
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def t2taccuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    maxk = max(topk)
+    batch_size = target.size(0)
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
+    # normalize the correct-prediction count by the batch size so each entry is a percentage
+    return [correct[:k].reshape(-1).float().sum(0) * (100. / batch_size) for k in topk]
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
new file mode 100644
index 0000000000..0b49b4fb26
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
@@ -0,0 +1,3 @@
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
new file mode 100644
index 0000000000..a2f9cf0db1
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
@@ -0,0 +1,255 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections import defaultdict
+
+import torch
+from torch.optim.optimizer import Optimizer
+
+from apex.contrib.combine_tensors import combine_npu
+
+
+class NpuFusedAdamW(Optimizer):
+    """Implements AdamW algorithm.
+
+    Currently NPU-only. Requires Apex to be installed via
+    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``.
+
+    This version of NPU fused AdamW implements one fusion.
+
+    * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters
+      into one or a few kernel launches.
+
+    :class:`apex.optimizers.NpuFusedAdamW` may be used as a drop-in replacement for ``torch.optim.AdamW``::
+
+        opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+        ...
+        opt.step()
+
+    :class:`apex.optimizers.NpuFusedAdamW` should be used with Amp. Currently, if you wish to use :class:`NpuFusedAdamW`
+    with Amp, only ``opt_level O1 and O2`` can be chosen::
+
+        opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+        model, opt = amp.initialize(model, opt, opt_level="O2")
+        ...
+ opt.step() + + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional, default: 1e-3): learning rate + betas (Tuple[float, float], optional, default: (0.9, 0.999)): coefficients used + for computing running averages of gradient and its square + eps (float, optional, default: 1e-8): term added to the denominator to improve + numerical stability + weight_decay (float, optional, default: 1e-2): weight decay coefficient + amsgrad (boolean, optional, default: False): whether to use the AMSGrad variant of + this algorithm from the paper `On the Convergence of Adam and Beyond`_ + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if eps < 0.0: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if betas[0] < 0.0 or betas[0] >= 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if betas[1] < 0.0 or betas[1] >= 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + self.is_npu_fused_optimizer = True + super(NpuFusedAdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(NpuFusedAdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def _init_param_state(self, p, amsgrad): + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + else: + exp_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_tmp.copy_(state['exp_avg']) + state['exp_avg'] = exp_avg_tmp + + exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_sq_tmp.copy_(state['exp_avg_sq']) + state['exp_avg_sq'] = exp_avg_sq_tmp + + if amsgrad: + max_exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + max_exp_avg_sq_tmp.copy_(state['max_exp_avg_sq']) + state['max_exp_avg_sq'] = max_exp_avg_sq_tmp + + def _combine_group_param_states(self, group_index): + group = self.param_groups[group_index] + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + amsgrad = group['amsgrad'] + + combined_param_states = [] + for params in group_params_list: + step_list = [] + exp_avg_list = [] + exp_avg_sq_list = [] + max_exp_avg_sq_list = [] + + for p in params: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdamW does not support sparse gradients, ' + 'please consider SparseAdam instead') + + self._init_param_state(p, amsgrad) + state = self.state[p] + step_list.append(state['step']) + exp_avg_list.append(state['exp_avg']) + exp_avg_sq_list.append(state['exp_avg_sq']) + if amsgrad: + max_exp_avg_sq_list.append(state['max_exp_avg_sq']) + + combined_step = 0 + combined_exp_avg = None + combined_exp_avg_sq = None + combined_max_exp_avg_sq = None + + if len(exp_avg_list) > 0: + combined_step = step_list[0] + combined_exp_avg = combine_npu(exp_avg_list) + combined_exp_avg_sq = combine_npu(exp_avg_sq_list) + combined_max_exp_avg_sq = combine_npu(max_exp_avg_sq_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['exp_avg'] = combined_exp_avg + combined_state['exp_avg_sq'] = combined_exp_avg_sq + combined_state['max_exp_avg_sq'] = combined_max_exp_avg_sq + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for _ in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, _ in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdamW does not support sparse gradients, ' + 'please consider SparseAdam instead') + state_p = self.state[p] + state_p['step'] += 1 + + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + + stash = self._amp_stash + combined_group_params = stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + # Perform stepweight decay. 
The fused method is used here to speed up the calculation + combined_param.mul_(1 - group['lr'] * group['weight_decay']) + + exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = combined_param_state['max_exp_avg_sq'] + + combined_param_state['step'] += 1 + bias_correction1 = 1 - beta1 ** combined_param_state['step'] + bias_correction2 = 1 - beta2 ** combined_param_state['step'] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(combined_grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(combined_grad, combined_grad, value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + step_size = group['lr'] / bias_correction1 + + combined_param.addcdiv_(exp_avg, denom, value=-step_size) + + @torch.no_grad() + def step(self, closure=None): + if not hasattr(self, "_amp_stash"): + raise RuntimeError('apex.optimizers.NpuFusedAdamW should be used with AMP.') + + self._check_already_combined_params_and_grads() + # combine params and grads first + self._combine_params_and_grads_by_group() + # then combine param states + self._combine_param_states_by_group() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for i, _ in enumerate(self.param_groups): + self._group_step(i) + + return loss diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt new file mode 100644 index 0000000000..f6f0f6fd77 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt @@ -0,0 +1,4 @@ +torch==1.5.0 +apex==0.1 +torchvision==0.6.0 +timm==0.3.4 \ No newline at end of file -- Gitee From db61d27560e84f08317d292f129ad058b8e42848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:54 +0000 Subject: [PATCH 06/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/env_npu.sh | 68 +++++++++ .../T2T-ViT/test/train_full_1p.sh | 129 +++++++++++++++++ .../T2T-ViT/test/train_full_8p.sh | 130 +++++++++++++++++ .../T2T-ViT/test/train_performance_1p.sh | 131 ++++++++++++++++++ .../T2T-ViT/test/train_performance_8p.sh | 130 +++++++++++++++++ 5 files changed, 588 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh new file mode 100644 index 0000000000..bd4205d15d --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh @@ -0,0 +1,68 @@ +#!/bin/bash +CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' + +if [ -f $CANN_INSTALL_PATH_CONF ]; then + 
CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+# Output host logs to the serial port: 0 - off / 1 - on
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+# Default log level: 0 - debug / 1 - info / 2 - warning / 3 - error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+# Event log flag: 0 - off / 1 - on
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+# Enable the task queue: 0 - off / 1 - on
+export TASK_QUEUE_ENABLE=1
+# Enable PTCopy: 0 - off / 1 - on
+export PTCOPY_ENABLE=1
+# Enable the combined flag: 0 - off / 1 - on
+export COMBINED_ENABLE=1
+# Whether special scenarios need recompilation; no need to modify
+export TRI_COMBINED_ENABLE=1
+# Whether special scenarios need recompilation; no need to modify
+export DYNAMIC_OP="ADD#MUL"
+# HCCL whitelist switch: 1 - off / 0 - on
+export HCCL_WHITELIST_DISABLE=1
+
+# Set the device-side log level to error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+# Disable device-side Event logs
+msnpureport -e disable
+
+ulimit -SHn 512000
+
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
new file mode 100644
index 0000000000..09dd270d72
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    -j 1 \
+    --amp \
+    --img-size 224 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
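+
+# Usage sketch, assuming an ImageNet-style layout with ${data_path}/train and
+# ${data_path}/val: run from the model root (or from test/), e.g. with a
+# placeholder dataset path:
+#   bash test/train_full_1p.sh --data_path=/path/to/imagenet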
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
new file mode 100644
index 0000000000..d7ef995ea9
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    --amp \
+    --img-size 224 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
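+# Illustration of the extraction above, with made-up numbers: a log line such as
+#   Train: 0 [ 100/2502 (  4%)]  Loss:  6.912345 (6.9876)  ...
+# is split on "Loss:" and the first field of the remainder, 6.912345, is kept.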
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
new file mode 100644
index 0000000000..6c1828793e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    -j 1 \
+    --amp \
+    --img-size 224 \
+    --epochs 1 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
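+
+# Usage sketch: this performance variant runs a single epoch (--epochs 1) to
+# gauge throughput rather than accuracy; it takes the same arguments as the
+# full-training script, e.g. with a placeholder dataset path:
+#   bash test/train_performance_1p.sh --data_path=/path/to/imagenet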
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
new file mode 100644
index 0000000000..f024d6cc71
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    --amp \
+    --img-size 224 \
+    --epochs 5 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
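+# Worked example for the TrainingTime formula above, with illustrative numbers:
+# batch_size=64 and FPS=512 give TrainingTime = 64*1000/512 = 125.00 ms per iteration.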
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
--
Gitee

From c28a1f6efe660b5d0a09382261a5a16fa6c357b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:00 +0000
Subject: [PATCH 07/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?=
 =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/test/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep

diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
deleted file mode 100644
index e69de29bb2..0000000000
--
Gitee

From 6113ed35721c8283025840060d8f2ed8ee5f6255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:15 +0000
Subject: [PATCH 08/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 王旭 <1149693659@qq.com>
---
 .../classification/T2T-ViT/models/__init__.py |  17 +
 .../classification/T2T-ViT/models/t2t_vit.py  | 304 ++++++
 .../T2T-ViT/models/t2t_vit_dense.py           | 177 +++++
 .../T2T-ViT/models/t2t_vit_ghost.py           | 204 ++++++
 .../T2T-ViT/models/t2t_vit_se.py              | 176 +++++
 .../T2T-ViT/models/token_performer.py         |  73 ++
 .../T2T-ViT/models/token_transformer.py       |  68 ++
 .../T2T-ViT/models/transformer_block.py       |  96 +++
 .../cv/classification/T2T-ViT/models/vit.py   | 674 ++++++++++++++++++
 9 files changed, 1789 insertions(+)
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py

diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py new file mode 100644 index 0000000000..007f383c19 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .t2t_vit import * +from .t2t_vit_se import * +from .t2t_vit_dense import * +from .t2t_vit_ghost import * diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py new file mode 100644 index 0000000000..db04ed4ee0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py @@ -0,0 +1,304 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +T2T-ViT +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_ +import numpy as np +from .token_transformer import Token_transformer +from .token_performer import Token_performer +from .transformer_block import Block, get_sinusoid_encoding + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), + 'classifier': 'head', + **kwargs + } + +default_cfgs = { + 'T2t_vit_7': _cfg(), + 'T2t_vit_10': _cfg(), + 'T2t_vit_12': _cfg(), + 'T2t_vit_14': _cfg(), + 'T2t_vit_19': _cfg(), + 'T2t_vit_24': _cfg(), + 'T2t_vit_t_14': _cfg(), + 'T2t_vit_t_19': _cfg(), + 'T2t_vit_t_24': _cfg(), + 'T2t_vit_14_resnext': _cfg(), + 'T2t_vit_14_wide': _cfg(), +} + +class T2T_module(nn.Module): + """ + Tokens-to-Token encoding module + """ + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, embed_dim=768, token_dim=64): + super().__init__() + + if tokens_type == 'transformer': + print('adopt transformer encoder for tokens-to-token') + self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + self.attention1 = Token_transformer(dim=in_chans * 7 * 7, in_dim=token_dim, num_heads=1, mlp_ratio=1.0) + self.attention2 = Token_transformer(dim=token_dim * 3 * 3, in_dim=token_dim, num_heads=1, mlp_ratio=1.0) + self.project = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif tokens_type == 'performer': + print('adopt performer encoder for tokens-to-token') + self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + #self.attention1 = Token_performer(dim=token_dim, in_dim=in_chans*7*7, kernel_ratio=0.5) + #self.attention2 = Token_performer(dim=token_dim, in_dim=token_dim*3*3, kernel_ratio=0.5) + self.attention1 = Token_performer(dim=in_chans*7*7, in_dim=token_dim, kernel_ratio=0.5) + self.attention2 = Token_performer(dim=token_dim*3*3, in_dim=token_dim, kernel_ratio=0.5) + self.project = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif tokens_type == 'convolution': # just for comparison with conolution, not our model + # for this tokens type, you need change forward as three convolution operation + print('adopt convolution layers for tokens-to-token') + self.soft_split0 = nn.Conv2d(3, token_dim, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) # the 1st convolution + self.soft_split1 = nn.Conv2d(token_dim, token_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 2nd convolution + self.project = nn.Conv2d(token_dim, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 3rd convolution + + self.num_patches = (img_size // (4 * 2 * 2)) * (img_size // (4 * 2 * 2)) # there are 3 sfot split, stride are 4,2,2 seperately + + def forward(self, x): + # step0: soft split + x = self.soft_split0(x).transpose(1, 2) + + # iteration1: re-structurization/reconstruction + x = self.attention1(x) + B, new_HW, C = x.shape + x = x.transpose(1,2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW))) + # iteration1: soft 
split + x = self.soft_split1(x).transpose(1, 2) + + # iteration2: re-structurization/reconstruction + x = self.attention2(x) + B, new_HW, C = x.shape + x = x.transpose(1, 2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW))) + # iteration2: soft split + x = self.soft_split2(x).transpose(1, 2) + + # final tokens + x = self.project(x) + + return x + +class T2T_ViT(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, token_dim=64): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim, token_dim=token_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_7(pretrained=False, **kwargs): # adopt performer for tokens to token + if pretrained: + kwargs.setdefault('qk_scale', 256 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=7, num_heads=4, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_7'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +@register_model +def t2t_vit_10(pretrained=False, **kwargs): # adopt performer for tokens to token + if pretrained: + kwargs.setdefault('qk_scale', 256 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=10, num_heads=4, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_10'] + if 
pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_12(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 256 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=12, num_heads=4, mlp_ratio=2., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_12']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def t2t_vit_14(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_14']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_19(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 448 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_19']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_24(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 512 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_24']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_14(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_14']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_19(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 448 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_19']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_24(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 512 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_24']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+# resnext and wide structure
+@register_model
+def t2t_vit_14_resnext(pretrained=False, **kwargs):
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=32, mlp_ratio=3., **kwargs)
model.default_cfg = default_cfgs['T2t_vit_14_resnext'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +@register_model +def t2t_vit_14_wide(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 512 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=768, depth=4, num_heads=12, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_14_wide'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py new file mode 100644 index 0000000000..5724570c39 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py @@ -0,0 +1,177 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +T2T-ViT-Dense +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model + +from .transformer_block import Mlp, Block, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + +default_cfgs = { + 't2t_vit_dense': _cfg(), +} + +class Transition(nn.Module): + def __init__(self, in_features, out_features, act_layer=nn.GELU): + super(Transition, self).__init__() + self.act = act_layer() + self.linear = nn.Linear(in_features, out_features) + def forward(self, x): + x = self.linear(x) + x = self.act(x) + + return x + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, growth_rate, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) #, out_features=growth_rate + self.dense_linear = nn.Linear(dim, growth_rate) + + def forward(self, x): + new_x = x + self.drop_path(self.attn(self.norm1(x))) + new_x = new_x + self.drop_path(self.mlp(self.norm2(new_x))) + new_x = self.dense_linear(new_x) + x = torch.cat([x, new_x], 2) # dense connection: concatenate the old features with the new features along the channel dimension + return x + +class T2T_ViT_Dense(nn.Module): + def __init__(self, growth_rate=32, tokens_type='performer', block_config=(3, 4, 6, 3), img_size=224, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList() + + start_dim = embed_dim + for i, num_layers in enumerate(block_config): + for j in range(num_layers): + new_dim = start_dim + j * growth_rate + block = Block(growth_rate=growth_rate, dim=new_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + self.blocks.append(block) + if i != len(block_config)-1: + transition = Transition(new_dim+growth_rate, (new_dim+growth_rate)//2) + self.blocks.append(transition) + start_dim = int((new_dim+growth_rate)//2) + out_dim = new_dim + growth_rate + print(f'end dim:{out_dim}') + self.norm = norm_layer(out_dim) + + # Classifier head + self.head = nn.Linear(out_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_dense(pretrained=False, 
**kwargs): + model = T2T_ViT_Dense(growth_rate=64, block_config=(3, 6, 6, 4), embed_dim=128, num_heads=8, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['t2t_vit_dense'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py new file mode 100644 index 0000000000..217dad772b --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py @@ -0,0 +1,204 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +T2T-ViT-Ghost +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model + +from .transformer_block import Block, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + + +default_cfgs = { + 'T2t_vit_16_ghost': _cfg(), +} + +class Mlp_ghost(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, in_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.ratio = hidden_features//in_features + self.cheap_operation2 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False) + self.cheap_operation3 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False) + + def forward(self, x): # x: [B, N, C] + x1 = self.fc1(x) # x1: [B, N, C] + x1 = self.act(x1) + + x2 = self.cheap_operation2(x1.transpose(1,2)) # x2: [B, N, C] + x2 = x2.transpose(1,2) + x2 = self.act(x2) + + x3 = self.cheap_operation3(x1.transpose(1, 2)) # x3: [B, N, C] + x3 = x3.transpose(1, 2) + x3 = self.act(x3) + + x = torch.cat((x1, x2, x3), dim=2) # x: [B, N, 3C] + x = self.drop(x) + + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention_ghost(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + half_dim = int(0.5*dim) + self.q = nn.Linear(dim, half_dim, bias=qkv_bias) + self.k = nn.Linear(dim, half_dim, bias=qkv_bias) + self.v = nn.Linear(dim, half_dim, bias=qkv_bias) + + self.cheap_operation_q = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + self.cheap_operation_k = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + self.cheap_operation_v = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = 
nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + q = self.q(x) + k = self.k(x) + v = self.v(x) + + q1 = self.cheap_operation_q(q.transpose(1,2)).transpose(1,2) + k1 = self.cheap_operation_k(k.transpose(1,2)).transpose(1,2) + v1 = self.cheap_operation_v(v.transpose(1,2)).transpose(1,2) + + q = torch.cat((q, q1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = torch.cat((k, k1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = torch.cat((v, v1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention_ghost( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp_ghost(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class T2T_ViT_Ghost(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = 
nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +@register_model +def t2t_vit_16_ghost(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 384 ** -0.5) + model = T2T_ViT_Ghost(tokens_type='performer', embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_16_ghost'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py new file mode 100644 index 0000000000..b43a86e39e --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py @@ -0,0 +1,176 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
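+# Usage sketch for the SE layer defined below (illustrative values only; the
+# channel width 384 matches the t2t_vit_14_se variant registered at the end
+# of this file):
+#   se = SELayer(channel=384)
+#   out = se(torch.randn(2, 197, 384))  # shape preserved: (2, 197, 384)
+# The layer squeezes over the token axis and re-weights channels, so it never
+# changes the sequence length or the embedding width.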
+""" +T2T-ViT-SE +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model +from .transformer_block import Block, Mlp, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + +default_cfgs = { + 'T2t_vit_14_se': _cfg(), +} + +class SELayer(nn.Module): + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool1d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid() + ) + + def forward(self, x): # x: [B, N, C] + x = torch.transpose(x, 1, 2) # [B, C, N] + b, c, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1) + x = x * y.expand_as(x) + x = torch.transpose(x, 1, 2) # [B, N, C] + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.se_layer = SELayer(dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.se_layer(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class T2T_ViT_SE(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_14_se(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 384 ** -0.5) + model = T2T_ViT_SE(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_14_se'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py new file mode 100644 index 0000000000..16134300a3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Take Performer as T2T Transformer +""" +import math +import torch +import torch.nn as nn + +class Token_performer(nn.Module): + def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1): + super().__init__() + self.emb = in_dim * head_cnt # head_cnt is 1 here, so this is simply in_dim + self.kqv = nn.Linear(dim, 3 * self.emb) + self.dp = nn.Dropout(dp1) + self.proj = nn.Linear(self.emb, self.emb) + self.head_cnt = head_cnt + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(self.emb) + self.epsilon = 1e-8 # for numerical stability in the division below + + self.mlp = nn.Sequential( + nn.Linear(self.emb, 1 * self.emb), + nn.GELU(), + nn.Linear(1 * self.emb, self.emb), + nn.Dropout(dp2), + ) + + self.m = int(self.emb * kernel_ratio) + self.w = torch.randn(self.m, self.emb) + self.w = nn.Parameter(nn.init.orthogonal_(self.w) * math.sqrt(self.m), requires_grad=False) + + def prm_exp(self, x): + # part of this function is borrowed from https://github.com/lucidrains/performer-pytorch + # and Simo Ryu (https://github.com/cloneofsimo) + # ==== positive random features for gaussian kernels ==== + # x = (B, T, hs) + # w = (m, hs) + # return : (B, T, m) + # SM(x, y) = E_w[exp(w^T x - |x|^2/2) exp(w^T y - |y|^2/2)] + # therefore return exp(w^T x - |x|^2/2)/sqrt(m) + xd = ((x * x).sum(dim=-1, keepdim=True)).repeat(1, 1, self.m) / 2 + wtx = torch.einsum('bti,mi->btm', x.float(), self.w) + + return torch.exp(wtx - xd) / math.sqrt(self.m) + + def single_attn(self, x): + k, q, v = torch.split(self.kqv(x), self.emb, dim=-1) + kp, qp = self.prm_exp(k), self.prm_exp(q) # (B, T, m), (B, T, m) + D = torch.einsum('bti,bi->bt', qp, kp.sum(dim=1)).unsqueeze(dim=2) # (B, T, m) * (B, m) -> (B, T, 1) + kptv = torch.einsum('bin,bim->bnm', v.float(), kp) # (B, emb, m) + y = torch.einsum('bti,bni->btn', qp, kptv) / (D.repeat(1, 1, self.emb) + self.epsilon) # (B, T, emb)/Diag + # skip connection + y = v + self.dp(self.proj(y)) # as in token_transformer in the T2T layer, use v as the skip connection + + return y + + def forward(self, x): + x = self.single_attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + return x + diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py new file mode 100644 index 0000000000..f9133a1d9f --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py @@ -0,0 +1,68 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
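+# Shape sketch for the Token_transformer defined below (illustrative; the
+# numbers correspond to the first T2T stage at 224x224 input, where the
+# unfolded patch width is 3*7*7 = 147 and token_dim is 64):
+#   blk = Token_transformer(dim=147, in_dim=64, num_heads=1)
+#   y = blk(torch.randn(2, 56 * 56, 147))  # y.shape == (2, 3136, 64)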
+ +""" +Take the standard Transformer as T2T Transformer +""" +import torch.nn as nn +from timm.models.layers import DropPath +from .transformer_block import Mlp + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, in_dim=None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + self.in_dim = in_dim + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, in_dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(in_dim, in_dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.in_dim).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q * self.scale) @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.in_dim) + x = self.proj(x) + x = self.proj_drop(x) + + # skip connection + x = v.squeeze(1) + x # the original x differs in size from the attention output, so use v (already projected to in_dim) as the skip connection; v.squeeze(1) relies on num_heads == 1 + + return x + +class Token_transformer(nn.Module): + + def __init__(self, dim, in_dim, num_heads, mlp_ratio=1., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, in_dim=in_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(in_dim) + self.mlp = Mlp(in_features=in_dim, hidden_features=int(in_dim*mlp_ratio), out_features=in_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = self.attn(self.norm1(x)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py new file mode 100644 index 0000000000..0ba43c1421 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py @@ -0,0 +1,96 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
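+# Quick reference for the helpers defined below (illustrative): Block is a
+# standard pre-norm transformer encoder layer, and get_sinusoid_encoding
+# builds the fixed positional table used as pos_embed by the T2T-ViT models,
+# e.g. get_sinusoid_encoding(n_position=197, d_hid=384) -> shape (1, 197, 384).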
+""" +Borrow from timm(https://github.com/rwightman/pytorch-image-models) +""" +import torch +import torch.nn as nn +import numpy as np +from timm.models.layers import DropPath + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +def get_sinusoid_encoding(n_position, d_hid): + ''' Sinusoid position encoding table ''' + + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py new file mode 100644 index 0000000000..b263a6f9e5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py @@ -0,0 +1,674 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The original Vision Transformer (ViT) from timm, copyright belongs to / Copyright 2020 Ross Wightman +""" +import math +import logging +from functools import partial +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.models.helpers import build_model_with_cfg, overlay_external_default_cfg +from timm.models.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ +from timm.models.registry import register_model + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # patch models (my experiments) + 'vit_small_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/vit_small_p16_224-15ec54c9.pth', + ), + + # patch models (weights ported from official Google JAX impl) + 'vit_base_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + ), + 'vit_base_patch32_224': _cfg( + url='', # no official model weights for this combo, only for in21k + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_base_patch16_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_base_patch32_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p32_384-830016f5.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_large_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch32_224': _cfg( + url='', # no official model weights for this combo, only for in21k + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch16_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_large_patch32_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + + # patch models, imagenet21k (weights ported from official Google JAX impl) + 'vit_base_patch16_224_in21k': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_base_patch32_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch16_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch32_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_huge_patch14_224_in21k': _cfg( + hf_hub='timm/vit_huge_patch14_224_in21k', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + + # deit models (FB weights) + 'vit_deit_tiny_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'), + 'vit_deit_small_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'), + 'vit_deit_base_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',), + 'vit_deit_base_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_deit_tiny_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth', + classifier=('head', 'head_dist')), + 'vit_deit_small_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth', + classifier=('head', 'head_dist')), + 'vit_deit_base_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', + classifier=('head', 'head_dist')), + 'vit_deit_base_distilled_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth', + input_size=(3, 384, 384), crop_pct=1.0, classifier=('head', 'head_dist')), + + # ViT ImageNet-21K-P pretraining + 'vit_base_patch16_224_miil_in21k': _cfg( + url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth', + mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=11221, + ), + 'vit_base_patch16_224_miil': _cfg( + url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm' + '/vit_base_patch16_224_1k_miil_84_4.pth', + mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', + ), +} + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use 
tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class VisionTransformer(nn.Module): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, distilled=False, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=PatchEmbed, norm_layer=None, + act_layer=None, weight_init=''): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + weight_init: (str): weight init scheme + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth 
decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Representation layer + if representation_size and not distilled: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + # Classifier head(s) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = None + if distilled: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + # Weight init + assert weight_init in ('jax', 'jax_nlhb', 'nlhb', '') + head_bias = -math.log(self.num_classes) if 'nlhb' in weight_init else 0. + trunc_normal_(self.pos_embed, std=.02) + if self.dist_token is not None: + trunc_normal_(self.dist_token, std=.02) + if weight_init.startswith('jax'): + # leave cls token as zeros to match jax impl + for n, m in self.named_modules(): + _init_vit_weights(m, n, head_bias=head_bias, jax_impl=True) + else: + trunc_normal_(self.cls_token, std=.02) + self.apply(_init_vit_weights) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + _init_vit_weights(m) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token', 'dist_token'} + + def get_classifier(self): + if self.dist_token is None: + return self.head + else: + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + if self.num_tokens == 2: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + if self.dist_token is None: + x = torch.cat((cls_token, x), dim=1) + else: + x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1) + x = self.pos_drop(x + self.pos_embed) + x = self.blocks(x) + x = self.norm(x) + if self.dist_token is None: + return self.pre_logits(x[:, 0]) + else: + return x[:, 0], x[:, 1] + + def forward(self, x): + x = self.forward_features(x) + if self.head_dist is not None: + x, x_dist = self.head(x[0]), self.head_dist(x[1]) # x must be a tuple + if self.training and not torch.jit.is_scripting(): + # during training, return both classifier predictions; at inference the else branch returns their average + return x, x_dist + else: + return (x + x_dist) / 2 + else: + x = self.head(x) + return x + + +def _init_vit_weights(m, n: str = '', head_bias: float = 0., jax_impl: bool = False): + """ ViT weight initialization + * When called without n, head_bias, jax_impl args it will behave exactly the same + as my original init for compatibility with prev hparam / downstream use cases (ie DeiT). 
+ * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl + """ + if isinstance(m, nn.Linear): + if n.startswith('head'): + nn.init.zeros_(m.weight) + nn.init.constant_(m.bias, head_bias) + elif n.startswith('pre_logits'): + lecun_normal_(m.weight) + nn.init.zeros_(m.bias) + else: + if jax_impl: + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + if 'mlp' in n: + nn.init.normal_(m.bias, std=1e-6) + else: + nn.init.zeros_(m.bias) + else: + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif jax_impl and isinstance(m, nn.Conv2d): + # NOTE conv was left to pytorch default in my original init + lecun_normal_(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.zeros_(m.bias) + nn.init.ones_(m.weight) + + +def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape) + ntok_new = posemb_new.shape[1] + if num_tokens: + posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:] + ntok_new -= num_tokens + else: + posemb_tok, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + _logger.info('Position embedding grid-size from %s to %s', [gs_old, gs_old], gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=gs_new, mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1) + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + return posemb + + +def checkpoint_filter_fn(state_dict, model): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k and len(v.shape) < 4: + # For old models that I trained prior to conv based patchification + O, I, H, W = model.patch_embed.proj.weight.shape + v = v.reshape(O, -1, H, W) + elif k == 'pos_embed' and v.shape != model.pos_embed.shape: + # To resize pos embedding when using model at different size from pretrained weights + v = resize_pos_embed(v, model.pos_embed, getattr(model, 'num_tokens', 1), + model.patch_embed.grid_size) + out_dict[k] = v + return out_dict + + +def _create_vision_transformer(variant, pretrained=False, default_cfg=None, **kwargs): + if default_cfg is None: + default_cfg = deepcopy(default_cfgs[variant]) + overlay_external_default_cfg(default_cfg, kwargs) + default_num_classes = default_cfg['num_classes'] + default_img_size = default_cfg['input_size'][-2:] + + num_classes = kwargs.pop('num_classes', default_num_classes) + img_size = kwargs.pop('img_size', default_img_size) + repr_size = kwargs.pop('representation_size', None) + if repr_size is not None and num_classes != default_num_classes: + # Remove representation layer if fine-tuning. This may not always be the desired action, + # but I feel better than doing nothing by default for fine-tuning. Perhaps a better interface? 
+ _logger.warning("Removing representation layer for fine-tuning.") + repr_size = None + + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + VisionTransformer, variant, pretrained, + default_cfg=default_cfg, + img_size=img_size, + num_classes=num_classes, + representation_size=repr_size, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + + return model + + +@register_model +def vit_small_patch16_224(pretrained=False, **kwargs): + """ My custom 'small' ViT model. embed_dim=768, depth=8, num_heads=8, mlp_ratio=3. + NOTE: + * this differs from the DeiT based 'small' definitions with embed_dim=384, depth=12, num_heads=6 + * this model does not have a bias for QKV (unlike the official ViT and DeiT models) + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=8, num_heads=8, mlp_ratio=3., + qkv_bias=False, norm_layer=nn.LayerNorm, **kwargs) + if pretrained: + # NOTE my scale was wrong for original weights, leaving this here until I have better ones for this model + model_kwargs.setdefault('qk_scale', 768 ** -0.5) + model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict( + patch_size=32, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_huge_patch14_224_in21k(pretrained=False, **kwargs): + """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: converted weights not currently available, too large for github release hosting. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, representation_size=1280, **kwargs) + model = _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_tiny_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer('vit_deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_small_patch16_224(pretrained=False, **kwargs): + """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_deit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_base_patch16_224(pretrained=False, **kwargs): + """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_deit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_base_patch16_384(pretrained=False, **kwargs): + """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_deit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer( + 'vit_deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_small_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer( + 'vit_deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_base_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer( + 'vit_deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer( + 'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **model_kwargs) + return model -- Gitee From 883e1375ea828ef07d7e1e488ebff9d19566ba56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:48:23 +0000 Subject: [PATCH 09/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?= =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/models/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0cc3dca85cfc02f45ecf01a4dec74604928e66ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 01:37:55 +0000 Subject: [PATCH 10/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/train_performance_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh index f024d6cc71..85e322f06f 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh @@ -75,7 +75,7 @@ nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \ --weight-decay .05 \ --amp \ --img-size 224 \ - --epochs 5 \ + --epochs 1 \ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & -- Gitee From fc1f100426ae4570c05e96851dd003bf5cb7f85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 01:41:22 +0000 Subject: [PATCH 11/15] update t2t-main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../classification/T2T-ViT/{main.py => t2t-main.py} | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) rename PyTorch/contrib/cv/classification/T2T-ViT/{main.py => t2t-main.py} (99%) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py similarity index 99% rename from PyTorch/contrib/cv/classification/T2T-ViT/main.py rename to PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py index 1b3ceea782..d230656a48 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/main.py +++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py @@ -34,7 +34,7 @@ import torchvision.utils from torch.optim.optimizer import Optimizer from torch.nn.parallel import DistributedDataParallel as NativeDDP -from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset, create_loader from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model from timm.utils import * from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy @@ -42,8 +42,10 @@ from timm.optim import create_optimizer from timm.scheduler import create_scheduler from timm.utils import ApexScaler, NativeScaler -from data.myloader import create_loader + +#from data.myloader import create_loader from npu_fused_adamw import NpuFusedAdamW +from metrics import t2taccuracy torch.backends.cudnn.benchmark = True _logger = logging.getLogger('train') @@ -427,7 +429,7 @@ def main(): args, args_text = _parse_args() os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1' - os.environ['MASTER_PORT'] = '99999' # Any available port + os.environ['MASTER_PORT'] = '9999' # Any available port args.prefetcher = not args.no_prefetcher args.distributed = (args.workers > 1) @@ -681,6 +683,7 @@ def main(): lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + #exit() if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0 or args.workers == 1: _logger.info("Distributing BatchNorm running means and vars") @@ -875,7 +878,7 @@ 
def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='') target = target[0:target.size(0):reduce_factor] loss = loss_fn(output, target) - acc1, acc5 = accuracy(output, target, topk=(1, 5)) + acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) if args.distributed: reduced_loss = reduce_tensor(loss.data, args.world_size) -- Gitee From 2fba4c3aa6a75ff9bb1e712d6f35d80e770e05af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:36:36 +0000 Subject: [PATCH 12/15] update PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py index d230656a48..37dc538abe 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py +++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py @@ -45,7 +45,6 @@ from timm.utils import ApexScaler, NativeScaler #from data.myloader import create_loader from npu_fused_adamw import NpuFusedAdamW -from metrics import t2taccuracy torch.backends.cudnn.benchmark = True _logger = logging.getLogger('train') @@ -878,7 +877,7 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='') target = target[0:target.size(0):reduce_factor] loss = loss_fn(output, target) - acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) if args.distributed: reduced_loss = reduce_tensor(loss.data, args.world_size) -- Gitee From 1cd1d1eb6a291d8a01f61c2474219eeb2d47cac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:38:13 +0000 Subject: [PATCH 13/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../contrib/cv/classification/T2T-ViT/main.py | 913 ++++++++++++++++++ 1 file changed, 913 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py new file mode 100644 index 0000000000..9884699a79 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py @@ -0,0 +1,913 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +T2T-ViT training and evaluating script +This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/) +It was started from an early version of the PyTorch ImageNet example +(https://github.com/pytorch/examples/tree/master/imagenet) +""" +import argparse +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime +import models +from typing import Optional + +import torch +import torch.nn as nn +import torchvision.utils +from torch.optim.optimizer import Optimizer +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model +from timm.utils import * +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from timm.utils import ApexScaler, NativeScaler + +from data.myloader import create_loader +from npu_fused_adamw import NpuFusedAdamW +from metrics import t2taccuracy + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + +parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating') + +# Dataset / Model parameters +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL', + help='Name of model to train (default: "countception"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH', + help='path to eval checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=1000, metavar='N', + help='number of label classes (default: 1000)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). 
Model default if None.') +parser.add_argument('--img-size', type=int, default=224, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + +# Optimizer parameters +parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.005 for adamw)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') +parser.add_argument('--lr', type=float, default=5e-4, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') +parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 2)') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, 
default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') +parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "const")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.8, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=1.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.0)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +# Batch norm parameters (only works with gen_efficientnet based models currently) +parser.add_argument('--bn-tf', action='store_true', default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=True, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.99996, + help='decay factor for model weights moving average (default: 0.9998)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('-j', '--workers', type=int, default=8, metavar='N', + help='how many training processes to use (default: 1)') +parser.add_argument('--num-gpu', type=int, default=1, + help='Number of GPUS to use') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') + +parser.add_argument("--addr", default="127.0.0.1", type=str) +parser.add_argument("--performance", action='store_true', default=False, + help='whether get the model performance') + +has_apex = True + +import apex +from apex import amp +from apex.parallel import DistributedDataParallel as ApexDDP +from apex.parallel import convert_syncbn_model + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. 
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + +def add_weight_decay(model, weight_decay=1e-5, skip_list=()): + """Add weight decay + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + no_decay.append(param) + else: + decay.append(param) + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + +class Lookahead(Optimizer): + def __init__(self, base_optimizer, alpha=0.5, k=6): + # NOTE super().__init__() not called on purpose + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) + self._base_optimizer = base_optimizer + self.param_groups = base_optimizer.param_groups + self.defaults = base_optimizer.defaults + self.defaults.update(defaults) + self.state = defaultdict(dict) + # manually add our defaults to the param groups + for name, default in defaults.items(): + for group in self._base_optimizer.param_groups: + group.setdefault(name, default) + + @torch.no_grad() + def update_slow(self, group): + for fast_p in group["params"]: + if fast_p.grad is None: + continue + param_state = self._base_optimizer.state[fast_p] + if 'lookahead_slow_buff' not in param_state: + param_state['lookahead_slow_buff'] = torch.empty_like(fast_p) + param_state['lookahead_slow_buff'].copy_(fast_p) + slow = param_state['lookahead_slow_buff'] + slow.add_(fast_p - slow, alpha=group['lookahead_alpha']) + fast_p.copy_(slow) + + def sync_lookahead(self): + for group in self._base_optimizer.param_groups: + self.update_slow(group) + + @torch.no_grad() + def step(self, closure=None): + loss = self._base_optimizer.step(closure) + for group in self._base_optimizer.param_groups: + group['lookahead_step'] += 1 + if group['lookahead_step'] % group['lookahead_k'] == 0: + self.update_slow(group) + return loss + + def state_dict(self): + return self._base_optimizer.state_dict() + + def load_state_dict(self, state_dict): + self._base_optimizer.load_state_dict(state_dict) + self.param_groups = self._base_optimizer.param_groups + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + **kwargs): + """ Create an optimizer. + Only support npu fused AdamW and npu fused SGD + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + if weight_decay and filter_bias_and_bn: + skip = {} + if hasattr(model_or_params, 'no_weight_decay'): + skip = model_or_params.no_weight_decay() + parameters = add_weight_decay(model_or_params, weight_decay, skip) + weight_decay = 0. 
+ else: + parameters = model_or_params.parameters() + else: + # iterable of parameters or param groups passed in + parameters = model_or_params + + opt_lower = opt.lower() + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + # if 'fused' in opt_lower: + # assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(weight_decay=weight_decay, **kwargs) + if lr is not None: + opt_args.setdefault('lr', lr) + + # basic SGD & related + if opt_lower == 'sgd' or opt_lower == 'nesterov': + # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons + opt_args.pop('eps', None) + # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'adamw': + # optimizer = optim.AdamW(parameters, **opt_args) + optimizer = NpuFusedAdamW(parameters, **opt_args) + else: + print(opt_lower, flush=True) + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer + + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. + args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + setup_default_logging() + args, args_text = _parse_args() + + os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1' + os.environ['MASTER_PORT'] = '99999' # Any available port + + args.prefetcher = not args.no_prefetcher + args.distributed = (args.workers > 1) + + torch.npu.set_device(args.local_rank) + args.world_size = 1 + args.rank = args.local_rank # global rank + if args.distributed: + torch.npu.set_device(args.local_rank) + args.world_size = args.workers + torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size) + args.world_size = torch.distributed.get_world_size() + assert args.rank >= 0 + + if args.distributed: + _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' + % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on %d GPUs.' 
% args.num_gpu) + + torch.manual_seed(args.seed + args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + + if args.local_rank == 0 or args.workers == 1: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1)) + + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + use_amp = None + args.apex_amp = True + use_amp = 'apex' + + model.npu() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + optimizer = create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=True, + ) + # optimizer = create_optimizer(args, model) + + amp_autocast = suppress # do nothing + loss_scaler = None + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True) + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume=args.resume) + + if args.distributed: + if args.sync_bn: + assert not args.split_bn + try: + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + except Exception as e: + _logger.error('Failed to enable Synchronized BatchNorm. 
Install Apex or Torch >= 1.1') + + model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.performance: + num_epochs = 1 + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + train_dir = os.path.join(args.data, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + dataset_train = Dataset(train_dir) + + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + # re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + eval_dir = os.path.join(args.data, 'val') + if not os.path.isdir(eval_dir): + eval_dir = os.path.join(args.data, 'validation') + if not os.path.isdir(eval_dir): + _logger.error('Validation folder does not exist at: {}'.format(eval_dir)) + exit(1) + dataset_eval = Dataset(eval_dir) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + if args.jsd: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu() + elif mixup_active: + # smoothing is handled with mixup target transform + train_loss_fn = SoftTargetCrossEntropy().npu() + elif args.smoothing: + train_loss_fn = 
LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu() + else: + train_loss_fn = nn.CrossEntropyLoss().npu() + validate_loss_fn = nn.CrossEntropyLoss().npu() + + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + + if args.eval_checkpoint: # evaluate the model + load_checkpoint(model, args.eval_checkpoint, args.model_ema) + val_metrics = validate(model, loader_eval, validate_loss_fn, args) + print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%") + return + + saver = None + output_dir = '' + if args.local_rank == 0: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: # train the model + for epoch in range(start_epoch, num_epochs): + if args.distributed: + loader_train.sampler.set_epoch(epoch) + + + train_metrics = train_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0 or args.workers == 1: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * 
len(loader) + epoch_fps = [] + prof_list = [] + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.npu(), target.npu() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + if batch_idx in prof_list: + with torch.autograd.profiler.profile(use_npu=True) as prof: + output = model(input) + loss = loss_fn(output, target) + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4))) + sys.exit() + + else: + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + + torch.npu.synchronize() + if model_ema is not None: + model_ema.update(model) + num_updates += 1 + + batch_time_m.update(time.time() - end) + + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0 or args.workers == 1: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. 
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + epoch_fps.append(input.shape[0] * args.workers / (time.time() - end)) + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:]))) + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.npu() + target = target.npu() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From b7c1dce395f052f3fd10d787079fab6993c9c97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:38:52 +0000 Subject: [PATCH 14/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh index 09dd270d72..88152a40e9 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh @@ -66,7 +66,7 @@ if [ x"${etp_flag}" != x"true" ];then #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' fi -nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \ +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \ ${data_path} \ --model t2t_vit_14 \ --batch-size 64 \ -- Gitee From 2f18386bf49f5ecb5fdab8f76ec24a0c5d38c14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:39:07 +0000 Subject: [PATCH 15/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/train_performance_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh index 6c1828793e..f6018663ff 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh @@ -67,7 +67,7 @@ if [ x"${etp_flag}" != x"true" ];then #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' fi -nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \ +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \ ${data_path} \ --model t2t_vit_14 \ --batch-size 64 \ -- Gitee
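
Patches 11 and 12 swap `accuracy` (from timm.utils) and `t2taccuracy` (from the local metrics.py) inside validate(); both compute standard top-k classification accuracy over a batch of logits. metrics.py itself is not included in this part of the patch series, so the snippet below is only a minimal, self-contained sketch of what such a top-1/top-5 helper conventionally looks like (it mirrors timm's accuracy), not the actual file contents:

import torch

def topk_accuracy(output, target, topk=(1, 5)):
    # output: (batch, num_classes) logits; target: (batch,) ground-truth class indices.
    # Returns the top-k accuracies in percent, one value per k in `topk`.
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk highest-scoring classes per sample, transposed to (maxk, batch).
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    # correct[i, j] is True when the (i+1)-th ranked prediction for sample j hits the target.
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum(0) * 100.0 / batch_size for k in topk]

Any helper with this signature can stand in for the acc1, acc5 = ... call in validate() of main.py / t2t-main.py.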