From 35e040c59c3e52c7583f97efd63a7dc795831255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:46:23 +0000
Subject: [PATCH 01/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20T2T-ViT?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep
new file mode 100644
index 0000000000..e69de29bb2
--
Gitee
From dfce126ce662969faeb2f176c6006af0aec13016 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:46:56 +0000
Subject: [PATCH 02/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20models?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep
new file mode 100644
index 0000000000..e69de29bb2
--
Gitee
From 66f0272ce86deb381238bc7e3c10648aa0970cc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:47:06 +0000
Subject: [PATCH 03/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?=
=?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep
deleted file mode 100644
index e69de29bb2..0000000000
--
Gitee
From 276054e344e499285a1ffa1fdb344aa9c14a5dc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:47:14 +0000
Subject: [PATCH 04/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
new file mode 100644
index 0000000000..e69de29bb2
--
Gitee
From 3f8712405b7f0c5eb2238c95dd996f77a158633f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:47:34 +0000
Subject: [PATCH 05/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../cv/classification/T2T-ViT/Dockerfile | 5 +
.../contrib/cv/classification/T2T-ViT/LICENSE | 201 ++++
.../cv/classification/T2T-ViT/README.md | 165 ++++
.../cv/classification/T2T-ViT/README_raw.md | 202 ++++
.../cv/classification/T2T-ViT/docker_start.sh | 25 +
.../contrib/cv/classification/T2T-ViT/main.py | 912 ++++++++++++++++++
.../cv/classification/T2T-ViT/metrics.py | 41 +
.../classification/T2T-ViT/modelzoo_level.txt | 3 +
.../classification/T2T-ViT/npu_fused_adamw.py | 255 +++++
.../classification/T2T-ViT/requirements.txt | 4 +
10 files changed, 1813 insertions(+)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/LICENSE
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README.md
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/metrics.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile
new file mode 100644
index 0000000000..7e712fe1a1
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile
@@ -0,0 +1,5 @@
+ARG FROM_IMAGE_NAME
+FROM $FROM_IMAGE_NAME
+
+COPY requirements.txt .
+RUN pip3.7 install -r requirements.txt
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE
new file mode 100644
index 0000000000..753842b672
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README.md b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
new file mode 100644
index 0000000000..9fce4d9354
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
@@ -0,0 +1,165 @@
+# T2T-ViT for PyTorch
+
+- [Overview](概述.md)
+- [Preparing the Training Environment](准备训练环境.md)
+- [Starting Training](开始训练.md)
+- [Training Results](训练结果展示.md)
+- [Release Notes](版本说明.md)
+
+
+
+# Overview
+
+## Description
+
+T2T-ViT addresses image classification in computer vision and classifies images by capturing their local structure. ViT splits an image into tokens and models global dependencies with stacked Transformers for classification, but this discards locality, even though local information such as edges, lines, and textures is important for visual understanding. In addition, the attention backbone of ViT contains redundancy, its intermediate features have limited richness, and the model is hard to train. The paper therefore proposes a Tokens-to-Token (T2T) module that models the local structure of an image, together with a more efficient Transformer backbone design that enriches intermediate features and reduces redundancy. With these changes, a vision Transformer pretrained only on ImageNet outperforms CNN-based ResNet architectures, and the design ideas have had a positive influence on later work on vision Transformers.
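+
+A minimal sketch of the "soft split" re-tokenization step described above (illustrative only, built on `torch.nn.Unfold`; it is not the implementation in this repository's `models/` directory):
+
+```
+# One Tokens-to-Token "soft split": reshape the tokens back into a feature map,
+# then re-tokenize it with overlapping 3x3 patches so that each new token
+# carries local structure (edges, lines, textures) from its neighbours.
+import torch
+import torch.nn as nn
+
+tokens = torch.randn(2, 56 * 56, 64)               # (B, N, C) tokens from a previous stage
+B, N, C = tokens.shape
+H = W = int(N ** 0.5)
+feature_map = tokens.transpose(1, 2).reshape(B, C, H, W)
+soft_split = nn.Unfold(kernel_size=3, stride=2, padding=1)
+new_tokens = soft_split(feature_map).transpose(1, 2)
+print(new_tokens.shape)                            # torch.Size([2, 784, 576]): fewer, richer tokens
+```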
+
+- Reference implementation:
+
+ ```
+ url=https://github.com/yitu-opensource/T2T-ViT.git
+ commit_id=0f63dc9558f4d192de926504dbddfa1b3f5db6ca
+ ```
+
+- Implementation adapted for Ascend AI Processors:
+
+ ```
+ url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+ code_path=PyTorch/contrib/cv/classification
+ ```
+
+- Obtain the code through Git as follows:
+
+ ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the directory containing the model code; not needed if the repository only contains this model
+ ```
+
+- Alternatively, click "Download Now" to download the source code package.
+
+# Preparing the Training Environment
+
+## Environment Setup
+
+- The firmware and drivers, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1**  Version compatibility table
+
+  | Component          | Version |
+  | ------------------ | ------------------------------------------------------------ |
+  | Firmware & drivers | [5.1.RC2](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN               | [5.1.RC2](https://www.hiascend.com/software/cann/commercial?version=5.1.RC2) |
+  | PyTorch            | [1.5.0](https://gitee.com/ascend/pytorch/tree/master/) |
+
+
+- Environment setup guide.
+
+  See the guide [Preparing a PyTorch Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes).
+
+- Install the dependencies.
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+   Download the original ImageNet2012 dataset yourself, upload it to any directory on the server, and extract it. The dataset directory structure should look like the following.
+ ```
+   ├── ImageNet2012
+         ├──train
+              ├──class_1
+                    │──image_1
+                    │──image_2
+                    │   ...
+              ├──class_2
+                    │──image_1
+                    │──image_2
+                    │   ...
+              ├──...
+         ├──val
+              ├──class_1
+                    │──image_1
+                    │──image_2
+                    │   ...
+              ├──class_2
+                    │──image_1
+                    │──image_2
+                    │   ...
+ ```
+
+
+   > **Note:**
+   > The training scripts for this dataset are provided only as a reference example.
+
+# Starting Training
+
+## Training the Model
+
+1. Enter the root directory of the extracted source package.
+
+ ```
+    cd /${model_folder_name}
+ ```
+
+2. Run the training scripts.
+
+   The model supports single-machine 1-card (1p) training and single-machine 8-card (8p) training.
+
+   - Single-machine 1-card training
+
+     Start 1p training.
+
+ ```
+ bash ./test/train_full_1p.sh --data_path=/data/xxx/
+ ```
+
+   - Single-machine 8-card training
+
+     Start 8p training.
+
+ ```
+ bash ./test/train_full_8p.sh --data_path=/data/xxx/
+ ```
+
+   Set the --data\_path parameter to the dataset path.
+
+   The training script parameters are described below.
+
+ ```
+   Common parameters:
+   --data_path                              // path to the dataset
+ ```
+
+
+
+Log output paths:
+
+    test/output/device_id/train_${device_id}.log              # training detail log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_perf.log        # 8p training performance result log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_acc.log         # 8p training accuracy result log
+
+# Training Results
+
+**Table 2**  Training results
+
+| Name | Acc@1 | FPS | Epochs | AMP_Type |
+| ------ | ----- | ---- | ------ | -------- |
+| GPU-1p | - | - | 1 | O1 |
+| GPU-8p | - | - | 300 | O1 |
+| NPU-1p | - | - | 1 | O1 |
+| NPU-8p | - | - | 300 | O1 |
+
+# Release Notes
+
+## Changes
+
+2022.11.22: First release.
+
+## Known Issues
+
+None.
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
new file mode 100644
index 0000000000..d4c537021e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
@@ -0,0 +1,202 @@
+# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, [ICCV 2021](https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.html)
+
+### Update:
+2021/03/11: updated our results. Our T2T-ViT-14 with 21.5M parameters now reaches 81.5% top-1 accuracy at 224x224 image resolution, and 83.3% top-1 accuracy at 384x384 resolution.
+
+2021/02/21: T2T-ViT can be trained stably on most common GPUs (1080Ti, 2080Ti, TITAN V, V100) with '--amp' (Automatic Mixed Precision). On some specific GPUs such as the Tesla T4, 'amp' can cause NaN loss when training T2T-ViT. If you get NaN loss during training, you can disable AMP by removing '--amp' from the [training scripts](https://github.com/yitu-opensource/T2T-ViT#train).
+
+2021/01/28: released the code and uploaded most of the pretrained T2T-ViT models.
+
+
+
+
+
+## Reference
+If you find this repo useful, please consider citing:
+```
+@InProceedings{Yuan_2021_ICCV,
+ author = {Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Jiang, Zi-Hang and Tay, Francis E.H. and Feng, Jiashi and Yan, Shuicheng},
+ title = {Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet},
+ booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+ month = {October},
+ year = {2021},
+ pages = {558-567}
+}
+```
+
+Our code is based on the [official ImageNet example](https://github.com/pytorch/examples/tree/master/imagenet) by [PyTorch](https://pytorch.org/) and on [pytorch-image-models](https://github.com/rwightman/pytorch-image-models) by [Ross Wightman](https://github.com/rwightman).
+
+
+## 1. Requirements
+
+[timm](https://github.com/rwightman/pytorch-image-models), pip install timm==0.3.4
+
+torch>=1.4.0
+
+torchvision>=0.5.0
+
+pyyaml
+
+Data preparation: ImageNet with the following folder structure; you can extract ImageNet with this [script](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4).
+
+```
+│imagenet/
+├──train/
+│ ├── n01440764
+│ │ ├── n01440764_10026.JPEG
+│ │ ├── n01440764_10027.JPEG
+│ │ ├── ......
+│ ├── ......
+├──val/
+│ ├── n01440764
+│ │ ├── ILSVRC2012_val_00000293.JPEG
+│ │ ├── ILSVRC2012_val_00002138.JPEG
+│ │ ├── ......
+│ ├── ......
+```
+
+## 2. T2T-ViT Models
+
+
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | Performer | 81.5 | 21.5M | 4.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar)|
+| T2T-ViT-19 | Performer | 81.9 | 39.2M | 8.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.9_T2T_ViT_19.pth.tar)|
+| T2T-ViT-24 | Performer | 82.3 | 64.1M | 13.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.3_T2T_ViT_24.pth.tar)|
+| T2T-ViT-14, 384 | Performer | 83.3 | 21.7M | | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/83.3_T2T_ViT_14.pth.tar)|
+| T2T-ViT-24, Token Labeling | Performer | 84.2 | 65M | | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/84.2_T2T_ViT_24.pth.tar)|
+| T2T-ViT_t-14 | Transformer | 81.7 | 21.5M | 6.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.7_T2T_ViTt_14.pth.tar) |
+| T2T-ViT_t-19 | Transformer | 82.4 | 39.2M | 9.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.4_T2T_ViTt_19.pth.tar) |
+| T2T-ViT_t-24 | Transformer | 82.6 | 64.1M | 15.0G| [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.6_T2T_ViTt_24.pth.tar) |
+
+'T2T-ViT-14, 384' means we train T2T-ViT-14 with an image size of 384 x 384.
+
+'T2T-ViT-24, Token Labeling' means we train T2T-ViT-24 with [Token Labeling](https://github.com/zihangJiang/TokenLabeling).
+
+The three lite variants of T2T-ViT (compared with MobileNets):
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-7 | Performer | 71.7 | 4.3M | 1.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar)|
+| T2T-ViT-10 | Performer | 75.2 | 5.9M | 1.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar)|
+| T2T-ViT-12 | Performer | 76.5 | 6.9M | 1.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar) |
+
+
+### Usage
+How to use our pretrained T2T-ViT models:
+```
+from models.t2t_vit import *
+from utils import load_for_transfer_learning
+
+# create model
+model = t2t_vit_14()
+
+# load the pretrained weights
+load_for_transfer_learning(model, '/path/to/pretrained/weights', use_ema=True, strict=False, num_classes=1000)  # change num_classes to match your dataset; different image sizes also work because the position embedding is interpolated.
+```
+
+
+## 3. Validation
+
+Test T2T-ViT-14 (which uses Performer in the T2T module):
+
+Download the [T2T-ViT-14](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar), then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 -b 100 --eval_checkpoint path/to/checkpoint
+```
+The results look like:
+
+```
+Test: [ 0/499] Time: 2.083 (2.083) Loss: 0.3578 (0.3578) Acc@1: 96.0000 (96.0000) Acc@5: 99.0000 (99.0000)
+Test: [ 50/499] Time: 0.166 (0.202) Loss: 0.5823 (0.6404) Acc@1: 85.0000 (86.1569) Acc@5: 99.0000 (97.5098)
+...
+Test: [ 499/499] Time: 0.272 (0.172) Loss: 1.3983 (0.8261) Acc@1: 62.0000 (81.5000) Acc@5: 93.0000 (95.6660)
+Top-1 accuracy of the model is: 81.5%
+
+```
+
+Test the three lite variants T2T-ViT-7, T2T-ViT-10, and T2T-ViT-12 (which use Performer in the T2T module):
+
+Download the [T2T-ViT-7](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar), [T2T-ViT-10](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar) or [T2T-ViT-12](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar), then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_7 -b 100 --eval_checkpoint path/to/checkpoint
+```
+
+Test the model T2T-ViT-14, 384 with 83.3\% top-1 accuracy:
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 --img-size 384 -b 100 --eval_checkpoint path/to/T2T-ViT-14-384
+```
+
+
+## 4. Train
+
+Train the three lite variants T2T-ViT-7, T2T-ViT-10, and T2T-ViT-12 (which use Performer in the T2T module):
+
+If only 4 GPUs are available,
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_7 -b 128 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+The top-1 accuracy with 4 GPUs is slightly lower than with 8 GPUs (around 0.1%-0.3% lower).
+
+If 8 GPUs are available:
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_7 -b 64 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+
+Train the T2T-ViT-14 and T2T-ViT_t-14 (run on 4 or 8 GPUs):
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_14 -b 128 --lr 1e-3 --weight-decay .05 --amp --img-size 224
+```
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_14 -b 64 --lr 5e-4 --weight-decay .05 --amp --img-size 224
+```
+If you want to train our T2T-ViT on images with 384x384 resolution, please use '--img-size 384'.
+
+
+Train the T2T-ViT-19, T2T-ViT-24 or T2T-ViT_t-19, T2T-ViT_t-24:
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_19 -b 64 --lr 5e-4 --weight-decay .065 --amp --img-size 224
+```
+
+## 5. Transfer T2T-ViT to CIFAR10/CIFAR100
+
+| Model | ImageNet | CIFAR10 | CIFAR100| #params|
+| :--- | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | 81.5 |[98.3](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_14_98.3.pth) | [88.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cirfar100_t2t-vit-14_88.4.pth) | 21.5M |
+| T2T-ViT-19 | 81.9 |[98.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_19_98.4.pth) | [89.0](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar100_t2t-vit-19_89.0.pth) |39.2M |
+
+We resize CIFAR10/100 images to 224x224 and finetune our pretrained T2T-ViT-14/19 on CIFAR10/100 by running:
+
+```
+CUDA_VISIBLE_DEVICES=0,1 python transfer_learning.py --lr 0.05 --b 64 --num-classes 10 --img-size 224 --transfer-learning True --transfer-model /path/to/pretrained/T2T-ViT-19
+```
+
+## 6. Visualization
+
+To visualize the image features of ResNet50, open and run the [visualization_resnet.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_resnet.ipynb) notebook in Jupyter Notebook or JupyterLab; some results are shown below:
+
+
+
+
+
+To visualize the image features of ViT, open and run the [visualization_vit.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_vit.ipynb) notebook in Jupyter Notebook or JupyterLab; some results are shown below:
+
+
+
+
+
+To visualize attention maps, you can refer to this [file](https://github.com/jeonsworld/ViT-pytorch/blob/main/visualize_attention_map.ipynb). A simple example that visualizes the attention maps in attention blocks 4 and 5 is shown below:
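+
+A minimal sketch of capturing intermediate block outputs with forward hooks (module names such as `model.blocks` are assumptions about `models/t2t_vit.py`; for true attention maps, follow the notebook linked above):
+
+```
+# Illustrative only: register forward hooks on two transformer blocks and
+# inspect the shapes of the token features they produce.
+import torch
+from models.t2t_vit import t2t_vit_14
+
+model = t2t_vit_14().eval()
+captured = {}
+
+def save_output(name):
+    def hook(module, inputs, output):
+        captured[name] = output.detach()
+    return hook
+
+for idx in (3, 4):                                  # "block 4 and 5", 0-indexed; attribute name assumed
+    model.blocks[idx].register_forward_hook(save_output(f'block_{idx + 1}'))
+
+with torch.no_grad():
+    model(torch.randn(1, 3, 224, 224))
+print({name: feat.shape for name, feat in captured.items()})
+```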
+
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh
new file mode 100644
index 0000000000..46ce9a02ec
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+docker_image=$1
+data_dir=$2
+model_dir=$3
+
+docker run -it --ipc=host \
+ --device=/dev/davinci0 \
+ --device=/dev/davinci1 \
+ --device=/dev/davinci2 \
+ --device=/dev/davinci3 \
+ --device=/dev/davinci4 \
+ --device=/dev/davinci5 \
+ --device=/dev/davinci6 \
+ --device=/dev/davinci7 \
+ --device=/dev/davinci_manager \
+ --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+ -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+ -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+ -v ${model_dir}:${model_dir} \
+ -v ${data_dir}:${data_dir} \
+ -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+ -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
+ -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
+ /bin/bash
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py
new file mode 100644
index 0000000000..1b3ceea782
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py
@@ -0,0 +1,912 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT training and evaluating script
+This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/)
+It was started from an early version of the PyTorch ImageNet example
+(https://github.com/pytorch/examples/tree/master/imagenet)
+"""
+import argparse
+import time
+import yaml
+import os
+import logging
+from collections import OrderedDict, defaultdict
+from contextlib import suppress
+from datetime import datetime
+import models
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torchvision.utils
+from torch.optim.optimizer import Optimizer
+from torch.nn.parallel import DistributedDataParallel as NativeDDP
+
+from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model
+from timm.utils import *
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
+from timm.optim import create_optimizer
+from timm.scheduler import create_scheduler
+from timm.utils import ApexScaler, NativeScaler
+
+from data.myloader import create_loader
+from npu_fused_adamw import NpuFusedAdamW
+
+torch.backends.cudnn.benchmark = True
+_logger = logging.getLogger('train')
+
+# The first arg parser parses out only the --config argument, this argument is used to
+# load a yaml file containing key-values that override the defaults for the main parser below
+config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False)
+parser.add_argument('-c', '--config', default='', type=str, metavar='FILE',
+ help='YAML config file specifying default arguments')
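+# Example of a hypothetical YAML config passed via `-c my_config.yaml`; keys must
+# match the argparse destinations defined below and override their defaults:
+#     model: T2t_vit_14
+#     batch_size: 64
+#     lr: 5e-4
+#     weight_decay: 0.05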
+
+parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating')
+
+# Dataset / Model parameters
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL',
+                    help='Name of model to train (default: "T2t_vit_14")')
+parser.add_argument('--pretrained', action='store_true', default=False,
+ help='Start with pretrained version of specified network (if avail)')
+parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH',
+ help='Initialize model from this checkpoint (default: none)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+ help='Resume full model and optimizer state from checkpoint (default: none)')
+parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH',
+ help='path to eval checkpoint (default: none)')
+parser.add_argument('--no-resume-opt', action='store_true', default=False,
+ help='prevent resume of optimizer state when resuming model')
+parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
+ help='number of label classes (default: 1000)')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+ help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--img-size', type=int, default=224, metavar='N',
+                    help='Image size (default: 224)')
+parser.add_argument('--crop-pct', default=None, type=float,
+ metavar='N', help='Input image center crop percent (for validation only)')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
+ help='input batch size for training (default: 64)')
+parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
+ help='ratio of validation batch size to training batch size (default: 1)')
+
+# Optimizer parameters
+parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
+                    help='Optimizer (default: "adamw")')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON',
+ help='Optimizer Epsilon (default: None, use opt default)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
+ help='Optimizer Betas (default: None, use opt default)')
+parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+ help='Optimizer momentum (default: 0.9)')
+parser.add_argument('--weight-decay', type=float, default=0.05,
+                    help='weight decay (default: 0.05)')
+parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
+ help='Clip gradient norm (default: None, no clipping)')
+
+# Learning rate schedule parameters
+parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
+                    help='LR scheduler (default: "cosine")')
+parser.add_argument('--lr', type=float, default=5e-4, metavar='LR',
+                    help='learning rate (default: 5e-4)')
+parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
+ help='learning rate noise on/off epoch percentages')
+parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
+ help='learning rate noise limit percent (default: 0.67)')
+parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
+ help='learning rate noise std-dev (default: 1.0)')
+parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
+ help='learning rate cycle len multiplier (default: 1.0)')
+parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
+ help='learning rate cycle limit')
+parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR',
+                    help='warmup learning rate (default: 1e-6)')
+parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',
+ help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')
+parser.add_argument('--epochs', type=int, default=300, metavar='N',
+                    help='number of epochs to train (default: 300)')
+parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
+ help='manual epoch number (useful on restarts)')
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
+ help='epoch interval to decay LR')
+parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N',
+ help='epochs to warmup LR, if scheduler supports')
+parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
+ help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
+parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
+                    help='patience epochs for Plateau LR scheduler (default: 10)')
+parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
+ help='LR decay rate (default: 0.1)')
+
+# Augmentation & regularization parameters
+parser.add_argument('--no-aug', action='store_true', default=False,
+ help='Disable all training augmentation, override other train aug args')
+parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT',
+ help='Random resize scale (default: 0.08 1.0)')
+parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO',
+ help='Random resize aspect ratio (default: 0.75 1.33)')
+parser.add_argument('--hflip', type=float, default=0.5,
+ help='Horizontal flip training aug probability')
+parser.add_argument('--vflip', type=float, default=0.,
+ help='Vertical flip training aug probability')
+parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT',
+ help='Color jitter factor (default: 0.4)')
+parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
+                    help='Use AutoAugment policy. "v0" or "original". (default: "rand-m9-mstd0.5-inc1")')
+parser.add_argument('--aug-splits', type=int, default=0,
+ help='Number of augmentation splits (default: 0, valid: 0 or >=2)')
+parser.add_argument('--jsd', action='store_true', default=False,
+ help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
+parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
+ help='Random erase prob (default: 0.25)')
+parser.add_argument('--remode', type=str, default='pixel',
+                    help='Random erase mode (default: "pixel")')
+parser.add_argument('--recount', type=int, default=1,
+ help='Random erase count (default: 1)')
+parser.add_argument('--resplit', action='store_true', default=False,
+ help='Do not random erase first (clean) augmentation split')
+parser.add_argument('--mixup', type=float, default=0.8,
+                    help='mixup alpha, mixup enabled if > 0. (default: 0.8)')
+parser.add_argument('--cutmix', type=float, default=1.0,
+                    help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)')
+parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
+ help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+parser.add_argument('--mixup-prob', type=float, default=1.0,
+ help='Probability of performing mixup or cutmix when either/both is enabled')
+parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
+ help='Probability of switching to cutmix when both mixup and cutmix enabled')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+ help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
+parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N',
+ help='Turn off mixup after this epoch, disabled if 0 (default: 0)')
+parser.add_argument('--smoothing', type=float, default=0.1,
+ help='Label smoothing (default: 0.1)')
+parser.add_argument('--train-interpolation', type=str, default='random',
+                    help='Training interpolation (random, bilinear, bicubic; default: "random")')
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
+ help='Dropout rate (default: 0.0)')
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
+ help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
+parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT',
+                    help='Drop path rate (default: 0.1)')
+parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
+ help='Drop block rate (default: None)')
+
+# Batch norm parameters (only works with gen_efficientnet based models currently)
+parser.add_argument('--bn-tf', action='store_true', default=False,
+ help='Use Tensorflow BatchNorm defaults for models that support it (default: False)')
+parser.add_argument('--bn-momentum', type=float, default=None,
+ help='BatchNorm momentum override (if not None)')
+parser.add_argument('--bn-eps', type=float, default=None,
+ help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--sync-bn', action='store_true',
+ help='Enable NVIDIA Apex or Torch synchronized BatchNorm.')
+parser.add_argument('--dist-bn', type=str, default='',
+ help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")')
+parser.add_argument('--split-bn', action='store_true',
+ help='Enable separate BN layers per augmentation split.')
+
+# Model Exponential Moving Average
+parser.add_argument('--model-ema', action='store_true', default=True,
+ help='Enable tracking moving average of model weights')
+parser.add_argument('--model-ema-force-cpu', action='store_true', default=False,
+ help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.99996,
+                    help='decay factor for model weights moving average (default: 0.99996)')
+
+# Misc
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+ help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+ help='how many batches to wait before logging training status')
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
+ help='how many batches to wait before writing recovery checkpoint')
+parser.add_argument('-j', '--workers', type=int, default=8, metavar='N',
+                    help='how many training processes to use (default: 8)')
+parser.add_argument('--num-gpu', type=int, default=1,
+ help='Number of GPUS to use')
+parser.add_argument('--save-images', action='store_true', default=False,
+                    help='save images of input batches every log interval for debugging')
+parser.add_argument('--amp', action='store_true', default=False,
+ help='use NVIDIA Apex AMP or Native AMP for mixed precision training')
+parser.add_argument('--apex-amp', action='store_true', default=False,
+ help='Use NVIDIA Apex AMP mixed precision')
+parser.add_argument('--native-amp', action='store_true', default=False,
+ help='Use Native Torch AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+ help='Use channels_last memory layout')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+ help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+ help='disable fast prefetcher')
+parser.add_argument('--output', default='', type=str, metavar='PATH',
+ help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "top1")')
+parser.add_argument('--tta', type=int, default=0, metavar='N',
+ help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)')
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False,
+ help='use the multi-epochs-loader to save time at the beginning of every epoch')
+
+parser.add_argument("--addr", default="127.0.0.1", type=str)
+parser.add_argument("--performance", action='store_true', default=False,
+ help='whether get the model performance')
+
+has_apex = True
+
+import apex
+from apex import amp
+from apex.parallel import DistributedDataParallel as ApexDDP
+from apex.parallel import convert_syncbn_model
+
+def optimizer_kwargs(cfg):
+ """ cfg/argparse to kwargs helper
+ Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn.
+ """
+ kwargs = dict(
+ opt=cfg.opt,
+ lr=cfg.lr,
+ weight_decay=cfg.weight_decay,
+ momentum=cfg.momentum)
+ if getattr(cfg, 'opt_eps', None) is not None:
+ kwargs['eps'] = cfg.opt_eps
+ if getattr(cfg, 'opt_betas', None) is not None:
+ kwargs['betas'] = cfg.opt_betas
+ if getattr(cfg, 'opt_args', None) is not None:
+ kwargs.update(cfg.opt_args)
+ return kwargs
+
+def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
+ """Add weight decay
+ """
+ decay = []
+ no_decay = []
+ for name, param in model.named_parameters():
+ if not param.requires_grad:
+ continue # frozen weights
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
+ no_decay.append(param)
+ else:
+ decay.append(param)
+ return [
+ {'params': no_decay, 'weight_decay': 0.},
+ {'params': decay, 'weight_decay': weight_decay}]
+
+class Lookahead(Optimizer):
+ def __init__(self, base_optimizer, alpha=0.5, k=6):
+ # NOTE super().__init__() not called on purpose
+ if not 0.0 <= alpha <= 1.0:
+ raise ValueError(f'Invalid slow update rate: {alpha}')
+ if not 1 <= k:
+ raise ValueError(f'Invalid lookahead steps: {k}')
+ defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0)
+ self._base_optimizer = base_optimizer
+ self.param_groups = base_optimizer.param_groups
+ self.defaults = base_optimizer.defaults
+ self.defaults.update(defaults)
+ self.state = defaultdict(dict)
+ # manually add our defaults to the param groups
+ for name, default in defaults.items():
+ for group in self._base_optimizer.param_groups:
+ group.setdefault(name, default)
+
+ @torch.no_grad()
+ def update_slow(self, group):
+ for fast_p in group["params"]:
+ if fast_p.grad is None:
+ continue
+ param_state = self._base_optimizer.state[fast_p]
+ if 'lookahead_slow_buff' not in param_state:
+ param_state['lookahead_slow_buff'] = torch.empty_like(fast_p)
+ param_state['lookahead_slow_buff'].copy_(fast_p)
+ slow = param_state['lookahead_slow_buff']
+ slow.add_(fast_p - slow, alpha=group['lookahead_alpha'])
+ fast_p.copy_(slow)
+
+ def sync_lookahead(self):
+ for group in self._base_optimizer.param_groups:
+ self.update_slow(group)
+
+ @torch.no_grad()
+ def step(self, closure=None):
+ loss = self._base_optimizer.step(closure)
+ for group in self._base_optimizer.param_groups:
+ group['lookahead_step'] += 1
+ if group['lookahead_step'] % group['lookahead_k'] == 0:
+ self.update_slow(group)
+ return loss
+
+ def state_dict(self):
+ return self._base_optimizer.state_dict()
+
+ def load_state_dict(self, state_dict):
+ self._base_optimizer.load_state_dict(state_dict)
+ self.param_groups = self._base_optimizer.param_groups
+
+def create_optimizer_v2(
+ model_or_params,
+ opt: str = 'sgd',
+ lr: Optional[float] = None,
+ weight_decay: float = 0.,
+ momentum: float = 0.9,
+ filter_bias_and_bn: bool = True,
+ **kwargs):
+ """ Create an optimizer.
+ Only support npu fused AdamW and npu fused SGD
+ """
+ if isinstance(model_or_params, nn.Module):
+ # a model was passed in, extract parameters and add weight decays to appropriate layers
+ if weight_decay and filter_bias_and_bn:
+ skip = {}
+ if hasattr(model_or_params, 'no_weight_decay'):
+ skip = model_or_params.no_weight_decay()
+ parameters = add_weight_decay(model_or_params, weight_decay, skip)
+ weight_decay = 0.
+ else:
+ parameters = model_or_params.parameters()
+ else:
+ # iterable of parameters or param groups passed in
+ parameters = model_or_params
+
+ opt_lower = opt.lower()
+ opt_split = opt_lower.split('_')
+ opt_lower = opt_split[-1]
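+    # An optional prefix such as 'lookahead_adamw' selects the base optimizer from
+    # the suffix and wraps it with Lookahead at the end of this function
+    # (see the `opt_split[0] == 'lookahead'` check below).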
+ # if 'fused' in opt_lower:
+ # assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'
+
+ opt_args = dict(weight_decay=weight_decay, **kwargs)
+ if lr is not None:
+ opt_args.setdefault('lr', lr)
+
+ # basic SGD & related
+ if opt_lower == 'sgd' or opt_lower == 'nesterov':
+ # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons
+ opt_args.pop('eps', None)
+ # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+ optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+ elif opt_lower == 'momentum':
+ opt_args.pop('eps', None)
+ # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+ optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+ elif opt_lower == 'adamw':
+ # optimizer = optim.AdamW(parameters, **opt_args)
+ optimizer = NpuFusedAdamW(parameters, **opt_args)
+    else:
+        raise ValueError(f'Invalid optimizer: {opt_lower}')
+
+ if len(opt_split) > 1:
+ if opt_split[0] == 'lookahead':
+ optimizer = Lookahead(optimizer)
+
+ return optimizer
+
+
+
+def _parse_args():
+ # Do we have a config file to parse?
+ args_config, remaining = config_parser.parse_known_args()
+ if args_config.config:
+ with open(args_config.config, 'r') as f:
+ cfg = yaml.safe_load(f)
+ parser.set_defaults(**cfg)
+
+ # The main arg parser parses the rest of the args, the usual
+ # defaults will have been overridden if config file specified.
+ args = parser.parse_args(remaining)
+
+ # Cache the args as a text string to save them in the output dir later
+ args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+ return args, args_text
+
+
+def main():
+ setup_default_logging()
+ args, args_text = _parse_args()
+
+ os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1'
+    os.environ['MASTER_PORT'] = '29500'  # any free port (valid ports are below 65536)
+
+ args.prefetcher = not args.no_prefetcher
+ args.distributed = (args.workers > 1)
+
+ torch.npu.set_device(args.local_rank)
+ args.world_size = 1
+ args.rank = args.local_rank # global rank
+ if args.distributed:
+ torch.npu.set_device(args.local_rank)
+ args.world_size = args.workers
+ torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size)
+ args.world_size = torch.distributed.get_world_size()
+ assert args.rank >= 0
+
+ if args.distributed:
+ _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
+ % (args.rank, args.world_size))
+ else:
+ _logger.info('Training with a single process on %d GPUs.' % args.num_gpu)
+
+ torch.manual_seed(args.seed + args.rank)
+
+ model = create_model(
+ args.model,
+ pretrained=args.pretrained,
+ num_classes=args.num_classes,
+ drop_rate=args.drop,
+ drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path
+ drop_path_rate=args.drop_path,
+ drop_block_rate=args.drop_block,
+ global_pool=args.gp,
+ bn_tf=args.bn_tf,
+ bn_momentum=args.bn_momentum,
+ bn_eps=args.bn_eps,
+ checkpoint_path=args.initial_checkpoint,
+ img_size=args.img_size)
+
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info('Model %s created, param count: %d' %
+ (args.model, sum([m.numel() for m in model.parameters()])))
+
+ data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1))
+
+ num_aug_splits = 0
+ if args.aug_splits > 0:
+ assert args.aug_splits > 1, 'A split of 1 makes no sense'
+ num_aug_splits = args.aug_splits
+
+ if args.split_bn:
+ assert num_aug_splits > 1 or args.resplit
+ model = convert_splitbn_model(model, max(num_aug_splits, 2))
+
+ use_amp = None
+ args.apex_amp = True
+ use_amp = 'apex'
+
+ model.npu()
+ if args.channels_last:
+ model = model.to(memory_format=torch.channels_last)
+
+ optimizer = create_optimizer_v2(
+ model,
+ **optimizer_kwargs(cfg=args),
+ filter_bias_and_bn=True,
+ )
+ # optimizer = create_optimizer(args, model)
+
+ amp_autocast = suppress # do nothing
+ loss_scaler = None
+ model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True)
+ loss_scaler = ApexScaler()
+ if args.local_rank == 0:
+ _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.')
+
+ # optionally resume from a checkpoint
+ resume_epoch = None
+ if args.resume:
+ resume_epoch = resume_checkpoint(
+ model, args.resume,
+ optimizer=None if args.no_resume_opt else optimizer,
+ loss_scaler=None if args.no_resume_opt else loss_scaler,
+ log_info=args.local_rank == 0)
+
+ model_ema = None
+ if args.model_ema:
+ # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
+ model_ema = ModelEma(
+ model,
+ decay=args.model_ema_decay,
+ device='cpu' if args.model_ema_force_cpu else '',
+ resume=args.resume)
+
+ if args.distributed:
+ if args.sync_bn:
+ assert not args.split_bn
+ try:
+ if has_apex and use_amp != 'native':
+ # Apex SyncBN preferred unless native amp is activated
+ model = convert_syncbn_model(model)
+ else:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+ if args.local_rank == 0:
+ _logger.info(
+ 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
+ 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')
+ except Exception as e:
+ _logger.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
+
+ model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1
+ # NOTE: EMA model does not need to be wrapped by DDP
+
+ lr_scheduler, num_epochs = create_scheduler(args, optimizer)
+ if args.performance:
+ num_epochs = 1
+ start_epoch = 0
+ if args.start_epoch is not None:
+ # a specified start_epoch will always override the resume epoch
+ start_epoch = args.start_epoch
+ elif resume_epoch is not None:
+ start_epoch = resume_epoch
+ if lr_scheduler is not None and start_epoch > 0:
+ lr_scheduler.step(start_epoch)
+
+ if args.local_rank == 0:
+ _logger.info('Scheduled epochs: {}'.format(num_epochs))
+
+ train_dir = os.path.join(args.data, 'train')
+ if not os.path.exists(train_dir):
+ _logger.error('Training folder does not exist at: {}'.format(train_dir))
+ exit(1)
+ dataset_train = Dataset(train_dir)
+
+ collate_fn = None
+ mixup_fn = None
+ mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
+ if mixup_active:
+ mixup_args = dict(
+ mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
+ prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
+ label_smoothing=args.smoothing, num_classes=args.num_classes)
+ if args.prefetcher:
+ assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup)
+ collate_fn = FastCollateMixup(**mixup_args)
+ else:
+ mixup_fn = Mixup(**mixup_args)
+
+ if num_aug_splits > 1:
+ dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)
+
+ train_interpolation = args.train_interpolation
+ if args.no_aug or not train_interpolation:
+ train_interpolation = data_config['interpolation']
+ loader_train = create_loader(
+ dataset_train,
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ is_training=True,
+ use_prefetcher=args.prefetcher,
+ no_aug=args.no_aug,
+ re_prob=args.reprob,
+ # re_mode=args.remode,
+ re_count=args.recount,
+ re_split=args.resplit,
+ scale=args.scale,
+ ratio=args.ratio,
+ hflip=args.hflip,
+ vflip=args.vflip,
+ color_jitter=args.color_jitter,
+ auto_augment=args.aa,
+ num_aug_splits=num_aug_splits,
+ interpolation=train_interpolation,
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ distributed=args.distributed,
+ collate_fn=collate_fn,
+ pin_memory=args.pin_mem,
+ use_multi_epochs_loader=args.use_multi_epochs_loader
+ )
+
+ eval_dir = os.path.join(args.data, 'val')
+ if not os.path.isdir(eval_dir):
+ eval_dir = os.path.join(args.data, 'validation')
+ if not os.path.isdir(eval_dir):
+ _logger.error('Validation folder does not exist at: {}'.format(eval_dir))
+ exit(1)
+ dataset_eval = Dataset(eval_dir)
+
+ loader_eval = create_loader(
+ dataset_eval,
+ input_size=data_config['input_size'],
+ batch_size=args.validation_batch_size_multiplier * args.batch_size,
+ is_training=False,
+ use_prefetcher=args.prefetcher,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ distributed=args.distributed,
+ crop_pct=data_config['crop_pct'],
+ pin_memory=args.pin_mem,
+ )
+
+ if args.jsd:
+ assert num_aug_splits > 1 # JSD only valid with aug splits set
+ train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu()
+ elif mixup_active:
+ # smoothing is handled with mixup target transform
+ train_loss_fn = SoftTargetCrossEntropy().npu()
+ elif args.smoothing:
+ train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu()
+ else:
+ train_loss_fn = nn.CrossEntropyLoss().npu()
+ validate_loss_fn = nn.CrossEntropyLoss().npu()
+
+ eval_metric = args.eval_metric
+ best_metric = None
+ best_epoch = None
+
+ if args.eval_checkpoint: # evaluate the model
+ load_checkpoint(model, args.eval_checkpoint, args.model_ema)
+ val_metrics = validate(model, loader_eval, validate_loss_fn, args)
+ print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%")
+ return
+
+ saver = None
+ output_dir = ''
+ if args.local_rank == 0:
+ output_base = args.output if args.output else './output'
+ exp_name = '-'.join([
+ datetime.now().strftime("%Y%m%d-%H%M%S"),
+ args.model,
+ str(data_config['input_size'][-1])
+ ])
+ output_dir = get_outdir(output_base, 'train', exp_name)
+ decreasing = True if eval_metric == 'loss' else False
+ saver = CheckpointSaver(
+ model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
+ checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing)
+ with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
+ f.write(args_text)
+
+ try: # train the model
+ for epoch in range(start_epoch, num_epochs):
+ if args.distributed:
+ loader_train.sampler.set_epoch(epoch)
+
+ train_metrics = train_epoch(
+ epoch, model, loader_train, optimizer, train_loss_fn, args,
+ lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
+ amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn)
+
+ if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info("Distributing BatchNorm running means and vars")
+ distribute_bn(model, args.world_size, args.dist_bn == 'reduce')
+
+ eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast)
+
+ if model_ema is not None and not args.model_ema_force_cpu:
+ if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
+ distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
+ ema_eval_metrics = validate(
+ model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)')
+ eval_metrics = ema_eval_metrics
+
+ if lr_scheduler is not None:
+ # step LR for next epoch
+ lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])
+
+ update_summary(
+ epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
+ write_header=best_metric is None)
+
+ if saver is not None:
+ # save proper checkpoint with eval metric
+ save_metric = eval_metrics[eval_metric]
+ best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)
+
+ except KeyboardInterrupt:
+ pass
+ if best_metric is not None:
+ _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
+
+
+def train_epoch(
+ epoch, model, loader, optimizer, loss_fn, args,
+ lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress,
+ loss_scaler=None, model_ema=None, mixup_fn=None):
+ if args.mixup_off_epoch and epoch >= args.mixup_off_epoch:
+ if args.prefetcher and loader.mixup_enabled:
+ loader.mixup_enabled = False
+ elif mixup_fn is not None:
+ mixup_fn.mixup_enabled = False
+
+ second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
+ batch_time_m = AverageMeter()
+ data_time_m = AverageMeter()
+ losses_m = AverageMeter()
+ top1_m = AverageMeter()
+ top5_m = AverageMeter()
+
+ model.train()
+
+ end = time.time()
+ last_idx = len(loader) - 1
+ num_updates = epoch * len(loader)
+ epoch_fps = []
+ prof_list = []
+ for batch_idx, (input, target) in enumerate(loader):
+ last_batch = batch_idx == last_idx
+ data_time_m.update(time.time() - end)
+ if not args.prefetcher:
+ input, target = input.npu(), target.npu()
+ if mixup_fn is not None:
+ input, target = mixup_fn(input, target)
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ if batch_idx in prof_list:
+ with torch.autograd.profiler.profile(use_npu=True) as prof:
+ output = model(input)
+ loss = loss_fn(output, target)
+ if not args.distributed:
+ losses_m.update(loss.item(), input.size(0))
+
+ optimizer.zero_grad()
+ if loss_scaler is not None:
+ loss_scaler(
+ loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order)
+ else:
+ loss.backward(create_graph=second_order)
+ if args.clip_grad is not None:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
+ optimizer.step()
+ print(prof.key_averages().table(sort_by="self_cpu_time_total"))
+ prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4)))
+ sys.exit()
+
+ else:
+ with amp_autocast():
+ output = model(input)
+ loss = loss_fn(output, target)
+
+ if not args.distributed:
+ losses_m.update(loss.item(), input.size(0))
+
+ optimizer.zero_grad()
+ if loss_scaler is not None:
+ loss_scaler(
+ loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order)
+ else:
+ loss.backward(create_graph=second_order)
+ if args.clip_grad is not None:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
+ optimizer.step()
+
+ torch.npu.synchronize()
+ if model_ema is not None:
+ model_ema.update(model)
+ num_updates += 1
+
+ batch_time_m.update(time.time() - end)
+
+ if last_batch or batch_idx % args.log_interval == 0:
+ lrl = [param_group['lr'] for param_group in optimizer.param_groups]
+ lr = sum(lrl) / len(lrl)
+
+ if args.distributed:
+ reduced_loss = reduce_tensor(loss.data, args.world_size)
+ losses_m.update(reduced_loss.item(), input.size(0))
+
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info(
+ 'Train: {} [{:>4d}/{} ({:>3.0f}%)] '
+ 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
+ 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
+ '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+ 'LR: {lr:.3e} '
+ 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
+ epoch,
+ batch_idx, len(loader),
+ 100. * batch_idx / last_idx,
+ loss=losses_m,
+ batch_time=batch_time_m,
+ rate=input.size(0) * args.world_size / batch_time_m.val,
+ rate_avg=input.size(0) * args.world_size / batch_time_m.avg,
+ lr=lr,
+ data_time=data_time_m))
+
+ if args.save_images and output_dir:
+ torchvision.utils.save_image(
+ input,
+ os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx),
+ padding=0,
+ normalize=True)
+
+ if saver is not None and args.recovery_interval and (
+ last_batch or (batch_idx + 1) % args.recovery_interval == 0):
+ saver.save_recovery(epoch, batch_idx=batch_idx)
+
+ if lr_scheduler is not None:
+ lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg)
+
+ epoch_fps.append(input.shape[0] * args.workers / (time.time() - end))
+ end = time.time()
+ # end for
+
+ if hasattr(optimizer, 'sync_lookahead'):
+ optimizer.sync_lookahead()
+
+ print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:])))
+ return OrderedDict([('loss', losses_m.avg)])
+
+
+def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''):
+ batch_time_m = AverageMeter()
+ losses_m = AverageMeter()
+ top1_m = AverageMeter()
+ top5_m = AverageMeter()
+
+ model.eval()
+
+ end = time.time()
+ last_idx = len(loader) - 1
+ with torch.no_grad():
+ for batch_idx, (input, target) in enumerate(loader):
+ last_batch = batch_idx == last_idx
+ if not args.prefetcher:
+ input = input.npu()
+ target = target.npu()
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ with amp_autocast():
+ output = model(input)
+ if isinstance(output, (tuple, list)):
+ output = output[0]
+
+ # augmentation reduction
+ reduce_factor = args.tta
+ if reduce_factor > 1:
+ output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2)
+ target = target[0:target.size(0):reduce_factor]
+
+ loss = loss_fn(output, target)
+ acc1, acc5 = accuracy(output, target, topk=(1, 5))
+
+ if args.distributed:
+ reduced_loss = reduce_tensor(loss.data, args.world_size)
+ acc1 = reduce_tensor(acc1, args.world_size)
+ acc5 = reduce_tensor(acc5, args.world_size)
+ else:
+ reduced_loss = loss.data
+
+ torch.npu.synchronize()
+
+ losses_m.update(reduced_loss.item(), input.size(0))
+ top1_m.update(acc1.item(), output.size(0))
+ top5_m.update(acc5.item(), output.size(0))
+
+ batch_time_m.update(time.time() - end)
+ end = time.time()
+ if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0):
+ log_name = 'Test' + log_suffix
+ _logger.info(
+ '{0}: [{1:>4d}/{2}] '
+ 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
+ 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+ 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
+ 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format(
+ log_name, batch_idx, last_idx, batch_time=batch_time_m,
+ loss=losses_m, top1=top1_m, top5=top5_m))
+
+ metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)])
+
+ return metrics
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
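Note on the distributed metrics above: train_epoch() and validate() average loss and accuracy across ranks through the reduce_tensor helper before updating the meters. A minimal sketch of the all-reduce averaging such a helper is assumed to perform (it requires an initialized torch.distributed process group; the function name here is illustrative only):

    import torch
    import torch.distributed as dist

    def reduce_tensor_sketch(tensor: torch.Tensor, world_size: int) -> torch.Tensor:
        # Sum the metric over all ranks, then divide so every rank holds the mean.
        rt = tensor.clone()
        dist.all_reduce(rt, op=dist.ReduceOp.SUM)
        rt /= world_size
        return rt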
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py
new file mode 100644
index 0000000000..401aa8e586
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py
@@ -0,0 +1,41 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class AverageMeter:
+ """Computes and stores the average and current value"""
+ def __init__(self):
+ self.reset()
+
+ def reset(self):
+ self.val = 0
+ self.avg = 0
+ self.sum = 0
+ self.count = 0
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+
+def t2taccuracy(output, target, topk=(1,)):
+ """Computes the accuracy over the k top predictions for the specified values of k"""
+ maxk = max(topk)
+ batch_size = target.size(0)
+ _, pred = output.topk(maxk, 1, True, True)
+ pred = pred.t()
+ correct = pred.eq(target.reshape(1, -1).expand_as(pred))
+    return [correct[:k].reshape(-1).float().sum(0) * 100. / batch_size for k in topk]
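A minimal usage sketch for the helpers above (the random tensors and the `from metrics import ...` path are illustrative assumptions; run it from the T2T-ViT directory):

    import torch
    from metrics import AverageMeter, t2taccuracy

    logits = torch.randn(8, 1000)           # a batch of 8 predictions over 1000 classes
    target = torch.randint(0, 1000, (8,))   # ground-truth class indices

    acc1, acc5 = t2taccuracy(logits, target, topk=(1, 5))

    top1_m = AverageMeter()
    top1_m.update(acc1.item(), n=logits.size(0))  # running average weighted by batch size
    print(top1_m.avg)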
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
new file mode 100644
index 0000000000..0b49b4fb26
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
@@ -0,0 +1,3 @@
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
new file mode 100644
index 0000000000..a2f9cf0db1
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
@@ -0,0 +1,255 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections import defaultdict
+
+import torch
+from torch.optim.optimizer import Optimizer
+
+from apex.contrib.combine_tensors import combine_npu
+
+
+class NpuFusedAdamW(Optimizer):
+ """Implements AdamW algorithm.
+
+ Currently NPU-only. Requires Apex to be installed via
+ ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``.
+
+    This version of NPU fused AdamW implements one fusion.
+
+ * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters
+ into one or a few kernel launches.
+
+ :class:`apex.optimizers.NpuFusedAdamW` may be used as a drop-in replacement for ``torch.optim.AdamW``::
+
+ opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+ ...
+ opt.step()
+
+    :class:`NpuFusedAdamW` should be used with Amp. Currently, if you wish to use :class:`NpuFusedAdamW`
+    with Amp, only ``opt_level`` ``O1`` and ``O2`` can be chosen::
+
+ opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+ model, opt = amp.initialize(model, opt, opt_level="O2")
+ ...
+ opt.step()
+
+
+ The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
+ The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
+
+ Arguments:
+ params (iterable): iterable of parameters to optimize or dicts defining
+ parameter groups
+ lr (float, optional, default: 1e-3): learning rate
+ betas (Tuple[float, float], optional, default: (0.9, 0.999)): coefficients used
+ for computing running averages of gradient and its square
+ eps (float, optional, default: 1e-8): term added to the denominator to improve
+ numerical stability
+ weight_decay (float, optional, default: 1e-2): weight decay coefficient
+ amsgrad (boolean, optional, default: False): whether to use the AMSGrad variant of
+ this algorithm from the paper `On the Convergence of Adam and Beyond`_
+
+ .. _Adam\: A Method for Stochastic Optimization:
+ https://arxiv.org/abs/1412.6980
+ .. _Decoupled Weight Decay Regularization:
+ https://arxiv.org/abs/1711.05101
+ .. _On the Convergence of Adam and Beyond:
+ https://openreview.net/forum?id=ryQu7f-RZ
+ """
+
+ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
+ weight_decay=1e-2, amsgrad=False):
+ if lr < 0.0:
+ raise ValueError("Invalid learning rate: {}".format(lr))
+ if eps < 0.0:
+ raise ValueError("Invalid epsilon value: {}".format(eps))
+ if betas[0] < 0.0 or betas[0] >= 1.0:
+ raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+ if betas[1] < 0.0 or betas[1] >= 1.0:
+ raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+ if weight_decay < 0.0:
+ raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+ defaults = dict(lr=lr, betas=betas, eps=eps,
+ weight_decay=weight_decay, amsgrad=amsgrad)
+ self.is_npu_fused_optimizer = True
+ super(NpuFusedAdamW, self).__init__(params, defaults)
+
+ def __setstate__(self, state):
+ super(NpuFusedAdamW, self).__setstate__(state)
+ for group in self.param_groups:
+ group.setdefault('amsgrad', False)
+
+ def _init_param_state(self, p, amsgrad):
+ state = self.state[p]
+ # State initialization
+ if len(state) == 0:
+ state['step'] = 0
+ # Exponential moving average of gradient values
+ state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+ # Exponential moving average of squared gradient values
+ state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+ if amsgrad:
+ # Maintains max of all exp. moving avg. of sq. grad. values
+ state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+ else:
+ exp_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format)
+ exp_avg_tmp.copy_(state['exp_avg'])
+ state['exp_avg'] = exp_avg_tmp
+
+ exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format)
+ exp_avg_sq_tmp.copy_(state['exp_avg_sq'])
+ state['exp_avg_sq'] = exp_avg_sq_tmp
+
+ if amsgrad:
+ max_exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format)
+ max_exp_avg_sq_tmp.copy_(state['max_exp_avg_sq'])
+ state['max_exp_avg_sq'] = max_exp_avg_sq_tmp
+
+ def _combine_group_param_states(self, group_index):
+ group = self.param_groups[group_index]
+ stash = self._amp_stash
+ group_params_list = stash.params_lists_indexed_by_group[group_index]
+
+ amsgrad = group['amsgrad']
+
+ combined_param_states = []
+ for params in group_params_list:
+ step_list = []
+ exp_avg_list = []
+ exp_avg_sq_list = []
+ max_exp_avg_sq_list = []
+
+ for p in params:
+ if p.grad is None:
+ continue
+ grad = p.grad
+ if grad.is_sparse:
+ raise RuntimeError('NpuFusedAdamW does not support sparse gradients, '
+ 'please consider SparseAdam instead')
+
+ self._init_param_state(p, amsgrad)
+ state = self.state[p]
+ step_list.append(state['step'])
+ exp_avg_list.append(state['exp_avg'])
+ exp_avg_sq_list.append(state['exp_avg_sq'])
+ if amsgrad:
+ max_exp_avg_sq_list.append(state['max_exp_avg_sq'])
+
+ combined_step = 0
+ combined_exp_avg = None
+ combined_exp_avg_sq = None
+ combined_max_exp_avg_sq = None
+
+ if len(exp_avg_list) > 0:
+ combined_step = step_list[0]
+ combined_exp_avg = combine_npu(exp_avg_list)
+ combined_exp_avg_sq = combine_npu(exp_avg_sq_list)
+ combined_max_exp_avg_sq = combine_npu(max_exp_avg_sq_list)
+
+ combined_state = defaultdict(dict)
+ combined_state['step'] = combined_step
+ combined_state['exp_avg'] = combined_exp_avg
+ combined_state['exp_avg_sq'] = combined_exp_avg_sq
+ combined_state['max_exp_avg_sq'] = combined_max_exp_avg_sq
+ combined_param_states.append(combined_state)
+ stash.combined_param_states_indexed_by_group[group_index] = combined_param_states
+
+ def _combine_param_states_by_group(self):
+ stash = self._amp_stash
+ if stash.param_states_are_combined_by_group:
+ return
+
+ stash.combined_param_states_indexed_by_group = []
+ for _ in self.param_groups:
+ stash.combined_param_states_indexed_by_group.append([])
+
+ for i, _ in enumerate(self.param_groups):
+ self._combine_group_param_states(i)
+ stash.param_states_are_combined_by_group = True
+
+ def _group_step(self, group_index):
+ group = self.param_groups[group_index]
+ for p in group['params']:
+ if p.grad is None:
+ continue
+
+ grad = p.grad
+ if grad.is_sparse:
+ raise RuntimeError('NpuFusedAdamW does not support sparse gradients, '
+ 'please consider SparseAdam instead')
+ state_p = self.state[p]
+ state_p['step'] += 1
+
+ amsgrad = group['amsgrad']
+ beta1, beta2 = group['betas']
+
+ stash = self._amp_stash
+ combined_group_params = stash.combined_params_indexed_by_group[group_index]
+ combined_group_grads = stash.combined_grads_indexed_by_group[group_index]
+ combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index]
+
+ for combined_param, combined_grad, combined_param_state in zip(combined_group_params,
+ combined_group_grads,
+ combined_group_param_states):
+ if combined_param is None or combined_grad is None:
+ continue
+
+ # Perform stepweight decay. The fused method is used here to speed up the calculation
+ combined_param.mul_(1 - group['lr'] * group['weight_decay'])
+
+ exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq']
+ if amsgrad:
+ max_exp_avg_sq = combined_param_state['max_exp_avg_sq']
+
+ combined_param_state['step'] += 1
+ bias_correction1 = 1 - beta1 ** combined_param_state['step']
+ bias_correction2 = 1 - beta2 ** combined_param_state['step']
+
+ # Decay the first and second moment running average coefficient
+ exp_avg.mul_(beta1).add_(combined_grad, alpha=1 - beta1)
+ exp_avg_sq.mul_(beta2).addcmul_(combined_grad, combined_grad, value=1 - beta2)
+ if amsgrad:
+ # Maintains the maximum of all 2nd moment running avg. till now
+ torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+ # Use the max. for normalizing running avg. of gradient
+ denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
+ else:
+ denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
+
+ step_size = group['lr'] / bias_correction1
+
+ combined_param.addcdiv_(exp_avg, denom, value=-step_size)
+
+ @torch.no_grad()
+ def step(self, closure=None):
+ if not hasattr(self, "_amp_stash"):
+ raise RuntimeError('apex.optimizers.NpuFusedAdamW should be used with AMP.')
+
+ self._check_already_combined_params_and_grads()
+ # combine params and grads first
+ self._combine_params_and_grads_by_group()
+ # then combine param states
+ self._combine_param_states_by_group()
+
+ loss = None
+ if closure is not None:
+ with torch.enable_grad():
+ loss = closure()
+
+ for i, _ in enumerate(self.param_groups):
+ self._group_step(i)
+
+ return loss
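For reference, the update that _group_step above applies to each combined tensor is the standard decoupled-weight-decay AdamW rule. A plain per-tensor PyTorch sketch of one step (tensor combining and AMSGrad omitted; `step` is the 1-based update count and the function name is illustrative only):

    import math
    import torch

    def adamw_step(param, grad, exp_avg, exp_avg_sq, step,
                   lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2):
        beta1, beta2 = betas
        param.mul_(1 - lr * weight_decay)                             # decoupled weight decay
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)               # first moment estimate
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # second moment estimate
        bias_correction1 = 1 - beta1 ** step
        bias_correction2 = 1 - beta2 ** step
        denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)
        param.addcdiv_(exp_avg, denom, value=-lr / bias_correction1)
        return param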
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt
new file mode 100644
index 0000000000..f6f0f6fd77
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt
@@ -0,0 +1,4 @@
+torch==1.5.0
+apex==0.1
+torchvision==0.6.0
+timm==0.3.4
\ No newline at end of file
--
Gitee
From db61d27560e84f08317d292f129ad058b8e42848 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:47:54 +0000
Subject: [PATCH 06/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../cv/classification/T2T-ViT/test/env_npu.sh | 68 +++++++++
.../T2T-ViT/test/train_full_1p.sh | 129 +++++++++++++++++
.../T2T-ViT/test/train_full_8p.sh | 130 +++++++++++++++++
.../T2T-ViT/test/train_performance_1p.sh | 131 ++++++++++++++++++
.../T2T-ViT/test/train_performance_8p.sh | 130 +++++++++++++++++
5 files changed, 588 insertions(+)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh
new file mode 100644
index 0000000000..bd4205d15d
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f $CANN_INSTALL_PATH_CONF ]; then
+ CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+ CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+ source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+ source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+# Output host-side logs to stdout, 0-off/1-on
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+# Default log level, 0-debug/1-info/2-warning/3-error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+# Enable event logging, 0-off/1-on
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+# Enable the task queue, 0-off/1-on
+export TASK_QUEUE_ENABLE=1
+# Enable PTCopy, 0-off/1-on
+export PTCOPY_ENABLE=1
+# Enable the combined flag, 0-off/1-on
+export COMBINED_ENABLE=1
+# Recompilation switch for special scenarios, no need to modify
+export TRI_COMBINED_ENABLE=1
+# Recompilation switch for special scenarios, no need to modify
+export DYNAMIC_OP="ADD#MUL"
+# HCCL whitelist switch, 1-off/0-on
+export HCCL_WHITELIST_DISABLE=1
+
+# Set device-side log level to error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+# Disable device-side event logs
+msnpureport -e disable
+
+ulimit -SHn 512000
+
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+ match_sit = re.search('-packages', sys.path[index])
+ if match_sit is not None:
+ match_lib = re.search('lib', sys.path[index])
+
+ if match_lib is not None:
+ end=match_lib.span()[1]
+ result += sys.path[index][0:end] + ':'
+
+ result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
new file mode 100644
index 0000000000..09dd270d72
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+################ Basic configuration parameters, review and modify per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path, keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+# Argument check: data_path is required; adding or removing other arguments is up to the model, and any new argument must be defined and assigned above
+for para in $*
+do
+ if [[ $para == --data_path* ]];then
+ data_path=`echo ${para#*=}`
+ fi
+done
+
+# Check that data_path was passed in, no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+ exit 1
+fi
+
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder before running the script, for better compatibility; test_path_dir is the path that contains the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+ test_path_dir=${cur_path}
+ cd ..
+ cur_path=`pwd`
+else
+ test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory, no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+ mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time, no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+ source ${test_path_dir}/env_npu.sh
+ #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+ ${data_path} \
+ --model t2t_vit_14 \
+ --batch-size 64 \
+ --lr 5e-4 \
+ --weight-decay .05 \
+ -j 1 \
+ --amp \
+ --img-size 224 \
+ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time, no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results, no need to modify
+echo "------------------ Final result ------------------"
+# Output performance (FPS), review and modify per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print, no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy, review and modify per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print, no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information, no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data, no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration, no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log, no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
new file mode 100644
index 0000000000..d7ef995ea9
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters, review and modify per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path, keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Argument check: data_path is required; adding or removing other arguments is up to the model, and any new argument must be defined and assigned above
+for para in $*
+do
+ if [[ $para == --data_path* ]];then
+ data_path=`echo ${para#*=}`
+ fi
+done
+
+# Check that data_path was passed in, no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+ exit 1
+fi
+
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder before running the script, for better compatibility; test_path_dir is the path that contains the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+ test_path_dir=${cur_path}
+ cd ..
+ cur_path=`pwd`
+else
+ test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory, no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time, no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+ source ${test_path_dir}/env_npu.sh
+ #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+ ${data_path} \
+ --model t2t_vit_14 \
+ --batch-size 64 \
+ --lr 5e-4 \
+ --weight-decay .05 \
+ --amp \
+ --img-size 224 \
+ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time, no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results, no need to modify
+echo "------------------ Final result ------------------"
+# Output performance (FPS), review and modify per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print, no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy, review and modify per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print, no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information, no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data, no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration, no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log, no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
new file mode 100644
index 0000000000..6c1828793e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+################ Basic configuration parameters, review and modify per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path, keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Argument check: data_path is required; adding or removing other arguments is up to the model, and any new argument must be defined and assigned above
+for para in $*
+do
+ if [[ $para == --data_path* ]];then
+ data_path=`echo ${para#*=}`
+ fi
+done
+
+# Check that data_path was passed in, no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+ exit 1
+fi
+
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder before running the script, for better compatibility; test_path_dir is the path that contains the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+ test_path_dir=${cur_path}
+ cd ..
+ cur_path=`pwd`
+else
+ test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory, no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+ mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time, no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+ source ${test_path_dir}/env_npu.sh
+ #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+ ${data_path} \
+ --model t2t_vit_14 \
+ --batch-size 64 \
+ --lr 5e-4 \
+ --weight-decay .05 \
+ -j 1 \
+ --amp \
+ --img-size 224 \
+ --epochs 1 \
+ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time, no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results, no need to modify
+echo "------------------ Final result ------------------"
+# Output performance (FPS), review and modify per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print, no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy, review and modify per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print, no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information, no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data, no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration, no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log, no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
new file mode 100644
index 0000000000..f024d6cc71
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters, review and modify per model ##################
+# Required fields (must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path, keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Argument check: data_path is required; adding or removing other arguments is up to the model, and any new argument must be defined and assigned above
+for para in $*
+do
+ if [[ $para == --data_path* ]];then
+ data_path=`echo ${para#*=}`
+ fi
+done
+
+# Check that data_path was passed in, no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+ exit 1
+fi
+
+
+############### Set the execution path of the training script ###############
+# cd to the directory at the same level as the test folder before running the script, for better compatibility; test_path_dir is the path that contains the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+ test_path_dir=${cur_path}
+ cd ..
+ cur_path=`pwd`
+else
+ test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory, no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+ rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+ mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time, no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+ source ${test_path_dir}/env_npu.sh
+ #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+ ${data_path} \
+ --model t2t_vit_14 \
+ --batch-size 64 \
+ --lr 5e-4 \
+ --weight-decay .05 \
+ --amp \
+ --img-size 224 \
+ --epochs 5 \
+ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time, no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results, no need to modify
+echo "------------------ Final result ------------------"
+# Output performance (FPS), review and modify per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print, no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy, review and modify per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print, no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information, no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data, no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt, review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration, no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log, no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
--
Gitee
From c28a1f6efe660b5d0a09382261a5a16fa6c357b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:00 +0000
Subject: [PATCH 07/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?=
=?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/test/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
deleted file mode 100644
index e69de29bb2..0000000000
--
Gitee
From 6113ed35721c8283025840060d8f2ed8ee5f6255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:15 +0000
Subject: [PATCH 08/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../classification/T2T-ViT/models/__init__.py | 17 +
.../classification/T2T-ViT/models/t2t_vit.py | 304 ++++++++
.../T2T-ViT/models/t2t_vit_dense.py | 177 +++++
.../T2T-ViT/models/t2t_vit_ghost.py | 204 ++++++
.../T2T-ViT/models/t2t_vit_se.py | 176 +++++
.../T2T-ViT/models/token_performer.py | 73 ++
.../T2T-ViT/models/token_transformer.py | 68 ++
.../T2T-ViT/models/transformer_block.py | 96 +++
.../cv/classification/T2T-ViT/models/vit.py | 674 ++++++++++++++++++
9 files changed, 1789 insertions(+)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
new file mode 100644
index 0000000000..007f383c19
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .t2t_vit import *
+from .t2t_vit_se import *
+from .t2t_vit_dense import *
+from .t2t_vit_ghost import *
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py
new file mode 100644
index 0000000000..db04ed4ee0
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py
@@ -0,0 +1,304 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT
+"""
+import torch
+import torch.nn as nn
+
+from timm.models.helpers import load_pretrained
+from timm.models.registry import register_model
+from timm.models.layers import trunc_normal_
+import numpy as np
+from .token_transformer import Token_transformer
+from .token_performer import Token_performer
+from .transformer_block import Block, get_sinusoid_encoding
+
+def _cfg(url='', **kwargs):
+ return {
+ 'url': url,
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+ 'crop_pct': .9, 'interpolation': 'bicubic',
+ 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225),
+ 'classifier': 'head',
+ **kwargs
+ }
+
+default_cfgs = {
+ 'T2t_vit_7': _cfg(),
+ 'T2t_vit_10': _cfg(),
+ 'T2t_vit_12': _cfg(),
+ 'T2t_vit_14': _cfg(),
+ 'T2t_vit_19': _cfg(),
+ 'T2t_vit_24': _cfg(),
+ 'T2t_vit_t_14': _cfg(),
+ 'T2t_vit_t_19': _cfg(),
+ 'T2t_vit_t_24': _cfg(),
+ 'T2t_vit_14_resnext': _cfg(),
+ 'T2t_vit_14_wide': _cfg(),
+}
+
+class T2T_module(nn.Module):
+ """
+ Tokens-to-Token encoding module
+ """
+ def __init__(self, img_size=224, tokens_type='performer', in_chans=3, embed_dim=768, token_dim=64):
+ super().__init__()
+
+ if tokens_type == 'transformer':
+ print('adopt transformer encoder for tokens-to-token')
+ self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+ self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+
+ self.attention1 = Token_transformer(dim=in_chans * 7 * 7, in_dim=token_dim, num_heads=1, mlp_ratio=1.0)
+ self.attention2 = Token_transformer(dim=token_dim * 3 * 3, in_dim=token_dim, num_heads=1, mlp_ratio=1.0)
+ self.project = nn.Linear(token_dim * 3 * 3, embed_dim)
+
+ elif tokens_type == 'performer':
+ print('adopt performer encoder for tokens-to-token')
+ self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
+ self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+ self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
+
+ #self.attention1 = Token_performer(dim=token_dim, in_dim=in_chans*7*7, kernel_ratio=0.5)
+ #self.attention2 = Token_performer(dim=token_dim, in_dim=token_dim*3*3, kernel_ratio=0.5)
+ self.attention1 = Token_performer(dim=in_chans*7*7, in_dim=token_dim, kernel_ratio=0.5)
+ self.attention2 = Token_performer(dim=token_dim*3*3, in_dim=token_dim, kernel_ratio=0.5)
+ self.project = nn.Linear(token_dim * 3 * 3, embed_dim)
+
+        elif tokens_type == 'convolution': # just for comparison with convolution, not our model
+ # for this tokens type, you need change forward as three convolution operation
+ print('adopt convolution layers for tokens-to-token')
+ self.soft_split0 = nn.Conv2d(3, token_dim, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) # the 1st convolution
+ self.soft_split1 = nn.Conv2d(token_dim, token_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 2nd convolution
+ self.project = nn.Conv2d(token_dim, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 3rd convolution
+
+        self.num_patches = (img_size // (4 * 2 * 2)) * (img_size // (4 * 2 * 2))  # there are 3 soft splits with strides 4, 2, 2 respectively
+
+ def forward(self, x):
+ # step0: soft split
+ x = self.soft_split0(x).transpose(1, 2)
+
+ # iteration1: re-structurization/reconstruction
+ x = self.attention1(x)
+ B, new_HW, C = x.shape
+ x = x.transpose(1,2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW)))
+ # iteration1: soft split
+ x = self.soft_split1(x).transpose(1, 2)
+
+ # iteration2: re-structurization/reconstruction
+ x = self.attention2(x)
+ B, new_HW, C = x.shape
+ x = x.transpose(1, 2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW)))
+ # iteration2: soft split
+ x = self.soft_split2(x).transpose(1, 2)
+
+ # final tokens
+ x = self.project(x)
+
+ return x
+
+class T2T_ViT(nn.Module):
+ def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm, token_dim=64):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.tokens_to_token = T2T_module(
+ img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim, token_dim=token_dim)
+ num_patches = self.tokens_to_token.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+
+ # Classifier head
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.tokens_to_token(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x[:, 0]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+@register_model
+def t2t_vit_7(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 256 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=7, num_heads=4, mlp_ratio=2., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_7']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_10(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 256 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=10, num_heads=4, mlp_ratio=2., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_10']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_12(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 256 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=12, num_heads=4, mlp_ratio=2., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_12']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+
+@register_model
+def t2t_vit_14(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 384 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_14']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_19(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 448 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_19']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_24(pretrained=False, **kwargs): # adopt performer for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 512 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_24']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_t_14(pretrained=False, **kwargs): # adopt transformers for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 384 ** -0.5)
+ model = T2T_ViT(tokens_type='transformer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_t_14']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_t_19(pretrained=False, **kwargs): # adopt transformers for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 448 ** -0.5)
+ model = T2T_ViT(tokens_type='transformer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_t_19']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_t_24(pretrained=False, **kwargs): # adopt transformers for tokens to token
+ if pretrained:
+ kwargs.setdefault('qk_scale', 512 ** -0.5)
+ model = T2T_ViT(tokens_type='transformer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_t_24']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+# ResNeXt-style (many heads) and wide variants
+@register_model
+def t2t_vit_14_resnext(pretrained=False, **kwargs):
+ if pretrained:
+ kwargs.setdefault('qk_scale', 384 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=32, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_14_resnext']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
+
+@register_model
+def t2t_vit_14_wide(pretrained=False, **kwargs):
+ if pretrained:
+ kwargs.setdefault('qk_scale', 512 ** -0.5)
+ model = T2T_ViT(tokens_type='performer', embed_dim=768, depth=4, num_heads=12, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_14_wide']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
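+
+
+# Minimal smoke test (an illustrative sketch, not part of the original T2T-ViT code).
+# Each factory above is registered with timm via @register_model, so the same variants
+# can also be built with timm.create_model('t2t_vit_7') once this module is imported.
+if __name__ == '__main__':  # e.g. `python -m models.t2t_vit` so the package imports resolve
+    net = t2t_vit_7(num_classes=1000)
+    logits = net(torch.randn(1, 3, 224, 224))
+    print(logits.shape)  # expected: torch.Size([1, 1000])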
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py
new file mode 100644
index 0000000000..5724570c39
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py
@@ -0,0 +1,177 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT-Dense
+"""
+import torch
+import torch.nn as nn
+
+from timm.models.helpers import load_pretrained
+from timm.models.layers import DropPath, trunc_normal_
+from timm.models.registry import register_model
+
+from .transformer_block import Mlp, Block, get_sinusoid_encoding
+from .t2t_vit import T2T_module, _cfg
+
+default_cfgs = {
+ 't2t_vit_dense': _cfg(),
+}
+
+class Transition(nn.Module):
+ def __init__(self, in_features, out_features, act_layer=nn.GELU):
+ super(Transition, self).__init__()
+ self.act = act_layer()
+ self.linear = nn.Linear(in_features, out_features)
+ def forward(self, x):
+ x = self.linear(x)
+ x = self.act(x)
+
+ return x
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, growth_rate, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) #, out_features=growth_rate
+ self.dense_linear = nn.Linear(dim, growth_rate)
+
+ def forward(self, x):
+ new_x = x + self.drop_path(self.attn(self.norm1(x)))
+ new_x = new_x + self.drop_path(self.mlp(self.norm2(new_x)))
+ new_x = self.dense_linear(new_x)
+        x = torch.cat([x, new_x], 2)  # dense connection: concatenate the old features with the new features along the channel dimension
+ return x
+
+class T2T_ViT_Dense(nn.Module):
+ def __init__(self, growth_rate=32, tokens_type='performer', block_config=(3, 4, 6, 3), img_size=224, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.tokens_to_token = T2T_module(
+ img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.tokens_to_token.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.ModuleList()
+
+ start_dim = embed_dim
+ for i, num_layers in enumerate(block_config):
+ for j in range(num_layers):
+ new_dim = start_dim + j * growth_rate
+ block = Block(growth_rate=growth_rate, dim=new_dim, num_heads=num_heads, mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ self.blocks.append(block)
+ if i != len(block_config)-1:
+ transition = Transition(new_dim+growth_rate, (new_dim+growth_rate)//2)
+ self.blocks.append(transition)
+ start_dim = int((new_dim+growth_rate)//2)
+ out_dim = new_dim + growth_rate
+ print(f'end dim:{out_dim}')
+ self.norm = norm_layer(out_dim)
+
+ # Classifier head
+ self.head = nn.Linear(out_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.tokens_to_token(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
+ x = torch.cat((cls_tokens, x), dim=1)
+        x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x[:, 0]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+@register_model
+def t2t_vit_dense(pretrained=False, **kwargs):
+ model = T2T_ViT_Dense(growth_rate=64, block_config=(3, 6, 6, 4), embed_dim=128, num_heads=8, mlp_ratio=2., **kwargs)
+ model.default_cfg = default_cfgs['t2t_vit_dense']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
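+
+
+# Width bookkeeping for the default t2t_vit_dense config above (an illustrative
+# walk-through of the constructor's rule, not new functionality): every Block appends
+# growth_rate channels and a Transition halves the width between stages, so with
+# embed_dim=128, growth_rate=64 and block_config=(3, 6, 6, 4):
+#   stage 1: 128 -> 192 -> 256, +64 = 320, transition -> 160
+#   stage 2: 160 -> ... -> 480, +64 = 544, transition -> 272
+#   stage 3: 272 -> ... -> 592, +64 = 656, transition -> 328
+#   stage 4: 328 -> ... -> 520, +64 = 584  (out_dim used by self.norm and self.head)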
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py
new file mode 100644
index 0000000000..217dad772b
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py
@@ -0,0 +1,204 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT-Ghost
+"""
+import torch
+import torch.nn as nn
+
+from timm.models.helpers import load_pretrained
+from timm.models.layers import DropPath, trunc_normal_
+from timm.models.registry import register_model
+
+from .transformer_block import Block, get_sinusoid_encoding
+from .t2t_vit import T2T_module, _cfg
+
+
+default_cfgs = {
+ 'T2t_vit_16_ghost': _cfg(),
+}
+
+class Mlp_ghost(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, in_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+ self.ratio = hidden_features//in_features
+ self.cheap_operation2 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False)
+ self.cheap_operation3 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False)
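+        # NOTE: forward() concatenates x1, x2 and x3 into 3*in_features channels before
+        # fc2, so this block assumes hidden_features == 3 * in_features (i.e. mlp_ratio == 3,
+        # as used by t2t_vit_16_ghost below).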
+
+ def forward(self, x): # x: [B, N, C]
+ x1 = self.fc1(x) # x1: [B, N, C]
+ x1 = self.act(x1)
+
+ x2 = self.cheap_operation2(x1.transpose(1,2)) # x2: [B, N, C]
+ x2 = x2.transpose(1,2)
+ x2 = self.act(x2)
+
+ x3 = self.cheap_operation3(x1.transpose(1, 2)) # x3: [B, N, C]
+ x3 = x3.transpose(1, 2)
+ x3 = self.act(x3)
+
+ x = torch.cat((x1, x2, x3), dim=2) # x: [B, N, 3C]
+ x = self.drop(x)
+
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+class Attention_ghost(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+ half_dim = int(0.5*dim)
+ self.q = nn.Linear(dim, half_dim, bias=qkv_bias)
+ self.k = nn.Linear(dim, half_dim, bias=qkv_bias)
+ self.v = nn.Linear(dim, half_dim, bias=qkv_bias)
+
+ self.cheap_operation_q = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False)
+ self.cheap_operation_k = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False)
+ self.cheap_operation_v = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False)
+
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ q = self.q(x)
+ k = self.k(x)
+ v = self.v(x)
+
+ q1 = self.cheap_operation_q(q.transpose(1,2)).transpose(1,2)
+ k1 = self.cheap_operation_k(k.transpose(1,2)).transpose(1,2)
+ v1 = self.cheap_operation_v(v.transpose(1,2)).transpose(1,2)
+
+ q = torch.cat((q, q1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+ k = torch.cat((k, k1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+ v = torch.cat((v, v1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention_ghost(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp_ghost(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+class T2T_ViT_Ghost(nn.Module):
+ def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.tokens_to_token = T2T_module(
+ img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.tokens_to_token.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+
+ # Classifier head
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.tokens_to_token(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x[:, 0]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+
+@register_model
+def t2t_vit_16_ghost(pretrained=False, **kwargs):
+ if pretrained:
+ kwargs.setdefault('qk_scale', 384 ** -0.5)
+ model = T2T_ViT_Ghost(tokens_type='performer', embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_16_ghost']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py
new file mode 100644
index 0000000000..b43a86e39e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py
@@ -0,0 +1,176 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT-SE
+"""
+import torch
+import torch.nn as nn
+
+from timm.models.helpers import load_pretrained
+from timm.models.layers import DropPath, trunc_normal_
+from timm.models.registry import register_model
+from .transformer_block import Block, Mlp, get_sinusoid_encoding
+from .t2t_vit import T2T_module, _cfg
+
+default_cfgs = {
+ 'T2t_vit_14_se': _cfg(),
+}
+
+class SELayer(nn.Module):
+ def __init__(self, channel, reduction=16):
+ super(SELayer, self).__init__()
+ self.avg_pool = nn.AdaptiveAvgPool1d(1)
+ self.fc = nn.Sequential(
+ nn.Linear(channel, channel // reduction, bias=False),
+ nn.ReLU(inplace=True),
+ nn.Linear(channel // reduction, channel, bias=False),
+ nn.Sigmoid()
+ )
+
+ def forward(self, x): # x: [B, N, C]
+ x = torch.transpose(x, 1, 2) # [B, C, N]
+ b, c, _ = x.size()
+ y = self.avg_pool(x).view(b, c)
+ y = self.fc(y).view(b, c, 1)
+ x = x * y.expand_as(x)
+ x = torch.transpose(x, 1, 2) # [B, N, C]
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.se_layer = SELayer(dim)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.se_layer(x)
+ x = self.proj_drop(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+class T2T_ViT_SE(nn.Module):
+ def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+ drop_path_rate=0., norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+
+ self.tokens_to_token = T2T_module(
+ img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.tokens_to_token.num_patches
+ print(num_patches)
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False)
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.ModuleList([
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+
+ # Classifier head
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ B = x.shape[0]
+ x = self.tokens_to_token(x)
+
+ cls_tokens = self.cls_token.expand(B, -1, -1)
+ x = torch.cat((cls_tokens, x), dim=1)
+ x = x + self.pos_embed
+ x = self.pos_drop(x)
+
+ for blk in self.blocks:
+ x = blk(x)
+
+ x = self.norm(x)
+ return x[:, 0]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ x = self.head(x)
+ return x
+
+@register_model
+def t2t_vit_14_se(pretrained=False, **kwargs):
+ if pretrained:
+ kwargs.setdefault('qk_scale', 384 ** -0.5)
+ model = T2T_ViT_SE(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+ model.default_cfg = default_cfgs['T2t_vit_14_se']
+ if pretrained:
+ load_pretrained(
+ model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+ return model
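+
+
+# Minimal shape check for the SE branch (an illustrative sketch, not part of the original
+# file): SELayer averages each channel over the tokens, computes per-channel gates and
+# rescales the channels, leaving the token layout unchanged.
+if __name__ == '__main__':
+    se = SELayer(channel=384)
+    print(se(torch.randn(2, 197, 384)).shape)  # expected: torch.Size([2, 197, 384])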
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py
new file mode 100644
index 0000000000..16134300a3
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py
@@ -0,0 +1,73 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Take Performer as T2T Transformer
+"""
+import math
+import torch
+import torch.nn as nn
+
+class Token_performer(nn.Module):
+ def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2 = 0.1):
+ super().__init__()
+        self.emb = in_dim * head_cnt  # head_cnt is 1 here, so no extra multi-head handling is needed
+ self.kqv = nn.Linear(dim, 3 * self.emb)
+ self.dp = nn.Dropout(dp1)
+ self.proj = nn.Linear(self.emb, self.emb)
+ self.head_cnt = head_cnt
+ self.norm1 = nn.LayerNorm(dim)
+ self.norm2 = nn.LayerNorm(self.emb)
+        self.epsilon = 1e-8  # for numerical stability in the division
+
+ self.mlp = nn.Sequential(
+ nn.Linear(self.emb, 1 * self.emb),
+ nn.GELU(),
+ nn.Linear(1 * self.emb, self.emb),
+ nn.Dropout(dp2),
+ )
+
+ self.m = int(self.emb * kernel_ratio)
+ self.w = torch.randn(self.m, self.emb)
+ self.w = nn.Parameter(nn.init.orthogonal_(self.w) * math.sqrt(self.m), requires_grad=False)
+
+ def prm_exp(self, x):
+        # part of this function is borrowed from https://github.com/lucidrains/performer-pytorch
+        # and Simo Ryu (https://github.com/cloneofsimo)
+        # ==== positive random features for gaussian kernels ====
+        # x = (B, T, hs)
+        # w = (m, hs)
+        # return : x : (B, T, m)
+        # SM(x, y) = E_w[exp(w^T x - ||x||^2/2) exp(w^T y - ||y||^2/2)]
+        # therefore return exp(w^T x - ||x||^2/2) / sqrt(m)
+ xd = ((x * x).sum(dim=-1, keepdim=True)).repeat(1, 1, self.m) / 2
+ wtx = torch.einsum('bti,mi->btm', x.float(), self.w)
+
+ return torch.exp(wtx - xd) / math.sqrt(self.m)
+
+ def single_attn(self, x):
+ k, q, v = torch.split(self.kqv(x), self.emb, dim=-1)
+ kp, qp = self.prm_exp(k), self.prm_exp(q) # (B, T, m), (B, T, m)
+ D = torch.einsum('bti,bi->bt', qp, kp.sum(dim=1)).unsqueeze(dim=2) # (B, T, m) * (B, m) -> (B, T, 1)
+ kptv = torch.einsum('bin,bim->bnm', v.float(), kp) # (B, emb, m)
+ y = torch.einsum('bti,bni->btn', qp, kptv) / (D.repeat(1, 1, self.emb) + self.epsilon) # (B, T, emb)/Diag
+ # skip connection
+ y = v + self.dp(self.proj(y)) # same as token_transformer in T2T layer, use v as skip connection
+
+ return y
+
+ def forward(self, x):
+ x = self.single_attn(self.norm1(x))
+ x = x + self.mlp(self.norm2(x))
+ return x
+
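+
+# Minimal shape check (an illustrative sketch, not part of the original file). The dims
+# roughly follow the first T2T soft-split stage (dim = 3*7*7 = 147 channels per token,
+# in_dim = 64); with kernel_ratio=0.5 the random-feature map uses m = 32 features.
+if __name__ == '__main__':
+    tok = Token_performer(dim=147, in_dim=64)
+    out = tok(torch.randn(2, 3136, 147))  # e.g. 3136 tokens for a 224x224 input after the first soft split
+    print(out.shape)  # expected: torch.Size([2, 3136, 64])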
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py
new file mode 100644
index 0000000000..f9133a1d9f
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py
@@ -0,0 +1,68 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Take the standard Transformer as T2T Transformer
+"""
+import torch.nn as nn
+from timm.models.layers import DropPath
+from .transformer_block import Mlp
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, in_dim = None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ self.in_dim = in_dim
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, in_dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(in_dim, in_dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.in_dim).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q * self.scale) @ k.transpose(-2, -1)
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, self.in_dim)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+
+ # skip connection
+        x = v.squeeze(1) + x   # the original x has a different size from the current x, so v is used as the skip connection
+
+ return x
+
+class Token_transformer(nn.Module):
+
+ def __init__(self, dim, in_dim, num_heads, mlp_ratio=1., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, in_dim=in_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(in_dim)
+ self.mlp = Mlp(in_features=in_dim, hidden_features=int(in_dim*mlp_ratio), out_features=in_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = self.attn(self.norm1(x))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
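+
+
+# Minimal shape check (an illustrative sketch, not part of the original file): the block
+# projects dim-channel tokens down to in_dim channels. Note the qkv reshape above only
+# works when num_heads * in_dim == in_dim, i.e. num_heads == 1.
+if __name__ == '__main__':
+    import torch
+    blk = Token_transformer(dim=147, in_dim=64, num_heads=1)
+    print(blk(torch.randn(2, 196, 147)).shape)  # expected: torch.Size([2, 196, 64])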
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py
new file mode 100644
index 0000000000..0ba43c1421
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py
@@ -0,0 +1,96 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Borrowed from timm (https://github.com/rwightman/pytorch-image-models)
+"""
+import torch
+import torch.nn as nn
+import numpy as np
+from timm.models.layers import DropPath
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2]
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+def get_sinusoid_encoding(n_position, d_hid):
+ ''' Sinusoid position encoding table '''
+
+ def get_position_angle_vec(position):
+ return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
+
+ sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
+
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
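+
+
+# Quick check (illustrative, not part of the original file): the returned table has shape
+# [1, n_position, d_hid]; the T2T-ViT models above register it as a frozen pos_embed with
+# n_position = num_patches + 1 to cover the class token.
+if __name__ == '__main__':
+    pe = get_sinusoid_encoding(n_position=197, d_hid=768)
+    print(pe.shape)  # expected: torch.Size([1, 197, 768])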
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py
new file mode 100644
index 0000000000..b263a6f9e5
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py
@@ -0,0 +1,674 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The original Vision Transformer (ViT) from timm, copyright belongs to / Copyright 2020 Ross Wightman
+"""
+import math
+import logging
+from functools import partial
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from timm.models.helpers import build_model_with_cfg, overlay_external_default_cfg
+from timm.models.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_
+from timm.models.registry import register_model
+
+_logger = logging.getLogger(__name__)
+
+
+def _cfg(url='', **kwargs):
+ return {
+ 'url': url,
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+ 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+ 'first_conv': 'patch_embed.proj', 'classifier': 'head',
+ **kwargs
+ }
+
+
+default_cfgs = {
+ # patch models (my experiments)
+ 'vit_small_patch16_224': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/vit_small_p16_224-15ec54c9.pth',
+ ),
+
+ # patch models (weights ported from official Google JAX impl)
+ 'vit_base_patch16_224': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
+ ),
+ 'vit_base_patch32_224': _cfg(
+ url='', # no official model weights for this combo, only for in21k
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_base_patch16_384': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth',
+ input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
+ 'vit_base_patch32_384': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p32_384-830016f5.pth',
+ input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
+ 'vit_large_patch16_224': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth',
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_large_patch32_224': _cfg(
+ url='', # no official model weights for this combo, only for in21k
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_large_patch16_384': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth',
+ input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
+ 'vit_large_patch32_384': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth',
+ input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0),
+
+ # patch models, imagenet21k (weights ported from official Google JAX impl)
+ 'vit_base_patch16_224_in21k': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth',
+ num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_base_patch32_224_in21k': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth',
+ num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_large_patch16_224_in21k': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth',
+ num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_large_patch32_224_in21k': _cfg(
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
+ num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+ 'vit_huge_patch14_224_in21k': _cfg(
+ hf_hub='timm/vit_huge_patch14_224_in21k',
+ num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+
+ # deit models (FB weights)
+ 'vit_deit_tiny_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'),
+ 'vit_deit_small_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'),
+ 'vit_deit_base_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',),
+ 'vit_deit_base_patch16_384': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth',
+ input_size=(3, 384, 384), crop_pct=1.0),
+ 'vit_deit_tiny_distilled_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth',
+ classifier=('head', 'head_dist')),
+ 'vit_deit_small_distilled_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth',
+ classifier=('head', 'head_dist')),
+ 'vit_deit_base_distilled_patch16_224': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth',
+ classifier=('head', 'head_dist')),
+ 'vit_deit_base_distilled_patch16_384': _cfg(
+ url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth',
+ input_size=(3, 384, 384), crop_pct=1.0, classifier=('head', 'head_dist')),
+
+ # ViT ImageNet-21K-P pretraining
+ 'vit_base_patch16_224_miil_in21k': _cfg(
+ url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth',
+ mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=11221,
+ ),
+ 'vit_base_patch16_224_miil': _cfg(
+ url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm'
+ '/vit_base_patch16_224_1k_miil_84_4.pth',
+ mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear',
+ ),
+}
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class Block(nn.Module):
+
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class VisionTransformer(nn.Module):
+ """ Vision Transformer
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+ - https://arxiv.org/abs/2010.11929
+ Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
+ - https://arxiv.org/abs/2012.12877
+ """
+
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+ num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, distilled=False,
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=PatchEmbed, norm_layer=None,
+ act_layer=None, weight_init=''):
+ """
+ Args:
+ img_size (int, tuple): input image size
+ patch_size (int, tuple): patch size
+ in_chans (int): number of input channels
+ num_classes (int): number of classes for classification head
+ embed_dim (int): embedding dimension
+ depth (int): depth of transformer
+ num_heads (int): number of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+ distilled (bool): model includes a distillation token and head as in DeiT models
+ drop_rate (float): dropout rate
+ attn_drop_rate (float): attention dropout rate
+ drop_path_rate (float): stochastic depth rate
+ embed_layer (nn.Module): patch embedding layer
+ norm_layer: (nn.Module): normalization layer
+ weight_init: (str): weight init scheme
+ """
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ self.num_tokens = 2 if distilled else 1
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+ act_layer = act_layer or nn.GELU
+
+ self.patch_embed = embed_layer(
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+ num_patches = self.patch_embed.num_patches
+
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+ self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
+ self.blocks = nn.Sequential(*[
+ Block(
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer)
+ for i in range(depth)])
+ self.norm = norm_layer(embed_dim)
+
+ # Representation layer
+ if representation_size and not distilled:
+ self.num_features = representation_size
+ self.pre_logits = nn.Sequential(OrderedDict([
+ ('fc', nn.Linear(embed_dim, representation_size)),
+ ('act', nn.Tanh())
+ ]))
+ else:
+ self.pre_logits = nn.Identity()
+
+ # Classifier head(s)
+ self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+ self.head_dist = None
+ if distilled:
+ self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+
+ # Weight init
+ assert weight_init in ('jax', 'jax_nlhb', 'nlhb', '')
+ head_bias = -math.log(self.num_classes) if 'nlhb' in weight_init else 0.
+ trunc_normal_(self.pos_embed, std=.02)
+ if self.dist_token is not None:
+ trunc_normal_(self.dist_token, std=.02)
+ if weight_init.startswith('jax'):
+ # leave cls token as zeros to match jax impl
+ for n, m in self.named_modules():
+ _init_vit_weights(m, n, head_bias=head_bias, jax_impl=True)
+ else:
+ trunc_normal_(self.cls_token, std=.02)
+ self.apply(_init_vit_weights)
+
+ def _init_weights(self, m):
+        # this fn is left here for compatibility with downstream users
+ _init_vit_weights(m)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed', 'cls_token', 'dist_token'}
+
+ def get_classifier(self):
+ if self.dist_token is None:
+ return self.head
+ else:
+ return self.head, self.head_dist
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+ if self.num_tokens == 2:
+ self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ x = self.patch_embed(x)
+ cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks
+ if self.dist_token is None:
+ x = torch.cat((cls_token, x), dim=1)
+ else:
+ x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1)
+ x = self.pos_drop(x + self.pos_embed)
+ x = self.blocks(x)
+ x = self.norm(x)
+ if self.dist_token is None:
+ return self.pre_logits(x[:, 0])
+ else:
+ return x[:, 0], x[:, 1]
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ if self.head_dist is not None:
+ x, x_dist = self.head(x[0]), self.head_dist(x[1]) # x must be a tuple
+ if self.training and not torch.jit.is_scripting():
+                # during training, return both classifier predictions; at inference their average is returned below
+ return x, x_dist
+ else:
+ return (x + x_dist) / 2
+ else:
+ x = self.head(x)
+ return x
+
+
+def _init_vit_weights(m, n: str = '', head_bias: float = 0., jax_impl: bool = False):
+ """ ViT weight initialization
+ * When called without n, head_bias, jax_impl args it will behave exactly the same
+    as my original init, for compatibility with previous hyperparameters / downstream use cases (i.e. DeiT).
+ * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl
+ """
+ if isinstance(m, nn.Linear):
+ if n.startswith('head'):
+ nn.init.zeros_(m.weight)
+ nn.init.constant_(m.bias, head_bias)
+ elif n.startswith('pre_logits'):
+ lecun_normal_(m.weight)
+ nn.init.zeros_(m.bias)
+ else:
+ if jax_impl:
+ nn.init.xavier_uniform_(m.weight)
+ if m.bias is not None:
+ if 'mlp' in n:
+ nn.init.normal_(m.bias, std=1e-6)
+ else:
+ nn.init.zeros_(m.bias)
+ else:
+ trunc_normal_(m.weight, std=.02)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif jax_impl and isinstance(m, nn.Conv2d):
+ # NOTE conv was left to pytorch default in my original init
+ lecun_normal_(m.weight)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.zeros_(m.bias)
+ nn.init.ones_(m.weight)
+
+
+def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()):
+ # Rescale the grid of position embeddings when loading from state_dict. Adapted from
+ # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
+ _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
+ ntok_new = posemb_new.shape[1]
+ if num_tokens:
+ posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:]
+ ntok_new -= num_tokens
+ else:
+ posemb_tok, posemb_grid = posemb[:, :0], posemb[0]
+ gs_old = int(math.sqrt(len(posemb_grid)))
+ if not len(gs_new): # backwards compatibility
+ gs_new = [int(math.sqrt(ntok_new))] * 2
+ assert len(gs_new) >= 2
+ _logger.info('Position embedding grid-size from %s to %s', [gs_old, gs_old], gs_new)
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
+ posemb_grid = F.interpolate(posemb_grid, size=gs_new, mode='bilinear')
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+ return posemb
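+
+# Example (illustrative): fine-tuning a patch-16 model from 224x224 to 384x384 input
+# resizes the grid part of pos_embed from 14*14 = 196 to 24*24 = 576 positions via
+# bilinear interpolation; the class / distillation tokens are carried over unchanged.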
+
+
+def checkpoint_filter_fn(state_dict, model):
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
+ out_dict = {}
+ if 'model' in state_dict:
+ # For deit models
+ state_dict = state_dict['model']
+ for k, v in state_dict.items():
+ if 'patch_embed.proj.weight' in k and len(v.shape) < 4:
+ # For old models that I trained prior to conv based patchification
+ O, I, H, W = model.patch_embed.proj.weight.shape
+ v = v.reshape(O, -1, H, W)
+ elif k == 'pos_embed' and v.shape != model.pos_embed.shape:
+ # To resize pos embedding when using model at different size from pretrained weights
+ v = resize_pos_embed(v, model.pos_embed, getattr(model, 'num_tokens', 1),
+ model.patch_embed.grid_size)
+ out_dict[k] = v
+ return out_dict
+
+
+def _create_vision_transformer(variant, pretrained=False, default_cfg=None, **kwargs):
+ if default_cfg is None:
+ default_cfg = deepcopy(default_cfgs[variant])
+ overlay_external_default_cfg(default_cfg, kwargs)
+ default_num_classes = default_cfg['num_classes']
+ default_img_size = default_cfg['input_size'][-2:]
+
+ num_classes = kwargs.pop('num_classes', default_num_classes)
+ img_size = kwargs.pop('img_size', default_img_size)
+ repr_size = kwargs.pop('representation_size', None)
+ if repr_size is not None and num_classes != default_num_classes:
+ # Remove representation layer if fine-tuning. This may not always be the desired action,
+        # but it feels better than doing nothing by default when fine-tuning. Perhaps a better interface?
+ _logger.warning("Removing representation layer for fine-tuning.")
+ repr_size = None
+
+ if kwargs.get('features_only', None):
+ raise RuntimeError('features_only not implemented for Vision Transformer models.')
+
+ model = build_model_with_cfg(
+ VisionTransformer, variant, pretrained,
+ default_cfg=default_cfg,
+ img_size=img_size,
+ num_classes=num_classes,
+ representation_size=repr_size,
+ pretrained_filter_fn=checkpoint_filter_fn,
+ **kwargs)
+
+ return model
+
+
+@register_model
+def vit_small_patch16_224(pretrained=False, **kwargs):
+ """ My custom 'small' ViT model. embed_dim=768, depth=8, num_heads=8, mlp_ratio=3.
+ NOTE:
+ * this differs from the DeiT based 'small' definitions with embed_dim=384, depth=12, num_heads=6
+ * this model does not have a bias for QKV (unlike the official ViT and DeiT models)
+ """
+ model_kwargs = dict(
+ patch_size=16, embed_dim=768, depth=8, num_heads=8, mlp_ratio=3.,
+ qkv_bias=False, norm_layer=nn.LayerNorm, **kwargs)
+ if pretrained:
+ # NOTE my scale was wrong for original weights, leaving this here until I have better ones for this model
+ model_kwargs.setdefault('qk_scale', 768 ** -0.5)
+ model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch16_224(pretrained=False, **kwargs):
+ """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch32_224(pretrained=False, **kwargs):
+ """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
+ """
+ model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch16_384(pretrained=False, **kwargs):
+ """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch32_384(pretrained=False, **kwargs):
+ """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch16_224(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+ model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch32_224(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
+ """
+ model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+ model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch16_384(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+ model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch32_384(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs)
+ model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch16_224_in21k(pretrained=False, **kwargs):
+ """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(
+ patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs)
+ model = _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch32_224_in21k(pretrained=False, **kwargs):
+ """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(
+ patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs)
+ model = _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch16_224_in21k(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(
+ patch_size=16, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs)
+ model = _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_large_patch32_224_in21k(pretrained=False, **kwargs):
+ """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+ """
+ model_kwargs = dict(
+ patch_size=32, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs)
+ model = _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_huge_patch14_224_in21k(pretrained=False, **kwargs):
+ """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
+ ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+ NOTE: converted weights not currently available, too large for github release hosting.
+ """
+ model_kwargs = dict(
+ patch_size=14, embed_dim=1280, depth=32, num_heads=16, representation_size=1280, **kwargs)
+ model = _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_tiny_patch16_224(pretrained=False, **kwargs):
+ """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+ model = _create_vision_transformer('vit_deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_small_patch16_224(pretrained=False, **kwargs):
+ """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+ model = _create_vision_transformer('vit_deit_small_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_base_patch16_224(pretrained=False, **kwargs):
+ """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_deit_base_patch16_224', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_base_patch16_384(pretrained=False, **kwargs):
+ """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer('vit_deit_base_patch16_384', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_tiny_distilled_patch16_224(pretrained=False, **kwargs):
+ """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs)
+ model = _create_vision_transformer(
+ 'vit_deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_small_distilled_patch16_224(pretrained=False, **kwargs):
+ """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs)
+ model = _create_vision_transformer(
+ 'vit_deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_base_distilled_patch16_224(pretrained=False, **kwargs):
+ """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer(
+ 'vit_deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs):
+ """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877).
+ ImageNet-1k weights from https://github.com/facebookresearch/deit.
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
+ model = _create_vision_transformer(
+ 'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs):
+ """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+ Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+ model = _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **model_kwargs)
+ return model
+
+
+@register_model
+def vit_base_patch16_224_miil(pretrained=False, **kwargs):
+ """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+ Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+ """
+ model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+ model = _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **model_kwargs)
+ return model
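+
+
+# Usage sketch (illustrative): once this module is imported, any of the factories above can be
+# built through timm's model registry, e.g.
+#   model = create_model('vit_deit_base_patch16_224', pretrained=False)
+# the @register_model decorator is what makes these names resolvable by timm.models.create_model.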
--
Gitee
From 883e1375ea828ef07d7e1e488ebff9d19566ba56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:23 +0000
Subject: [PATCH 09/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?=
=?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/models/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0
1 file changed, 0 insertions(+), 0 deletions(-)
delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep
deleted file mode 100644
index e69de29bb2..0000000000
--
Gitee
From 0cc3dca85cfc02f45ecf01a4dec74604928e66ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 01:37:55 +0000
Subject: [PATCH 10/15] update
PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../cv/classification/T2T-ViT/test/train_performance_8p.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
index f024d6cc71..85e322f06f 100644
--- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
@@ -75,7 +75,7 @@ nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
--weight-decay .05 \
--amp \
--img-size 224 \
- --epochs 5 \
+ --epochs 1 \
--output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 &
--
Gitee
From fc1f100426ae4570c05e96851dd003bf5cb7f85f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 01:41:22 +0000
Subject: [PATCH 11/15] update t2t-main.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../classification/T2T-ViT/{main.py => t2t-main.py} | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
rename PyTorch/contrib/cv/classification/T2T-ViT/{main.py => t2t-main.py} (99%)
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
similarity index 99%
rename from PyTorch/contrib/cv/classification/T2T-ViT/main.py
rename to PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
index 1b3ceea782..d230656a48 100644
--- a/PyTorch/contrib/cv/classification/T2T-ViT/main.py
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
@@ -34,7 +34,7 @@ import torchvision.utils
from torch.optim.optimizer import Optimizer
from torch.nn.parallel import DistributedDataParallel as NativeDDP
-from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset, create_loader
from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model
from timm.utils import *
from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
@@ -42,8 +42,10 @@ from timm.optim import create_optimizer
from timm.scheduler import create_scheduler
from timm.utils import ApexScaler, NativeScaler
-from data.myloader import create_loader
+
+#from data.myloader import create_loader
from npu_fused_adamw import NpuFusedAdamW
+from metrics import t2taccuracy
torch.backends.cudnn.benchmark = True
_logger = logging.getLogger('train')
@@ -427,7 +429,7 @@ def main():
args, args_text = _parse_args()
os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1'
- os.environ['MASTER_PORT'] = '99999' # Any available port
+ os.environ['MASTER_PORT'] = '9999' # Any available port
args.prefetcher = not args.no_prefetcher
args.distributed = (args.workers > 1)
@@ -681,6 +683,7 @@ def main():
lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn)
+ #exit()
if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
if args.local_rank == 0 or args.workers == 1:
_logger.info("Distributing BatchNorm running means and vars")
@@ -875,7 +878,7 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='')
target = target[0:target.size(0):reduce_factor]
loss = loss_fn(output, target)
- acc1, acc5 = accuracy(output, target, topk=(1, 5))
+ acc1, acc5 = t2taccuracy(output, target, topk=(1, 5))
if args.distributed:
reduced_loss = reduce_tensor(loss.data, args.world_size)
--
Gitee
From 2fba4c3aa6a75ff9bb1e712d6f35d80e770e05af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 02:36:36 +0000
Subject: [PATCH 12/15] update
PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
index d230656a48..37dc538abe 100644
--- a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py
@@ -45,7 +45,6 @@ from timm.utils import ApexScaler, NativeScaler
#from data.myloader import create_loader
from npu_fused_adamw import NpuFusedAdamW
-from metrics import t2taccuracy
torch.backends.cudnn.benchmark = True
_logger = logging.getLogger('train')
@@ -878,7 +877,7 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='')
target = target[0:target.size(0):reduce_factor]
loss = loss_fn(output, target)
- acc1, acc5 = t2taccuracy(output, target, topk=(1, 5))
+ acc1, acc5 = accuracy(output, target, topk=(1, 5))
if args.distributed:
reduced_loss = reduce_tensor(loss.data, args.world_size)
--
Gitee
From 1cd1d1eb6a291d8a01f61c2474219eeb2d47cac8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 02:38:13 +0000
Subject: [PATCH 13/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../contrib/cv/classification/T2T-ViT/main.py | 913 ++++++++++++++++++
1 file changed, 913 insertions(+)
create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py
new file mode 100644
index 0000000000..9884699a79
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py
@@ -0,0 +1,913 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+T2T-ViT training and evaluating script
+This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/)
+It was started from an early version of the PyTorch ImageNet example
+(https://github.com/pytorch/examples/tree/master/imagenet)
+"""
+import argparse
+import time
+import yaml
+import os
+import logging
+import sys
+from collections import OrderedDict, defaultdict
+from contextlib import suppress
+from datetime import datetime
+import models
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torchvision.utils
+from torch.optim.optimizer import Optimizer
+from torch.nn.parallel import DistributedDataParallel as NativeDDP
+
+from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model
+from timm.utils import *
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
+from timm.optim import create_optimizer
+from timm.scheduler import create_scheduler
+from timm.utils import ApexScaler, NativeScaler
+
+from data.myloader import create_loader
+from npu_fused_adamw import NpuFusedAdamW
+from metrics import t2taccuracy
+
+torch.backends.cudnn.benchmark = True
+_logger = logging.getLogger('train')
+
+# The first arg parser parses out only the --config argument, which is used to load a YAML file
+# containing key-value pairs that override the defaults of the main parser below.
+config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False)
+parser.add_argument('-c', '--config', default='', type=str, metavar='FILE',
+ help='YAML config file specifying default arguments')
+
+parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating')
+
+# Dataset / Model parameters
+parser.add_argument('data', metavar='DIR',
+ help='path to dataset')
+parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL',
+                    help='Name of model to train (default: "T2t_vit_14")')
+parser.add_argument('--pretrained', action='store_true', default=False,
+ help='Start with pretrained version of specified network (if avail)')
+parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH',
+ help='Initialize model from this checkpoint (default: none)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+ help='Resume full model and optimizer state from checkpoint (default: none)')
+parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH',
+ help='path to eval checkpoint (default: none)')
+parser.add_argument('--no-resume-opt', action='store_true', default=False,
+ help='prevent resume of optimizer state when resuming model')
+parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
+ help='number of label classes (default: 1000)')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+ help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--img-size', type=int, default=224, metavar='N',
+                    help='Image size (default: 224)')
+parser.add_argument('--crop-pct', default=None, type=float,
+ metavar='N', help='Input image center crop percent (for validation only)')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+ help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+ help='Image resize interpolation type (overrides model)')
+parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
+ help='input batch size for training (default: 64)')
+parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
+ help='ratio of validation batch size to training batch size (default: 1)')
+
+# Optimizer parameters
+parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
+                    help='Optimizer (default: "adamw")')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON',
+ help='Optimizer Epsilon (default: None, use opt default)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
+ help='Optimizer Betas (default: None, use opt default)')
+parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+ help='Optimizer momentum (default: 0.9)')
+parser.add_argument('--weight-decay', type=float, default=0.05,
+                    help='weight decay (default: 0.05)')
+parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
+ help='Clip gradient norm (default: None, no clipping)')
+
+# Learning rate schedule parameters
+parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
+                    help='LR scheduler (default: "cosine")')
+parser.add_argument('--lr', type=float, default=5e-4, metavar='LR',
+                    help='learning rate (default: 5e-4)')
+parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
+ help='learning rate noise on/off epoch percentages')
+parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
+ help='learning rate noise limit percent (default: 0.67)')
+parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
+ help='learning rate noise std-dev (default: 1.0)')
+parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
+ help='learning rate cycle len multiplier (default: 1.0)')
+parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
+ help='learning rate cycle limit')
+parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR',
+                    help='warmup learning rate (default: 1e-6)')
+parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',
+ help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')
+parser.add_argument('--epochs', type=int, default=300, metavar='N',
+                    help='number of epochs to train (default: 300)')
+parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
+ help='manual epoch number (useful on restarts)')
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
+ help='epoch interval to decay LR')
+parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N',
+ help='epochs to warmup LR, if scheduler supports')
+parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
+ help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
+parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
+                    help='patience epochs for Plateau LR scheduler (default: 10)')
+parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
+ help='LR decay rate (default: 0.1)')
+
+# Augmentation & regularization parameters
+parser.add_argument('--no-aug', action='store_true', default=False,
+ help='Disable all training augmentation, override other train aug args')
+parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT',
+ help='Random resize scale (default: 0.08 1.0)')
+parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO',
+ help='Random resize aspect ratio (default: 0.75 1.33)')
+parser.add_argument('--hflip', type=float, default=0.5,
+ help='Horizontal flip training aug probability')
+parser.add_argument('--vflip', type=float, default=0.,
+ help='Vertical flip training aug probability')
+parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT',
+ help='Color jitter factor (default: 0.4)')
+parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
+                    help='Use AutoAugment policy, e.g. "v0", "original", or a RandAugment spec (default: "rand-m9-mstd0.5-inc1")')
+parser.add_argument('--aug-splits', type=int, default=0,
+ help='Number of augmentation splits (default: 0, valid: 0 or >=2)')
+parser.add_argument('--jsd', action='store_true', default=False,
+ help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
+parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
+ help='Random erase prob (default: 0.25)')
+parser.add_argument('--remode', type=str, default='pixel',
+                    help='Random erase mode (default: "pixel")')
+parser.add_argument('--recount', type=int, default=1,
+ help='Random erase count (default: 1)')
+parser.add_argument('--resplit', action='store_true', default=False,
+ help='Do not random erase first (clean) augmentation split')
+parser.add_argument('--mixup', type=float, default=0.8,
+                    help='mixup alpha, mixup enabled if > 0. (default: 0.8)')
+parser.add_argument('--cutmix', type=float, default=1.0,
+                    help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)')
+parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
+ help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+parser.add_argument('--mixup-prob', type=float, default=1.0,
+ help='Probability of performing mixup or cutmix when either/both is enabled')
+parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
+ help='Probability of switching to cutmix when both mixup and cutmix enabled')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+ help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
+parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N',
+ help='Turn off mixup after this epoch, disabled if 0 (default: 0)')
+parser.add_argument('--smoothing', type=float, default=0.1,
+ help='Label smoothing (default: 0.1)')
+parser.add_argument('--train-interpolation', type=str, default='random',
+ help='Training interpolation (random, bilinear, bicubic default: "random")')
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
+ help='Dropout rate (default: 0.0)')
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
+ help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
+parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT',
+                    help='Drop path rate (default: 0.1)')
+parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
+ help='Drop block rate (default: None)')
+
+# Batch norm parameters (only works with gen_efficientnet based models currently)
+parser.add_argument('--bn-tf', action='store_true', default=False,
+ help='Use Tensorflow BatchNorm defaults for models that support it (default: False)')
+parser.add_argument('--bn-momentum', type=float, default=None,
+ help='BatchNorm momentum override (if not None)')
+parser.add_argument('--bn-eps', type=float, default=None,
+ help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--sync-bn', action='store_true',
+ help='Enable NVIDIA Apex or Torch synchronized BatchNorm.')
+parser.add_argument('--dist-bn', type=str, default='',
+ help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")')
+parser.add_argument('--split-bn', action='store_true',
+ help='Enable separate BN layers per augmentation split.')
+
+# Model Exponential Moving Average
+parser.add_argument('--model-ema', action='store_true', default=True,
+ help='Enable tracking moving average of model weights')
+parser.add_argument('--model-ema-force-cpu', action='store_true', default=False,
+ help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.99996,
+                    help='decay factor for model weights moving average (default: 0.99996)')
+
+# Misc
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+ help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+ help='how many batches to wait before logging training status')
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
+ help='how many batches to wait before writing recovery checkpoint')
+parser.add_argument('-j', '--workers', type=int, default=8, metavar='N',
+                    help='how many training processes to use (default: 8)')
+parser.add_argument('--num-gpu', type=int, default=1,
+ help='Number of GPUS to use')
+parser.add_argument('--save-images', action='store_true', default=False,
+                    help='save images of input batches every log interval for debugging')
+parser.add_argument('--amp', action='store_true', default=False,
+ help='use NVIDIA Apex AMP or Native AMP for mixed precision training')
+parser.add_argument('--apex-amp', action='store_true', default=False,
+ help='Use NVIDIA Apex AMP mixed precision')
+parser.add_argument('--native-amp', action='store_true', default=False,
+ help='Use Native Torch AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+ help='Use channels_last memory layout')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+ help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+ help='disable fast prefetcher')
+parser.add_argument('--output', default='', type=str, metavar='PATH',
+ help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "top1")')
+parser.add_argument('--tta', type=int, default=0, metavar='N',
+ help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)')
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False,
+ help='use the multi-epochs-loader to save time at the beginning of every epoch')
+
+parser.add_argument("--addr", default="127.0.0.1", type=str)
+parser.add_argument("--performance", action='store_true', default=False,
+                    help='performance mode: train only one epoch to measure throughput')
+
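+# Ascend Apex is assumed to be installed (it provides NpuFusedSGD and amp with combine_grad);
+# the import below is unconditional, so the script fails fast if it is missing.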
+has_apex = True
+
+import apex
+from apex import amp
+from apex.parallel import DistributedDataParallel as ApexDDP
+from apex.parallel import convert_syncbn_model
+
+def optimizer_kwargs(cfg):
+ """ cfg/argparse to kwargs helper
+ Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn.
+ """
+ kwargs = dict(
+ opt=cfg.opt,
+ lr=cfg.lr,
+ weight_decay=cfg.weight_decay,
+ momentum=cfg.momentum)
+ if getattr(cfg, 'opt_eps', None) is not None:
+ kwargs['eps'] = cfg.opt_eps
+ if getattr(cfg, 'opt_betas', None) is not None:
+ kwargs['betas'] = cfg.opt_betas
+ if getattr(cfg, 'opt_args', None) is not None:
+ kwargs.update(cfg.opt_args)
+ return kwargs
+
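+# Sketch: with the CLI defaults above, optimizer_kwargs(args) yields roughly
+# {'opt': 'adamw', 'lr': 5e-4, 'weight_decay': 0.05, 'momentum': 0.9}; eps/betas are added only when set.
+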
+def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
+    """Split parameters into two groups: biases, 1-D tensors (e.g. norm weights) and names in
+    skip_list get zero weight decay; all other parameters use the given weight_decay.
+    """
+ decay = []
+ no_decay = []
+ for name, param in model.named_parameters():
+ if not param.requires_grad:
+ continue # frozen weights
+ if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
+ no_decay.append(param)
+ else:
+ decay.append(param)
+ return [
+ {'params': no_decay, 'weight_decay': 0.},
+ {'params': decay, 'weight_decay': weight_decay}]
+
+class Lookahead(Optimizer):
+ def __init__(self, base_optimizer, alpha=0.5, k=6):
+ # NOTE super().__init__() not called on purpose
+ if not 0.0 <= alpha <= 1.0:
+ raise ValueError(f'Invalid slow update rate: {alpha}')
+ if not 1 <= k:
+ raise ValueError(f'Invalid lookahead steps: {k}')
+ defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0)
+ self._base_optimizer = base_optimizer
+ self.param_groups = base_optimizer.param_groups
+ self.defaults = base_optimizer.defaults
+ self.defaults.update(defaults)
+ self.state = defaultdict(dict)
+ # manually add our defaults to the param groups
+ for name, default in defaults.items():
+ for group in self._base_optimizer.param_groups:
+ group.setdefault(name, default)
+
+ @torch.no_grad()
+ def update_slow(self, group):
+ for fast_p in group["params"]:
+ if fast_p.grad is None:
+ continue
+ param_state = self._base_optimizer.state[fast_p]
+ if 'lookahead_slow_buff' not in param_state:
+ param_state['lookahead_slow_buff'] = torch.empty_like(fast_p)
+ param_state['lookahead_slow_buff'].copy_(fast_p)
+ slow = param_state['lookahead_slow_buff']
+ slow.add_(fast_p - slow, alpha=group['lookahead_alpha'])
+ fast_p.copy_(slow)
+
+ def sync_lookahead(self):
+ for group in self._base_optimizer.param_groups:
+ self.update_slow(group)
+
+ @torch.no_grad()
+ def step(self, closure=None):
+ loss = self._base_optimizer.step(closure)
+ for group in self._base_optimizer.param_groups:
+ group['lookahead_step'] += 1
+ if group['lookahead_step'] % group['lookahead_k'] == 0:
+ self.update_slow(group)
+ return loss
+
+ def state_dict(self):
+ return self._base_optimizer.state_dict()
+
+ def load_state_dict(self, state_dict):
+ self._base_optimizer.load_state_dict(state_dict)
+ self.param_groups = self._base_optimizer.param_groups
+
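+# Lookahead usage sketch (illustrative): wrap a base optimizer, e.g.
+#   optimizer = Lookahead(NpuFusedAdamW(model.parameters(), lr=5e-4), alpha=0.5, k=6)
+# the slow weights are synced back into the fast weights every k steps.
+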
+def create_optimizer_v2(
+ model_or_params,
+ opt: str = 'sgd',
+ lr: Optional[float] = None,
+ weight_decay: float = 0.,
+ momentum: float = 0.9,
+ filter_bias_and_bn: bool = True,
+ **kwargs):
+ """ Create an optimizer.
+ Only support npu fused AdamW and npu fused SGD
+ """
+ if isinstance(model_or_params, nn.Module):
+ # a model was passed in, extract parameters and add weight decays to appropriate layers
+ if weight_decay and filter_bias_and_bn:
+ skip = {}
+ if hasattr(model_or_params, 'no_weight_decay'):
+ skip = model_or_params.no_weight_decay()
+ parameters = add_weight_decay(model_or_params, weight_decay, skip)
+ weight_decay = 0.
+ else:
+ parameters = model_or_params.parameters()
+ else:
+ # iterable of parameters or param groups passed in
+ parameters = model_or_params
+
+ opt_lower = opt.lower()
+ opt_split = opt_lower.split('_')
+ opt_lower = opt_split[-1]
+ # if 'fused' in opt_lower:
+ # assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'
+
+ opt_args = dict(weight_decay=weight_decay, **kwargs)
+ if lr is not None:
+ opt_args.setdefault('lr', lr)
+
+ # basic SGD & related
+ if opt_lower == 'sgd' or opt_lower == 'nesterov':
+ # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons
+ opt_args.pop('eps', None)
+ # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+ optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+ elif opt_lower == 'momentum':
+ opt_args.pop('eps', None)
+ # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+ optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+ elif opt_lower == 'adamw':
+ # optimizer = optim.AdamW(parameters, **opt_args)
+ optimizer = NpuFusedAdamW(parameters, **opt_args)
+ else:
+        raise ValueError(f'Invalid optimizer: {opt_lower}')
+
+ if len(opt_split) > 1:
+ if opt_split[0] == 'lookahead':
+ optimizer = Lookahead(optimizer)
+
+ return optimizer
+
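+# Usage sketch: create_optimizer_v2(model, opt='lookahead_adamw', lr=5e-4, weight_decay=0.05)
+# builds NpuFusedAdamW and, because of the 'lookahead_' prefix, wraps it in Lookahead.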
+
+
+def _parse_args():
+ # Do we have a config file to parse?
+ args_config, remaining = config_parser.parse_known_args()
+ if args_config.config:
+ with open(args_config.config, 'r') as f:
+ cfg = yaml.safe_load(f)
+ parser.set_defaults(**cfg)
+
+ # The main arg parser parses the rest of the args, the usual
+ # defaults will have been overridden if config file specified.
+ args = parser.parse_args(remaining)
+
+ # Cache the args as a text string to save them in the output dir later
+ args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+ return args, args_text
+
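+# Config sketch (hypothetical file name): python main.py /path/to/imagenet -c t2t_vit_14.yaml, where
+# the YAML keys match the argparse dests above (e.g. model: T2t_vit_14, batch_size: 64) and override
+# the parser defaults before the remaining command-line arguments are parsed.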
+
+def main():
+ setup_default_logging()
+ args, args_text = _parse_args()
+
+ os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1'
+    os.environ['MASTER_PORT'] = '9999' # Any available port
+
+ args.prefetcher = not args.no_prefetcher
+ args.distributed = (args.workers > 1)
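+    # NOTE: in this port --workers doubles as the distributed world size (see args.world_size below).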
+
+ torch.npu.set_device(args.local_rank)
+ args.world_size = 1
+ args.rank = args.local_rank # global rank
+ if args.distributed:
+ torch.npu.set_device(args.local_rank)
+ args.world_size = args.workers
+ torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size)
+ args.world_size = torch.distributed.get_world_size()
+ assert args.rank >= 0
+
+ if args.distributed:
+        _logger.info('Training in distributed mode with multiple processes, 1 NPU per process. Process %d, total %d.'
+ % (args.rank, args.world_size))
+ else:
+        _logger.info('Training with a single process on %d NPUs.' % args.num_gpu)
+
+ torch.manual_seed(args.seed + args.rank)
+
+ model = create_model(
+ args.model,
+ pretrained=args.pretrained,
+ num_classes=args.num_classes,
+ drop_rate=args.drop,
+ drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path
+ drop_path_rate=args.drop_path,
+ drop_block_rate=args.drop_block,
+ global_pool=args.gp,
+ bn_tf=args.bn_tf,
+ bn_momentum=args.bn_momentum,
+ bn_eps=args.bn_eps,
+ checkpoint_path=args.initial_checkpoint,
+ img_size=args.img_size)
+
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info('Model %s created, param count: %d' %
+ (args.model, sum([m.numel() for m in model.parameters()])))
+
+ data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1))
+
+ num_aug_splits = 0
+ if args.aug_splits > 0:
+ assert args.aug_splits > 1, 'A split of 1 makes no sense'
+ num_aug_splits = args.aug_splits
+
+ if args.split_bn:
+ assert num_aug_splits > 1 or args.resplit
+ model = convert_splitbn_model(model, max(num_aug_splits, 2))
+
+ use_amp = None
+ args.apex_amp = True
+ use_amp = 'apex'
+
+ model.npu()
+ if args.channels_last:
+ model = model.to(memory_format=torch.channels_last)
+
+ optimizer = create_optimizer_v2(
+ model,
+ **optimizer_kwargs(cfg=args),
+ filter_bias_and_bn=True,
+ )
+ # optimizer = create_optimizer(args, model)
+
+ amp_autocast = suppress # do nothing
+ loss_scaler = None
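+    # Apex O1 mixed precision with a static loss scale of 128; combine_grad=True is an Ascend Apex
+    # extension that fuses gradients into combined tensors to speed up scaling and all-reduce.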
+ model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True)
+ loss_scaler = ApexScaler()
+ if args.local_rank == 0:
+ _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.')
+
+ # optionally resume from a checkpoint
+ resume_epoch = None
+ if args.resume:
+ resume_epoch = resume_checkpoint(
+ model, args.resume,
+ optimizer=None if args.no_resume_opt else optimizer,
+ loss_scaler=None if args.no_resume_opt else loss_scaler,
+ log_info=args.local_rank == 0)
+
+ model_ema = None
+ if args.model_ema:
+ # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
+ model_ema = ModelEma(
+ model,
+ decay=args.model_ema_decay,
+ device='cpu' if args.model_ema_force_cpu else '',
+ resume=args.resume)
+
+ if args.distributed:
+ if args.sync_bn:
+ assert not args.split_bn
+ try:
+ if has_apex and use_amp != 'native':
+ # Apex SyncBN preferred unless native amp is activated
+ model = convert_syncbn_model(model)
+ else:
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+ if args.local_rank == 0:
+ _logger.info(
+ 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
+ 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.')
+ except Exception as e:
+ _logger.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
+
+ model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1
+ # NOTE: EMA model does not need to be wrapped by DDP
+
+ lr_scheduler, num_epochs = create_scheduler(args, optimizer)
+ if args.performance:
+ num_epochs = 1
+ start_epoch = 0
+ if args.start_epoch is not None:
+ # a specified start_epoch will always override the resume epoch
+ start_epoch = args.start_epoch
+ elif resume_epoch is not None:
+ start_epoch = resume_epoch
+ if lr_scheduler is not None and start_epoch > 0:
+ lr_scheduler.step(start_epoch)
+
+ if args.local_rank == 0:
+ _logger.info('Scheduled epochs: {}'.format(num_epochs))
+
+ train_dir = os.path.join(args.data, 'train')
+ if not os.path.exists(train_dir):
+ _logger.error('Training folder does not exist at: {}'.format(train_dir))
+ exit(1)
+ dataset_train = Dataset(train_dir)
+
+ collate_fn = None
+ mixup_fn = None
+ mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
+ if mixup_active:
+ mixup_args = dict(
+ mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
+ prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
+ label_smoothing=args.smoothing, num_classes=args.num_classes)
+ if args.prefetcher:
+ assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup)
+ collate_fn = FastCollateMixup(**mixup_args)
+ else:
+ mixup_fn = Mixup(**mixup_args)
+
+ if num_aug_splits > 1:
+ dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)
+
+ train_interpolation = args.train_interpolation
+ if args.no_aug or not train_interpolation:
+ train_interpolation = data_config['interpolation']
+ loader_train = create_loader(
+ dataset_train,
+ input_size=data_config['input_size'],
+ batch_size=args.batch_size,
+ is_training=True,
+ use_prefetcher=args.prefetcher,
+ no_aug=args.no_aug,
+ re_prob=args.reprob,
+ # re_mode=args.remode,
+ re_count=args.recount,
+ re_split=args.resplit,
+ scale=args.scale,
+ ratio=args.ratio,
+ hflip=args.hflip,
+ vflip=args.vflip,
+ color_jitter=args.color_jitter,
+ auto_augment=args.aa,
+ num_aug_splits=num_aug_splits,
+ interpolation=train_interpolation,
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ distributed=args.distributed,
+ collate_fn=collate_fn,
+ pin_memory=args.pin_mem,
+ use_multi_epochs_loader=args.use_multi_epochs_loader
+ )
+
+ eval_dir = os.path.join(args.data, 'val')
+ if not os.path.isdir(eval_dir):
+ eval_dir = os.path.join(args.data, 'validation')
+ if not os.path.isdir(eval_dir):
+ _logger.error('Validation folder does not exist at: {}'.format(eval_dir))
+ exit(1)
+ dataset_eval = Dataset(eval_dir)
+
+ loader_eval = create_loader(
+ dataset_eval,
+ input_size=data_config['input_size'],
+ batch_size=args.validation_batch_size_multiplier * args.batch_size,
+ is_training=False,
+ use_prefetcher=args.prefetcher,
+ interpolation=data_config['interpolation'],
+ mean=data_config['mean'],
+ std=data_config['std'],
+ num_workers=args.workers,
+ distributed=args.distributed,
+ crop_pct=data_config['crop_pct'],
+ pin_memory=args.pin_mem,
+ )
+
+ if args.jsd:
+ assert num_aug_splits > 1 # JSD only valid with aug splits set
+ train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu()
+ elif mixup_active:
+ # smoothing is handled with mixup target transform
+ train_loss_fn = SoftTargetCrossEntropy().npu()
+ elif args.smoothing:
+ train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu()
+ else:
+ train_loss_fn = nn.CrossEntropyLoss().npu()
+ validate_loss_fn = nn.CrossEntropyLoss().npu()
+
+ eval_metric = args.eval_metric
+ best_metric = None
+ best_epoch = None
+
+ if args.eval_checkpoint: # evaluate the model
+ load_checkpoint(model, args.eval_checkpoint, args.model_ema)
+ val_metrics = validate(model, loader_eval, validate_loss_fn, args)
+ print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%")
+ return
+
+ saver = None
+ output_dir = ''
+ if args.local_rank == 0:
+ output_base = args.output if args.output else './output'
+ exp_name = '-'.join([
+ datetime.now().strftime("%Y%m%d-%H%M%S"),
+ args.model,
+ str(data_config['input_size'][-1])
+ ])
+ output_dir = get_outdir(output_base, 'train', exp_name)
+ decreasing = True if eval_metric == 'loss' else False
+ saver = CheckpointSaver(
+ model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
+ checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing)
+ with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
+ f.write(args_text)
+
+ try: # train the model
+ for epoch in range(start_epoch, num_epochs):
+ if args.distributed:
+ loader_train.sampler.set_epoch(epoch)
+
+ train_metrics = train_epoch(
+ epoch, model, loader_train, optimizer, train_loss_fn, args,
+ lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
+ amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn)
+
+ if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info("Distributing BatchNorm running means and vars")
+ distribute_bn(model, args.world_size, args.dist_bn == 'reduce')
+
+ eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast)
+
+ if model_ema is not None and not args.model_ema_force_cpu:
+ if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
+ distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
+ ema_eval_metrics = validate(
+ model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)')
+ eval_metrics = ema_eval_metrics
+
+ if lr_scheduler is not None:
+ # step LR for next epoch
+ lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])
+
+ update_summary(
+ epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
+ write_header=best_metric is None)
+
+ if saver is not None:
+ # save proper checkpoint with eval metric
+ save_metric = eval_metrics[eval_metric]
+ best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric)
+
+ except KeyboardInterrupt:
+ pass
+ if best_metric is not None:
+ _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
+
+
+def train_epoch(
+ epoch, model, loader, optimizer, loss_fn, args,
+ lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress,
+ loss_scaler=None, model_ema=None, mixup_fn=None):
+ if args.mixup_off_epoch and epoch >= args.mixup_off_epoch:
+ if args.prefetcher and loader.mixup_enabled:
+ loader.mixup_enabled = False
+ elif mixup_fn is not None:
+ mixup_fn.mixup_enabled = False
+
+ second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order
+ batch_time_m = AverageMeter()
+ data_time_m = AverageMeter()
+ losses_m = AverageMeter()
+ top1_m = AverageMeter()
+ top5_m = AverageMeter()
+
+ model.train()
+
+ end = time.time()
+ last_idx = len(loader) - 1
+ num_updates = epoch * len(loader)
+ epoch_fps = []
+ prof_list = []
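+    # prof_list is empty by default, so the profiling branch below is skipped unless batch indices are added here.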
+ for batch_idx, (input, target) in enumerate(loader):
+ last_batch = batch_idx == last_idx
+ data_time_m.update(time.time() - end)
+ if not args.prefetcher:
+ input, target = input.npu(), target.npu()
+ if mixup_fn is not None:
+ input, target = mixup_fn(input, target)
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ if batch_idx in prof_list:
+ with torch.autograd.profiler.profile(use_npu=True) as prof:
+ output = model(input)
+ loss = loss_fn(output, target)
+ if not args.distributed:
+ losses_m.update(loss.item(), input.size(0))
+
+ optimizer.zero_grad()
+ if loss_scaler is not None:
+ loss_scaler(
+ loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order)
+ else:
+ loss.backward(create_graph=second_order)
+ if args.clip_grad is not None:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
+ optimizer.step()
+ print(prof.key_averages().table(sort_by="self_cpu_time_total"))
+ prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4)))
+ sys.exit()
+
+ else:
+ with amp_autocast():
+ output = model(input)
+ loss = loss_fn(output, target)
+
+ if not args.distributed:
+ losses_m.update(loss.item(), input.size(0))
+
+ optimizer.zero_grad()
+ if loss_scaler is not None:
+ loss_scaler(
+ loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order)
+ else:
+ loss.backward(create_graph=second_order)
+ if args.clip_grad is not None:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad)
+ optimizer.step()
+
+ torch.npu.synchronize()
+ if model_ema is not None:
+ model_ema.update(model)
+ num_updates += 1
+
+ batch_time_m.update(time.time() - end)
+
+ if last_batch or batch_idx % args.log_interval == 0:
+ lrl = [param_group['lr'] for param_group in optimizer.param_groups]
+ lr = sum(lrl) / len(lrl)
+
+ if args.distributed:
+ reduced_loss = reduce_tensor(loss.data, args.world_size)
+ losses_m.update(reduced_loss.item(), input.size(0))
+
+ if args.local_rank == 0 or args.workers == 1:
+ _logger.info(
+ 'Train: {} [{:>4d}/{} ({:>3.0f}%)] '
+ 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
+ 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
+ '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+ 'LR: {lr:.3e} '
+ 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
+ epoch,
+ batch_idx, len(loader),
+ 100. * batch_idx / last_idx,
+ loss=losses_m,
+ batch_time=batch_time_m,
+ rate=input.size(0) * args.world_size / batch_time_m.val,
+ rate_avg=input.size(0) * args.world_size / batch_time_m.avg,
+ lr=lr,
+ data_time=data_time_m))
+
+ if args.save_images and output_dir:
+ torchvision.utils.save_image(
+ input,
+ os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx),
+ padding=0,
+ normalize=True)
+
+ if saver is not None and args.recovery_interval and (
+ last_batch or (batch_idx + 1) % args.recovery_interval == 0):
+ saver.save_recovery(epoch, batch_idx=batch_idx)
+
+ if lr_scheduler is not None:
+ lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg)
+
+ epoch_fps.append(input.shape[0] * args.workers / (time.time() - end))
+ end = time.time()
+ # end for
+
+ if hasattr(optimizer, 'sync_lookahead'):
+ optimizer.sync_lookahead()
+
+ print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:])))
+ return OrderedDict([('loss', losses_m.avg)])
+
+
+def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''):
+ batch_time_m = AverageMeter()
+ losses_m = AverageMeter()
+ top1_m = AverageMeter()
+ top5_m = AverageMeter()
+
+ model.eval()
+
+ end = time.time()
+ last_idx = len(loader) - 1
+ with torch.no_grad():
+ for batch_idx, (input, target) in enumerate(loader):
+ last_batch = batch_idx == last_idx
+ if not args.prefetcher:
+ input = input.npu()
+ target = target.npu()
+ if args.channels_last:
+ input = input.contiguous(memory_format=torch.channels_last)
+
+ with amp_autocast():
+ output = model(input)
+ if isinstance(output, (tuple, list)):
+ output = output[0]
+
+ # augmentation reduction
+ reduce_factor = args.tta
+ if reduce_factor > 1:
+ output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2)
+ target = target[0:target.size(0):reduce_factor]
+
+ loss = loss_fn(output, target)
+ acc1, acc5 = t2taccuracy(output, target, topk=(1, 5))
+
+ if args.distributed:
+ reduced_loss = reduce_tensor(loss.data, args.world_size)
+ acc1 = reduce_tensor(acc1, args.world_size)
+ acc5 = reduce_tensor(acc5, args.world_size)
+ else:
+ reduced_loss = loss.data
+
+ torch.npu.synchronize()
+
+ losses_m.update(reduced_loss.item(), input.size(0))
+ top1_m.update(acc1.item(), output.size(0))
+ top5_m.update(acc5.item(), output.size(0))
+
+ batch_time_m.update(time.time() - end)
+ end = time.time()
+ if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0):
+ log_name = 'Test' + log_suffix
+ _logger.info(
+ '{0}: [{1:>4d}/{2}] '
+ 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
+ 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+ 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
+ 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format(
+ log_name, batch_idx, last_idx, batch_time=batch_time_m,
+ loss=losses_m, top1=top1_m, top5=top5_m))
+
+ metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)])
+
+ return metrics
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
--
Gitee
From b7c1dce395f052f3fd10d787079fab6993c9c97f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 02:38:52 +0000
Subject: [PATCH 14/15] update
PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
index 09dd270d72..88152a40e9 100644
--- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
@@ -66,7 +66,7 @@ if [ x"${etp_flag}" != x"true" ];then
#export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
fi
-nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \
${data_path} \
--model t2t_vit_14 \
--batch-size 64 \
--
Gitee
From 2f18386bf49f5ecb5fdab8f76ec24a0c5d38c14d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Wed, 21 Dec 2022 02:39:07 +0000
Subject: [PATCH 15/15] update
PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: 王旭 <1149693659@qq.com>
---
.../cv/classification/T2T-ViT/test/train_performance_1p.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
index 6c1828793e..f6018663ff 100644
--- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
@@ -67,7 +67,7 @@ if [ x"${etp_flag}" != x"true" ];then
#export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
fi
-nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \
${data_path} \
--model t2t_vit_14 \
--batch-size 64 \
--
Gitee