From 35e040c59c3e52c7583f97efd63a7dc795831255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:46:23 +0000 Subject: [PATCH 01/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20T2T-ViT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From dfce126ce662969faeb2f176c6006af0aec13016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:46:56 +0000 Subject: [PATCH 02/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 66f0272ce86deb381238bc7e3c10648aa0970cc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:06 +0000 Subject: [PATCH 03/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?= =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 276054e344e499285a1ffa1fdb344aa9c14a5dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:14 +0000 Subject: [PATCH 04/15] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 3f8712405b7f0c5eb2238c95dd996f77a158633f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:34 +0000 Subject: [PATCH 05/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/Dockerfile | 5 + .../contrib/cv/classification/T2T-ViT/LICENSE | 201 ++++ .../cv/classification/T2T-ViT/README.md | 165 ++++ .../cv/classification/T2T-ViT/README_raw.md | 202 ++++ .../cv/classification/T2T-ViT/docker_start.sh | 25 + .../contrib/cv/classification/T2T-ViT/main.py | 912 ++++++++++++++++++ .../cv/classification/T2T-ViT/metrics.py | 41 + .../classification/T2T-ViT/modelzoo_level.txt | 3 + 
.../classification/T2T-ViT/npu_fused_adamw.py | 255 +++++ .../classification/T2T-ViT/requirements.txt | 4 + 10 files changed, 1813 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/LICENSE create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README.md create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/metrics.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile new file mode 100644 index 0000000000..7e712fe1a1 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM $FROM_IMAGE_NAME + +COPY requirements.txt . +RUN pip3.7 install -r requirements.txt \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE new file mode 100644 index 0000000000..753842b672 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README.md b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
new file mode 100644
index 0000000000..9fce4d9354
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README.md
@@ -0,0 +1,165 @@
+# T2T-ViT for PyTorch
+
+- [Overview](概述.md)
+- [Preparing the Training Environment](准备训练环境.md)
+- [Starting Training](开始训练.md)
+- [Training Results](训练结果展示.md)
+- [Release Notes](版本说明.md)
+
+
+
+# Overview
+
+## Summary
+
+T2T-ViT addresses image classification by capturing the local structure of images. ViT splits an image into tokens and models them globally with stacked Transformers, but this loses locality, even though local information such as edges, lines, and textures is important for visual understanding. In addition, the attention backbone of ViT contains redundancy, its features have limited richness, and the model is difficult to train. The paper therefore proposes a Tokens-to-Token (T2T) module that models the local structure of an image, together with a more efficient Transformer backbone design that enriches intermediate features and reduces redundancy. With these changes, a vision Transformer pretrained purely on ImageNet outperforms the CNN-based ResNet architecture, and the design ideas have had a positive influence on later work on vision Transformers.
+
+- Reference implementation:
+
+  ```
+  url=https://github.com/yitu-opensource/T2T-ViT.git
+  commit_id=0f63dc9558f4d192de926504dbddfa1b3f5db6ca
+  ```
+
+- Implementation adapted for Ascend AI Processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/classification
+  ```
+
+- Obtain the code through Git as follows:
+
+  ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the model code path; if this is the only model in the repo, no switch is needed
+  ```
+
+- Alternatively, click "Download Now" to download the source package.
+
+# Preparing the Training Environment
+
+## Preparing the Environment
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Dependency          | Version                                                      |
+  | ------------------- | ------------------------------------------------------------ |
+  | Firmware and driver | [5.1.RC2](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN                | [5.1.RC2](https://www.hiascend.com/software/cann/commercial?version=5.1.RC2) |
+  | PyTorch             | [1.5.0](https://gitee.com/ascend/pytorch/tree/master/)       |
+
+
+- Environment setup guidance.
+
+  See "[Preparing the PyTorch Framework Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)".
+
+- Install the dependencies.
+
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## Preparing the Dataset
+
+1. Obtain the dataset.
+   Download the original ImageNet2012 dataset yourself, upload it to any path on the server, and extract it. The dataset directory structure is as follows:
+   ```
+   ├── ImageNet2012
+         ├──train
+              ├──class1
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──class2
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──...
+         ├──val
+              ├──class1
+                    │──image1
+                    │──image2
+                    │   ...
+              ├──class2
+                    │──image1
+                    │──image2
+                    │   ...
+   ```
+
+
+   > **Note:**
+   > The training scripts for this dataset serve only as a reference example.
+
+# Starting Training
+
+## Training the Model
+
+1. Enter the root directory of the extracted source package.
+
+   ```
+   cd /${model_folder_name}
+   ```
+
+2. 
Run the training script.
+
+   The model supports single-machine 1-card training and single-machine 8-card training.
+
+   - Single-machine 1-card training
+
+     Start 1-card training.
+
+     ```
+     bash ./test/train_full_1p.sh --data_path=/data/xxx/
+     ```
+
+   - Single-machine 8-card training
+
+     Start 8-card training.
+
+     ```
+     bash ./test/train_full_8p.sh --data_path=/data/xxx/
+     ```
+
+   Set --data\_path to the dataset path.
+
+   The training script arguments are described below.
+
+   ```
+   Common arguments:
+   --data_path                         // dataset path
+   ```
+
+
+
+Log output paths:
+
+    test/output/device_id/train_${device_id}.log                  # training detail log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_perf.log            # 8p training performance result log
+
+    test/output/device_id/T2T-ViT_2_bs8192_8p_acc.log             # 8p training accuracy result log
+
+# Training Results
+
+**Table 2** Training results
+
+| Name   | Acc@1 | FPS  | Epochs | AMP_Type |
+| ------ | ----- | ---- | ------ | -------- |
+| GPU-1p | -     | -    | 1      | O1       |
+| GPU-8p | -     | -    | 300    | O1       |
+| NPU-1p | -     | -    | 1      | O1       |
+| NPU-8p | -     | -    | 300    | O1       |
+
+# Release Notes
+
+## Changes
+
+2022.11.22: First release.
+
+## Known Issues
+
+None.
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
new file mode 100644
index 0000000000..d4c537021e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/README_raw.md
@@ -0,0 +1,202 @@
+# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet, [ICCV 2021](https://openaccess.thecvf.com/content/ICCV2021/html/Yuan_Tokens-to-Token_ViT_Training_Vision_Transformers_From_Scratch_on_ImageNet_ICCV_2021_paper.html)
+
+### Update:
+2021/03/11: updated results. Our T2T-ViT-14 with 21.5M parameters now reaches 81.5% top-1 accuracy at 224x224 image resolution, and 83.3% top-1 accuracy at 384x384 resolution.
+
+2021/02/21: T2T-ViT can be trained stably with '--amp' (Automatic Mixed Precision) on most common GPUs: 1080Ti, 2080Ti, TITAN V, V100. On some specific GPUs such as the Tesla T4, 'amp' can cause NaN loss when training T2T-ViT. If you get NaN loss in training, you can disable amp by removing '--amp' from the [training scripts](https://github.com/yitu-opensource/T2T-ViT#train).
+
+2021/01/28: release codes and upload most of the pretrained models of T2T-ViT.
+
+
+## Reference
+If you find this repo useful, please consider citing:
+```
+@InProceedings{Yuan_2021_ICCV,
+    author    = {Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Jiang, Zi-Hang and Tay, Francis E.H. and Feng, Jiashi and Yan, Shuicheng},
+    title     = {Tokens-to-Token ViT: Training Vision Transformers From Scratch on ImageNet},
+    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+    month     = {October},
+    year      = {2021},
+    pages     = {558-567}
+}
+```
+
+Our code is based on the [official imagenet example](https://github.com/pytorch/examples/tree/master/imagenet) by [PyTorch](https://pytorch.org/) and [pytorch-image-models](https://github.com/rwightman/pytorch-image-models) by [Ross Wightman](https://github.com/rwightman).
+
+
+## 1. Requirements
+
+[timm](https://github.com/rwightman/pytorch-image-models): pip install timm==0.3.4
+
+torch>=1.4.0
+
+torchvision>=0.5.0
+
+pyyaml
+
+Data preparation: ImageNet with the following folder structure; you can extract ImageNet with this [script](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4).
+
+```
+│imagenet/
+├──train/
+│  ├── n01440764
+│  │   ├── n01440764_10026.JPEG
+│  │   ├── n01440764_10027.JPEG
+│  │   ├── ......
+│  ├── ......
+├──val/
+│  ├── n01440764
+│  │   ├── ILSVRC2012_val_00000293.JPEG
+│  │   ├── ILSVRC2012_val_00002138.JPEG
+│  │   ├── ......
+│  ├── ......
+```
+
+## 2. T2T-ViT Models
+
+
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | Performer | 81.5 | 21.5M | 4.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar)|
+| T2T-ViT-19 | Performer | 81.9 | 39.2M | 8.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.9_T2T_ViT_19.pth.tar)|
+| T2T-ViT-24 | Performer | 82.3 | 64.1M | 13.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.3_T2T_ViT_24.pth.tar)|
+| T2T-ViT-14, 384 | Performer | 83.3 | 21.7M |  | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/83.3_T2T_ViT_14.pth.tar)|
+| T2T-ViT-24, Token Labeling | Performer | 84.2 | 65M |  | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/84.2_T2T_ViT_24.pth.tar)|
+| T2T-ViT_t-14 | Transformer | 81.7 | 21.5M | 6.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.7_T2T_ViTt_14.pth.tar) |
+| T2T-ViT_t-19 | Transformer | 82.4 | 39.2M | 9.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.4_T2T_ViTt_19.pth.tar) |
+| T2T-ViT_t-24 | Transformer | 82.6 | 64.1M | 15.0G| [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/82.6_T2T_ViTt_24.pth.tar) |
+
+'T2T-ViT-14, 384' means we train T2T-ViT-14 with an image size of 384 x 384.
+
+'T2T-ViT-24, Token Labeling' means we train T2T-ViT-24 with [Token Labeling](https://github.com/zihangJiang/TokenLabeling). 
+
+The three lite variants of T2T-ViT (compared with MobileNets):
+| Model | T2T Transformer | Top1 Acc | #params | MACs | Download|
+| :--- | :---: | :---: | :---: | :---: | :---: |
+| T2T-ViT-7 | Performer | 71.7 | 4.3M | 1.1G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar)|
+| T2T-ViT-10 | Performer | 75.2 | 5.9M | 1.5G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar)|
+| T2T-ViT-12 | Performer | 76.5 | 6.9M | 1.8G | [here](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar) |
+
+
+### Usage
+How to use our pretrained T2T-ViT:
+```
+from models.t2t_vit import *
+from utils import load_for_transfer_learning
+
+# create model
+model = t2t_vit_14()
+
+# load the pretrained weights
+# change num_classes based on your dataset; this also works for other image sizes,
+# as the position embedding is interpolated to fit the new resolution
+load_for_transfer_learning(model, '/path/to/pretrained/weights', use_ema=True, strict=False, num_classes=1000)
+```
+
+
+## 3. Validation
+
+Test the T2T-ViT-14 (with Performer in the T2T module):
+
+Download the [T2T-ViT-14](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/81.5_T2T_ViT_14.pth.tar) checkpoint, then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 -b 100 --eval_checkpoint path/to/checkpoint
+```
+The results look like:
+
+```
+Test: [   0/499]  Time: 2.083 (2.083)  Loss:  0.3578 (0.3578)  Acc@1: 96.0000 (96.0000)  Acc@5: 99.0000 (99.0000)
+Test: [  50/499]  Time: 0.166 (0.202)  Loss:  0.5823 (0.6404)  Acc@1: 85.0000 (86.1569)  Acc@5: 99.0000 (97.5098)
+...
+Test: [ 499/499]  Time: 0.272 (0.172)  Loss:  1.3983 (0.8261)  Acc@1: 62.0000 (81.5000)  Acc@5: 93.0000 (95.6660)
+Top-1 accuracy of the model is: 81.5%
+
+```
+
+Test the three lite variants T2T-ViT-7, T2T-ViT-10, and T2T-ViT-12 (with Performer in the T2T module):
+
+Download the [T2T-ViT-7](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/71.7_T2T_ViT_7.pth.tar), [T2T-ViT-10](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/75.2_T2T_ViT_10.pth.tar) or [T2T-ViT-12](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/76.5_T2T_ViT_12.pth.tar) checkpoint, then test it by running:
+
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_7 -b 100 --eval_checkpoint path/to/checkpoint
+```
+
+Test the model T2T-ViT-14, 384 with 83.3% top-1 accuracy:
+```
+CUDA_VISIBLE_DEVICES=0 python main.py path/to/data --model t2t_vit_14 --img-size 384 -b 100 --eval_checkpoint path/to/T2T-ViT-14-384
+```
+
+
+## 4. Train
+
+Train the three lite variants T2T-ViT-7, T2T-ViT-10 and T2T-ViT-12 (with Performer in the T2T module):
+
+If only 4 GPUs are available:
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_7 -b 128 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+The top-1 accuracy with 4 GPUs is slightly lower than with 8 GPUs (by around 0.1%-0.3%).
+
+If 8 GPUs are available:
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_7 -b 64 --lr 1e-3 --weight-decay .03 --amp --img-size 224
+```
+
+
+Train T2T-ViT-14 and T2T-ViT_t-14 (run on 4 or 8 GPUs):
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 ./distributed_train.sh 4 path/to/data --model t2t_vit_14 -b 128 --lr 1e-3 --weight-decay .05 --amp --img-size 224
+```
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_14 -b 64 --lr 5e-4 --weight-decay .05 --amp --img-size 224
+```
+If you want to train our T2T-ViT on images at 384x384 resolution, use '--img-size 384'.
+
+
+Train T2T-ViT-19, T2T-ViT-24 or T2T-ViT_t-19, T2T-ViT_t-24:
+
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./distributed_train.sh 8 path/to/data --model t2t_vit_19 -b 64 --lr 5e-4 --weight-decay .065 --amp --img-size 224
+```
+
+## 5. Transfer T2T-ViT to CIFAR10/CIFAR100
+
+| Model | ImageNet | CIFAR10 | CIFAR100 | #params |
+| :--- | :---: | :---: | :---: | :---: |
+| T2T-ViT-14 | 81.5 | [98.3](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_14_98.3.pth) | [88.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cirfar100_t2t-vit-14_88.4.pth) | 21.5M |
+| T2T-ViT-19 | 81.9 | [98.4](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar10_t2t-vit_19_98.4.pth) | [89.0](https://github.com/yitu-opensource/T2T-ViT/releases/download/main/cifar100_t2t-vit-19_89.0.pth) | 39.2M |
+
+We resize CIFAR10/100 images to 224x224 and fine-tune our pretrained T2T-ViT-14/19 on CIFAR10/100 by running:
+
+```
+CUDA_VISIBLE_DEVICES=0,1 python transfer_learning.py --lr 0.05 --b 64 --num-classes 10 --img-size 224 --transfer-learning True --transfer-model /path/to/pretrained/T2T-ViT-19
+```
+
+## 6. Visualization
+
+To visualize the image features of ResNet50, open and run the [visualization_resnet.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_resnet.ipynb) file in Jupyter Notebook or JupyterLab; some results are shown below:
+
+*(ResNet50 feature visualization images omitted)*
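+
+Since those images are not reproduced here, the following is a minimal sketch of the kind of feature-map visualization the notebook performs (it assumes torchvision's pretrained resnet50 and an arbitrary `example.jpg`; it is not code from this repo):
+
+```
+import torch
+from PIL import Image
+from torchvision import models, transforms
+import matplotlib.pyplot as plt
+
+# pretrained ResNet50 in eval mode
+resnet = models.resnet50(pretrained=True).eval()
+
+preprocess = transforms.Compose([
+    transforms.Resize(256),
+    transforms.CenterCrop(224),
+    transforms.ToTensor(),
+])
+img = preprocess(Image.open('example.jpg').convert('RGB')).unsqueeze(0)
+
+# capture the feature maps produced by the first residual stage
+feats = {}
+resnet.layer1.register_forward_hook(lambda m, i, o: feats.update(layer1=o.detach()))
+with torch.no_grad():
+    resnet(img)
+
+# show the first 16 channels as heat maps
+fmap = feats['layer1'][0]
+fig, axes = plt.subplots(4, 4, figsize=(8, 8))
+for ax, channel in zip(axes.flat, fmap[:16]):
+    ax.imshow(channel, cmap='viridis')
+    ax.axis('off')
+plt.show()
+```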
+
+To visualize the image features of ViT, open and run the [visualization_vit.ipynb](https://github.com/yitu-opensource/T2T-ViT/blob/main/visualization_vit.ipynb) file in Jupyter Notebook or JupyterLab; some results are shown below:
+
+*(ViT feature visualization images omitted)*
+
+To visualize attention maps, refer to this [file](https://github.com/jeonsworld/ViT-pytorch/blob/main/visualize_attention_map.ipynb). A simple example that visualizes the attention maps in attention blocks 4 and 5:
+
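+As a rough illustration of what such an example looks like (a minimal sketch: it assumes a timm-style ViT in which each block's `attn.attn_drop` is applied right after the softmax, so a forward hook there sees the post-softmax attention; block indices and names are illustrative):
+
+```
+import torch
+
+attn_maps = []
+
+def grab_attention(module, inputs, output):
+    # output is the (batch, heads, tokens, tokens) attention matrix of this block
+    attn_maps.append(output.detach())
+
+# hook attention blocks 4 and 5
+handles = [model.blocks[i].attn.attn_drop.register_forward_hook(grab_attention)
+           for i in (4, 5)]
+
+with torch.no_grad():
+    model(img)  # img: a preprocessed (1, 3, 224, 224) input
+
+for h in handles:
+    h.remove()
+
+# average over heads and look at how the class token attends to the image patches
+cls_attn = attn_maps[0][0].mean(0)[0, 1:]  # (num_patches,)
+attn_grid = cls_attn.reshape(14, 14)       # 14x14 patch grid for a 224x224 input
+```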
+
+*(attention map visualization images omitted)*
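+
+Relatedly, the "works for different image sizes" note in the Usage section above relies on resizing the pretrained position embedding to the new token grid. A minimal sketch of that idea (it assumes a `[cls]`-token-plus-square-grid layout; the helper name is ours, not an API of this repo):
+
+```
+import torch
+import torch.nn.functional as F
+
+def interpolate_pos_embed(pos_embed, new_grid):
+    """Resize a (1, 1 + H*W, C) position embedding to a new (H', W') grid."""
+    cls_tok, grid_tok = pos_embed[:, :1], pos_embed[:, 1:]
+    n, c = grid_tok.shape[1], grid_tok.shape[2]
+    side = int(n ** 0.5)
+    grid_tok = grid_tok.reshape(1, side, side, c).permute(0, 3, 1, 2)
+    grid_tok = F.interpolate(grid_tok, size=new_grid, mode='bicubic', align_corners=False)
+    grid_tok = grid_tok.permute(0, 2, 3, 1).reshape(1, new_grid[0] * new_grid[1], c)
+    return torch.cat([cls_tok, grid_tok], dim=1)
+
+# e.g. adapt a 14x14 grid (224x224 input) to a 24x24 grid (384x384 input):
+# model.pos_embed = torch.nn.Parameter(interpolate_pos_embed(model.pos_embed, (24, 24)))
+```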
+ + diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh new file mode 100644 index 0000000000..46ce9a02ec --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/docker_start.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +docker_image=$1 +data_dir=$2 +model_dir=$3 + +docker run -it --ipc=host \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm --device=/dev/hisi_hdc \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \ + -v ${model_dir}:${model_dir} \ + -v ${data_dir}:${data_dir} \ + -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \ + -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \ + -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \ + /bin/bash \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py new file mode 100644 index 0000000000..1b3ceea782 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py @@ -0,0 +1,912 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""
+T2T-ViT training and evaluating script
+This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/)
+It was started from an early version of the PyTorch ImageNet example
+(https://github.com/pytorch/examples/tree/master/imagenet)
+"""
+import argparse
+import time
+import yaml
+import os
+import logging
+from collections import OrderedDict, defaultdict  # defaultdict is needed by Lookahead below
+from contextlib import suppress
+from datetime import datetime
+import models
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torchvision.utils
+from torch.optim.optimizer import Optimizer
+from torch.nn.parallel import DistributedDataParallel as NativeDDP
+
+from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model
+from timm.utils import *
+from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
+from timm.optim import create_optimizer
+from timm.scheduler import create_scheduler
+from timm.utils import ApexScaler, NativeScaler
+
+from data.myloader import create_loader  # shadows timm's create_loader imported above
+from npu_fused_adamw import NpuFusedAdamW
+
+torch.backends.cudnn.benchmark = True
+_logger = logging.getLogger('train')
+
+# The first arg parser parses out only the --config argument, this argument is used to
+# load a yaml file containing key-values that override the defaults for the main parser below
+config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False)
+parser.add_argument('-c', '--config', default='', type=str, metavar='FILE',
+                    help='YAML config file specifying default arguments')
+
+parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating')
+
+# Dataset / Model parameters
+parser.add_argument('data', metavar='DIR',
+                    help='path to dataset')
+parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL',
+                    help='Name of model to train (default: "T2t_vit_14")')
+parser.add_argument('--pretrained', action='store_true', default=False,
+                    help='Start with pretrained version of specified network (if avail)')
+parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH',
+                    help='Initialize model from this checkpoint (default: none)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='Resume full model and optimizer state from checkpoint (default: none)')
+parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH',
+                    help='path to eval checkpoint (default: none)')
+parser.add_argument('--no-resume-opt', action='store_true', default=False,
+                    help='prevent resume of optimizer state when resuming model')
+parser.add_argument('--num-classes', type=int, default=1000, metavar='N',
+                    help='number of label classes (default: 1000)')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
+parser.add_argument('--img-size', type=int, default=224, metavar='N',
+                    help='Image patch size (default: 224)')
+parser.add_argument('--crop-pct', default=None, type=float,
+                    metavar='N', help='Input image center crop percent (for validation only)')
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
+                    help='Override mean pixel value of dataset')
+parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD',
+                    help='Override std deviation of dataset')
+parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
+                    help='Image resize interpolation type (overrides model)')
+parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
+                    help='input batch size for training (default: 64)')
+parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
+                    help='ratio of validation batch size to training batch size (default: 1)')
+
+# Optimizer parameters
+parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
+                    help='Optimizer (default: "adamw")')
+parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON',
+                    help='Optimizer Epsilon (default: None, use opt default)')
+parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
+                    help='Optimizer Betas (default: None, use opt default)')
+parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
+                    help='Optimizer momentum (default: 0.9)')
+parser.add_argument('--weight-decay', type=float, default=0.05,
+                    help='weight decay (default: 0.05)')
+parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
+                    help='Clip gradient norm (default: None, no clipping)')
+
+# Learning rate schedule parameters
+parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
+                    help='LR scheduler (default: "cosine")')
+parser.add_argument('--lr', type=float, default=5e-4, metavar='LR',
+                    help='learning rate (default: 5e-4)')
+parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
+                    help='learning rate noise on/off epoch percentages')
+parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
+                    help='learning rate noise limit percent (default: 0.67)')
+parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
+                    help='learning rate noise std-dev (default: 1.0)')
+parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
+                    help='learning rate cycle len multiplier (default: 1.0)')
+parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
+                    help='learning rate cycle limit')
+parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR',
+                    help='warmup learning rate (default: 1e-6)')
+parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',
+                    help='lower lr bound for cyclic schedulers that hit 0 (1e-5)')
+parser.add_argument('--epochs', type=int, default=300, metavar='N',
+                    help='number of epochs to train (default: 300)')
+parser.add_argument('--start-epoch', default=None, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
+                    help='epoch interval to decay LR')
+parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N',
+                    help='epochs to warmup LR, if scheduler supports')
+parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
+                    help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
+parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
+                    help='patience epochs for Plateau LR scheduler (default: 10)')
+parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
+                    help='LR decay rate (default: 0.1)')
+
+# Augmentation & regularization parameters
+parser.add_argument('--no-aug', action='store_true', default=False,
+                    help='Disable all training augmentation, override other train aug args')
+parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT',
+                    help='Random resize scale (default: 0.08 1.0)')
+parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO',
+                    help='Random resize aspect ratio (default: 0.75 1.33)')
+parser.add_argument('--hflip', type=float, default=0.5,
+                    help='Horizontal flip training aug probability')
+parser.add_argument('--vflip', type=float, default=0.,
+                    help='Vertical flip training aug probability')
+parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT',
+                    help='Color jitter factor (default: 0.4)')
+parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
+                    help='Use AutoAugment policy, e.g. "v0" or "original" (default: "rand-m9-mstd0.5-inc1")')
+parser.add_argument('--aug-splits', type=int, default=0,
+                    help='Number of augmentation splits (default: 0, valid: 0 or >=2)')
+parser.add_argument('--jsd', action='store_true', default=False,
+                    help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
+parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
+                    help='Random erase prob (default: 0.25)')
+parser.add_argument('--remode', type=str, default='pixel',
+                    help='Random erase mode (default: "pixel")')
+parser.add_argument('--recount', type=int, default=1,
+                    help='Random erase count (default: 1)')
+parser.add_argument('--resplit', action='store_true', default=False,
+                    help='Do not random erase first (clean) augmentation split')
+parser.add_argument('--mixup', type=float, default=0.8,
+                    help='mixup alpha, mixup enabled if > 0. (default: 0.8)')
+parser.add_argument('--cutmix', type=float, default=1.0,
+                    help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)')
+parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
+                    help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
+parser.add_argument('--mixup-prob', type=float, default=1.0,
+                    help='Probability of performing mixup or cutmix when either/both is enabled')
+parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
+                    help='Probability of switching to cutmix when both mixup and cutmix enabled')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+                    help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
+parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N',
+                    help='Turn off mixup after this epoch, disabled if 0 (default: 0)')
+parser.add_argument('--smoothing', type=float, default=0.1,
+                    help='Label smoothing (default: 0.1)')
+parser.add_argument('--train-interpolation', type=str, default='random',
+                    help='Training interpolation (random, bilinear, bicubic; default: "random")')
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
+                    help='Dropout rate (default: 0.0)')
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
+                    help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
+parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT',
+                    help='Drop path rate (default: 0.1)')
+parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
+                    help='Drop block rate (default: None)')
+
+# Batch norm parameters (only works with gen_efficientnet based models currently)
+parser.add_argument('--bn-tf', action='store_true', default=False,
+                    help='Use Tensorflow BatchNorm defaults for models that support it (default: False)')
+parser.add_argument('--bn-momentum', type=float, default=None,
+                    help='BatchNorm momentum override (if not None)')
+parser.add_argument('--bn-eps', type=float, default=None,
+                    help='BatchNorm epsilon override (if not None)')
+parser.add_argument('--sync-bn', action='store_true',
+                    help='Enable NVIDIA Apex or Torch synchronized BatchNorm.')
+parser.add_argument('--dist-bn', type=str, default='',
+                    help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")')
+parser.add_argument('--split-bn', action='store_true',
+                    help='Enable separate BN layers per augmentation split.')
+
+# Model Exponential Moving Average
+parser.add_argument('--model-ema', action='store_true', default=True,
+                    help='Enable tracking moving average of model weights')
+parser.add_argument('--model-ema-force-cpu', action='store_true', default=False,
+                    help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.')
+parser.add_argument('--model-ema-decay', type=float, default=0.99996,
+                    help='decay factor for model weights moving average (default: 0.99996)')
+
+# Misc
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+                    help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=50, metavar='N',
+                    help='how many batches to wait before logging training status')
+parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
+                    help='how many batches to wait before writing recovery checkpoint')
+parser.add_argument('-j', '--workers', type=int, default=8, metavar='N',
+                    help='how many training processes to use; also sets the world size for distributed training (default: 8)')
+parser.add_argument('--num-gpu', type=int, default=1,
+                    help='Number of GPUs to use')
+parser.add_argument('--save-images', action='store_true', default=False,
+                    help='save images of input batches every log interval for debugging')
+parser.add_argument('--amp', action='store_true', default=False,
+                    help='use NVIDIA Apex AMP or Native AMP for mixed precision training')
+parser.add_argument('--apex-amp', action='store_true', default=False,
+                    help='Use NVIDIA Apex AMP mixed precision')
+parser.add_argument('--native-amp', action='store_true', default=False,
+                    help='Use Native Torch AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+parser.add_argument('--pin-mem', action='store_true', default=False,
+                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
+parser.add_argument('--no-prefetcher', action='store_true', default=False,
+                    help='disable fast prefetcher')
+parser.add_argument('--output', default='', type=str, metavar='PATH',
+                    help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "top1")')
+parser.add_argument('--tta', type=int, default=0, metavar='N',
+                    help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)')
+parser.add_argument("--local_rank", default=0, type=int)
+parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False,
+                    help='use the multi-epochs-loader to save time at the beginning of every epoch')
+
+parser.add_argument("--addr", default="127.0.0.1", type=str)
+parser.add_argument("--performance", action='store_true', default=False,
+                    help='measure training performance only (train a single epoch)')
+
+has_apex = True
+
+import apex
+from apex import amp
+from apex.parallel import DistributedDataParallel as ApexDDP
+from apex.parallel import convert_syncbn_model
+
+def optimizer_kwargs(cfg):
+    """ cfg/argparse to kwargs helper
+    Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn.
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + +def add_weight_decay(model, weight_decay=1e-5, skip_list=()): + """Add weight decay + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + no_decay.append(param) + else: + decay.append(param) + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + +class Lookahead(Optimizer): + def __init__(self, base_optimizer, alpha=0.5, k=6): + # NOTE super().__init__() not called on purpose + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) + self._base_optimizer = base_optimizer + self.param_groups = base_optimizer.param_groups + self.defaults = base_optimizer.defaults + self.defaults.update(defaults) + self.state = defaultdict(dict) + # manually add our defaults to the param groups + for name, default in defaults.items(): + for group in self._base_optimizer.param_groups: + group.setdefault(name, default) + + @torch.no_grad() + def update_slow(self, group): + for fast_p in group["params"]: + if fast_p.grad is None: + continue + param_state = self._base_optimizer.state[fast_p] + if 'lookahead_slow_buff' not in param_state: + param_state['lookahead_slow_buff'] = torch.empty_like(fast_p) + param_state['lookahead_slow_buff'].copy_(fast_p) + slow = param_state['lookahead_slow_buff'] + slow.add_(fast_p - slow, alpha=group['lookahead_alpha']) + fast_p.copy_(slow) + + def sync_lookahead(self): + for group in self._base_optimizer.param_groups: + self.update_slow(group) + + @torch.no_grad() + def step(self, closure=None): + loss = self._base_optimizer.step(closure) + for group in self._base_optimizer.param_groups: + group['lookahead_step'] += 1 + if group['lookahead_step'] % group['lookahead_k'] == 0: + self.update_slow(group) + return loss + + def state_dict(self): + return self._base_optimizer.state_dict() + + def load_state_dict(self, state_dict): + self._base_optimizer.load_state_dict(state_dict) + self.param_groups = self._base_optimizer.param_groups + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + **kwargs): + """ Create an optimizer. + Only support npu fused AdamW and npu fused SGD + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + if weight_decay and filter_bias_and_bn: + skip = {} + if hasattr(model_or_params, 'no_weight_decay'): + skip = model_or_params.no_weight_decay() + parameters = add_weight_decay(model_or_params, weight_decay, skip) + weight_decay = 0. 
+        else:
+            parameters = model_or_params.parameters()
+    else:
+        # iterable of parameters or param groups passed in
+        parameters = model_or_params
+
+    opt_lower = opt.lower()
+    opt_split = opt_lower.split('_')
+    opt_lower = opt_split[-1]
+    # if 'fused' in opt_lower:
+    #     assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'
+
+    opt_args = dict(weight_decay=weight_decay, **kwargs)
+    if lr is not None:
+        opt_args.setdefault('lr', lr)
+
+    # basic SGD & related
+    if opt_lower == 'sgd' or opt_lower == 'nesterov':
+        # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons
+        opt_args.pop('eps', None)
+        # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+        optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args)
+    elif opt_lower == 'momentum':
+        opt_args.pop('eps', None)
+        # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+        optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args)
+    elif opt_lower == 'adamw':
+        # optimizer = optim.AdamW(parameters, **opt_args)
+        optimizer = NpuFusedAdamW(parameters, **opt_args)
+    else:
+        raise ValueError('Invalid optimizer: ' + opt_lower)
+
+    if len(opt_split) > 1:
+        if opt_split[0] == 'lookahead':
+            optimizer = Lookahead(optimizer)
+
+    return optimizer
+
+
+
+def _parse_args():
+    # Do we have a config file to parse?
+    args_config, remaining = config_parser.parse_known_args()
+    if args_config.config:
+        with open(args_config.config, 'r') as f:
+            cfg = yaml.safe_load(f)
+            parser.set_defaults(**cfg)
+
+    # The main arg parser parses the rest of the args, the usual
+    # defaults will have been overridden if config file specified.
+    args = parser.parse_args(remaining)
+
+    # Cache the args as a text string to save them in the output dir later
+    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)
+    return args, args_text
+
+
+def main():
+    setup_default_logging()
+    args, args_text = _parse_args()
+
+    os.environ['MASTER_ADDR'] = args.addr  # ip or '127.0.0.1'
+    os.environ['MASTER_PORT'] = '29500'  # any free port (valid TCP ports are below 65536)
+
+    args.prefetcher = not args.no_prefetcher
+    args.distributed = (args.workers > 1)
+
+    torch.npu.set_device(args.local_rank)
+    args.world_size = 1
+    args.rank = args.local_rank  # global rank
+    if args.distributed:
+        torch.npu.set_device(args.local_rank)
+        args.world_size = args.workers
+        torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size)
+        args.world_size = torch.distributed.get_world_size()
+    assert args.rank >= 0
+
+    if args.distributed:
+        _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
+                     % (args.rank, args.world_size))
+    else:
+        _logger.info('Training with a single process on %d GPUs.'
% args.num_gpu) + + torch.manual_seed(args.seed + args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + + if args.local_rank == 0 or args.workers == 1: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1)) + + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + use_amp = None + args.apex_amp = True + use_amp = 'apex' + + model.npu() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + optimizer = create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=True, + ) + # optimizer = create_optimizer(args, model) + + amp_autocast = suppress # do nothing + loss_scaler = None + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True) + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume=args.resume) + + if args.distributed: + if args.sync_bn: + assert not args.split_bn + try: + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + except Exception as e: + _logger.error('Failed to enable Synchronized BatchNorm. 
Install Apex or Torch >= 1.1') + + model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.performance: + num_epochs = 1 + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + train_dir = os.path.join(args.data, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + dataset_train = Dataset(train_dir) + + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + # re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + eval_dir = os.path.join(args.data, 'val') + if not os.path.isdir(eval_dir): + eval_dir = os.path.join(args.data, 'validation') + if not os.path.isdir(eval_dir): + _logger.error('Validation folder does not exist at: {}'.format(eval_dir)) + exit(1) + dataset_eval = Dataset(eval_dir) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + if args.jsd: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu() + elif mixup_active: + # smoothing is handled with mixup target transform + train_loss_fn = SoftTargetCrossEntropy().npu() + elif args.smoothing: + train_loss_fn = 
LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu() + else: + train_loss_fn = nn.CrossEntropyLoss().npu() + validate_loss_fn = nn.CrossEntropyLoss().npu() + + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + + if args.eval_checkpoint: # evaluate the model + load_checkpoint(model, args.eval_checkpoint, args.model_ema) + val_metrics = validate(model, loader_eval, validate_loss_fn, args) + print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%") + return + + saver = None + output_dir = '' + if args.local_rank == 0: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: # train the model + for epoch in range(start_epoch, num_epochs): + if args.distributed: + loader_train.sampler.set_epoch(epoch) + + + train_metrics = train_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0 or args.workers == 1: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * 
len(loader) + epoch_fps = [] + prof_list = [] + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.npu(), target.npu() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + if batch_idx in prof_list: + with torch.autograd.profiler.profile(use_npu=True) as prof: + output = model(input) + loss = loss_fn(output, target) + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4))) + sys.exit() + + else: + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + + torch.npu.synchronize() + if model_ema is not None: + model_ema.update(model) + num_updates += 1 + + batch_time_m.update(time.time() - end) + + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0 or args.workers == 1: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. 
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + epoch_fps.append(input.shape[0] * args.workers / (time.time() - end)) + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:]))) + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.npu() + target = target.npu() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py new file mode 100644 index 0000000000..401aa8e586 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/metrics.py @@ -0,0 +1,41 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class AverageMeter:
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def t2taccuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    maxk = max(topk)
+    batch_size = target.size(0)
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
+    # normalize the correct-prediction count by the batch size so each entry is a percentage
+    return [correct[:k].reshape(-1).float().sum(0) * (100. / batch_size) for k in topk]
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
new file mode 100644
index 0000000000..0b49b4fb26
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/modelzoo_level.txt
@@ -0,0 +1,3 @@
+FuncStatus:OK
+PerfStatus:OK
+PrecisionStatus:OK
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
new file mode 100644
index 0000000000..a2f9cf0db1
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/npu_fused_adamw.py
@@ -0,0 +1,255 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections import defaultdict
+
+import torch
+from torch.optim.optimizer import Optimizer
+
+from apex.contrib.combine_tensors import combine_npu
+
+
+class NpuFusedAdamW(Optimizer):
+    """Implements AdamW algorithm.
+
+    Currently NPU-only. Requires Apex to be installed via
+    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--npu_float_status" ./``.
+
+    This version of NPU fused AdamW implements one fusion.
+
+    * A combine-tensor apply launch that batches the elementwise updates applied to all the model's parameters
+      into one or a few kernel launches.
+
+    :class:`apex.optimizers.NpuFusedAdamW` may be used as a drop-in replacement for ``torch.optim.AdamW``::
+
+        opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+        ...
+        opt.step()
+
+    :class:`apex.optimizers.NpuFusedAdamW` should be used with Amp. Currently, if you wish to use :class:`NpuFusedAdamW`
+    with Amp, only ``opt_level O1 and O2`` can be chosen::
+
+        opt = apex.optimizers.NpuFusedAdamW(model.parameters(), lr = ....)
+        model, opt = amp.initialize(model, opt, opt_level="O2")
+        ...
+ opt.step() + + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional, default: 1e-3): learning rate + betas (Tuple[float, float], optional, default: (0.9, 0.999)): coefficients used + for computing running averages of gradient and its square + eps (float, optional, default: 1e-8): term added to the denominator to improve + numerical stability + weight_decay (float, optional, default: 1e-2): weight decay coefficient + amsgrad (boolean, optional, default: False): whether to use the AMSGrad variant of + this algorithm from the paper `On the Convergence of Adam and Beyond`_ + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if eps < 0.0: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if betas[0] < 0.0 or betas[0] >= 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if betas[1] < 0.0 or betas[1] >= 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + self.is_npu_fused_optimizer = True + super(NpuFusedAdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(NpuFusedAdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def _init_param_state(self, p, amsgrad): + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + else: + exp_avg_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_tmp.copy_(state['exp_avg']) + state['exp_avg'] = exp_avg_tmp + + exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + exp_avg_sq_tmp.copy_(state['exp_avg_sq']) + state['exp_avg_sq'] = exp_avg_sq_tmp + + if amsgrad: + max_exp_avg_sq_tmp = torch.zeros_like(p, memory_format=torch.preserve_format) + max_exp_avg_sq_tmp.copy_(state['max_exp_avg_sq']) + state['max_exp_avg_sq'] = max_exp_avg_sq_tmp + + def _combine_group_param_states(self, group_index): + group = self.param_groups[group_index] + stash = self._amp_stash + group_params_list = stash.params_lists_indexed_by_group[group_index] + + amsgrad = group['amsgrad'] + + combined_param_states = [] + for params in group_params_list: + step_list = [] + exp_avg_list = [] + exp_avg_sq_list = [] + max_exp_avg_sq_list = [] + + for p in params: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdamW does not support sparse gradients, ' + 'please consider SparseAdam instead') + + self._init_param_state(p, amsgrad) + state = self.state[p] + step_list.append(state['step']) + exp_avg_list.append(state['exp_avg']) + exp_avg_sq_list.append(state['exp_avg_sq']) + if amsgrad: + max_exp_avg_sq_list.append(state['max_exp_avg_sq']) + + combined_step = 0 + combined_exp_avg = None + combined_exp_avg_sq = None + combined_max_exp_avg_sq = None + + if len(exp_avg_list) > 0: + combined_step = step_list[0] + combined_exp_avg = combine_npu(exp_avg_list) + combined_exp_avg_sq = combine_npu(exp_avg_sq_list) + combined_max_exp_avg_sq = combine_npu(max_exp_avg_sq_list) + + combined_state = defaultdict(dict) + combined_state['step'] = combined_step + combined_state['exp_avg'] = combined_exp_avg + combined_state['exp_avg_sq'] = combined_exp_avg_sq + combined_state['max_exp_avg_sq'] = combined_max_exp_avg_sq + combined_param_states.append(combined_state) + stash.combined_param_states_indexed_by_group[group_index] = combined_param_states + + def _combine_param_states_by_group(self): + stash = self._amp_stash + if stash.param_states_are_combined_by_group: + return + + stash.combined_param_states_indexed_by_group = [] + for _ in self.param_groups: + stash.combined_param_states_indexed_by_group.append([]) + + for i, _ in enumerate(self.param_groups): + self._combine_group_param_states(i) + stash.param_states_are_combined_by_group = True + + def _group_step(self, group_index): + group = self.param_groups[group_index] + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad + if grad.is_sparse: + raise RuntimeError('NpuFusedAdamW does not support sparse gradients, ' + 'please consider SparseAdam instead') + state_p = self.state[p] + state_p['step'] += 1 + + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + + stash = self._amp_stash + combined_group_params = stash.combined_params_indexed_by_group[group_index] + combined_group_grads = stash.combined_grads_indexed_by_group[group_index] + combined_group_param_states = stash.combined_param_states_indexed_by_group[group_index] + + for combined_param, combined_grad, combined_param_state in zip(combined_group_params, + combined_group_grads, + combined_group_param_states): + if combined_param is None or combined_grad is None: + continue + + # Perform stepweight decay. 
The fused method is used here to speed up the calculation + combined_param.mul_(1 - group['lr'] * group['weight_decay']) + + exp_avg, exp_avg_sq = combined_param_state['exp_avg'], combined_param_state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = combined_param_state['max_exp_avg_sq'] + + combined_param_state['step'] += 1 + bias_correction1 = 1 - beta1 ** combined_param_state['step'] + bias_correction2 = 1 - beta2 ** combined_param_state['step'] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(combined_grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(combined_grad, combined_grad, value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + step_size = group['lr'] / bias_correction1 + + combined_param.addcdiv_(exp_avg, denom, value=-step_size) + + @torch.no_grad() + def step(self, closure=None): + if not hasattr(self, "_amp_stash"): + raise RuntimeError('apex.optimizers.NpuFusedAdamW should be used with AMP.') + + self._check_already_combined_params_and_grads() + # combine params and grads first + self._combine_params_and_grads_by_group() + # then combine param states + self._combine_param_states_by_group() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for i, _ in enumerate(self.param_groups): + self._group_step(i) + + return loss diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt new file mode 100644 index 0000000000..f6f0f6fd77 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/requirements.txt @@ -0,0 +1,4 @@ +torch==1.5.0 +apex==0.1 +torchvision==0.6.0 +timm==0.3.4 \ No newline at end of file -- Gitee From db61d27560e84f08317d292f129ad058b8e42848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:47:54 +0000 Subject: [PATCH 06/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/env_npu.sh | 68 +++++++++ .../T2T-ViT/test/train_full_1p.sh | 129 +++++++++++++++++ .../T2T-ViT/test/train_full_8p.sh | 130 +++++++++++++++++ .../T2T-ViT/test/train_performance_1p.sh | 131 ++++++++++++++++++ .../T2T-ViT/test/train_performance_8p.sh | 130 +++++++++++++++++ 5 files changed, 588 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh new file mode 100644 index 0000000000..bd4205d15d --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/env_npu.sh @@ -0,0 +1,68 @@ +#!/bin/bash +CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' + +if [ -f $CANN_INSTALL_PATH_CONF ]; then + 
CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+# Output host logs to the serial port: 0 - off / 1 - on
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+# Default log level: 0 - debug / 1 - info / 2 - warning / 3 - error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+# Event log flag: 0 - off / 1 - on
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+# Enable the task queue: 0 - off / 1 - on
+export TASK_QUEUE_ENABLE=1
+# Enable PTCopy: 0 - off / 1 - on
+export PTCOPY_ENABLE=1
+# Enable the combined flag: 0 - off / 1 - on
+export COMBINED_ENABLE=1
+# Whether special scenarios need recompilation; no need to modify
+export TRI_COMBINED_ENABLE=1
+# Whether special scenarios need recompilation; no need to modify
+export DYNAMIC_OP="ADD#MUL"
+# HCCL whitelist switch: 1 - off / 0 - on
+export HCCL_WHITELIST_DISABLE=1
+
+# Set the device-side log level to error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+# Disable device-side Event logs
+msnpureport -e disable
+
+ulimit -SHn 512000
+
+path_lib=$(python3.7 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
+
+echo ${path_lib}
+
+export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
new file mode 100644
index 0000000000..09dd270d72
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    -j 1 \
+    --amp \
+    --img-size 224 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
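+
+# Usage sketch, assuming an ImageNet-style layout with ${data_path}/train and
+# ${data_path}/val: run from the model root (or from test/), e.g. with a
+# placeholder dataset path:
+#   bash test/train_full_1p.sh --data_path=/path/to/imagenet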
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
new file mode 100644
index 0000000000..d7ef995ea9
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    --amp \
+    --img-size 224 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
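+# Illustration of the extraction above, with made-up numbers: a log line such as
+#   Train: 0 [ 100/2502 (  4%)]  Loss:  6.912345 (6.9876)  ...
+# is split on "Loss:" and the first field of the remainder, 6.912345, is kept.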
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
new file mode 100644
index 0000000000..6c1828793e
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=1
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+else
+    mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    -j 1 \
+    --amp \
+    --img-size 224 \
+    --epochs 1 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
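+
+# Usage sketch: this performance variant runs a single epoch (--epochs 1) to
+# gauge throughput rather than accuracy; it takes the same arguments as the
+# full-training script, e.g. with a placeholder dataset path:
+#   bash test/train_performance_1p.sh --data_path=/path/to/imagenet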
diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
new file mode 100644
index 0000000000..f024d6cc71
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and adapt per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="T2T-ViT"
+# Training batch_size
+batch_size=64
+# Number of NPUs used for training
+export RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs
+train_epochs=310
+
+
+# Parameter check: data_path is required; adding or removing other parameters is up to
+# the model itself, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+
+############### Specify the execution path of the training script ###############
+# cd to the directory at the same level as the test folder for better compatibility;
+# test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+echo ${cur_path}
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+# Variables
+export SPACH_DATASETS=${data_path}
+export PYTHONPATH=./:$PYTHONPATH
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+    #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning'
+fi
+
+nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \
+    ${data_path} \
+    --model t2t_vit_14 \
+    --batch-size 64 \
+    --lr 5e-4 \
+    --weight-decay .05 \
+    --amp \
+    --img-size 224 \
+    --epochs 5 \
+    --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Output performance in FPS; review and adapt per model
+FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'|awk 'END {print}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and adapt per model
+train_accuracy=`grep -a ".pth.tar'," ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $2}'|awk -F ")" '{print $1}'|awk 'BEGIN {max = 0} {if ($1+0 > max+0) max=$1} END {print max}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+## Collect performance data; no need to modify
+# Throughput
+AvgFPS=${FPS}
+# Training time per iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Train: ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v Test|awk -F "Loss:" '{print $NF}'|awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
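+# Worked example for the TrainingTime formula above, with illustrative numbers:
+# batch_size=64 and FPS=512 give TrainingTime = 64*1000/512 = 125.00 ms per iteration.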
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
--
Gitee

From c28a1f6efe660b5d0a09382261a5a16fa6c357b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:00 +0000
Subject: [PATCH 07/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?=
 =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/test/.keep?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/test/.keep

diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/test/.keep
deleted file mode 100644
index e69de29bb2..0000000000
--
Gitee

From 6113ed35721c8283025840060d8f2ed8ee5f6255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com>
Date: Tue, 20 Dec 2022 13:48:15 +0000
Subject: [PATCH 08/15] update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 王旭 <1149693659@qq.com>
---
 .../classification/T2T-ViT/models/__init__.py |  17 +
 .../classification/T2T-ViT/models/t2t_vit.py  | 304 ++++++
 .../T2T-ViT/models/t2t_vit_dense.py           | 177 +++++
 .../T2T-ViT/models/t2t_vit_ghost.py           | 204 ++++++
 .../T2T-ViT/models/t2t_vit_se.py              | 176 +++++
 .../T2T-ViT/models/token_performer.py         |  73 ++
 .../T2T-ViT/models/token_transformer.py       |  68 ++
 .../T2T-ViT/models/transformer_block.py       |  96 +++
 .../cv/classification/T2T-ViT/models/vit.py   | 674 ++++++++++++++++++
 9 files changed, 1789 insertions(+)
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py
 create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py

diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py
b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py new file mode 100644 index 0000000000..007f383c19 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .t2t_vit import * +from .t2t_vit_se import * +from .t2t_vit_dense import * +from .t2t_vit_ghost import * diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py new file mode 100644 index 0000000000..db04ed4ee0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit.py @@ -0,0 +1,304 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +T2T-ViT +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_ +import numpy as np +from .token_transformer import Token_transformer +from .token_performer import Token_performer +from .transformer_block import Block, get_sinusoid_encoding + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), + 'classifier': 'head', + **kwargs + } + +default_cfgs = { + 'T2t_vit_7': _cfg(), + 'T2t_vit_10': _cfg(), + 'T2t_vit_12': _cfg(), + 'T2t_vit_14': _cfg(), + 'T2t_vit_19': _cfg(), + 'T2t_vit_24': _cfg(), + 'T2t_vit_t_14': _cfg(), + 'T2t_vit_t_19': _cfg(), + 'T2t_vit_t_24': _cfg(), + 'T2t_vit_14_resnext': _cfg(), + 'T2t_vit_14_wide': _cfg(), +} + +class T2T_module(nn.Module): + """ + Tokens-to-Token encoding module + """ + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, embed_dim=768, token_dim=64): + super().__init__() + + if tokens_type == 'transformer': + print('adopt transformer encoder for tokens-to-token') + self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + self.attention1 = Token_transformer(dim=in_chans * 7 * 7, in_dim=token_dim, num_heads=1, mlp_ratio=1.0) + self.attention2 = Token_transformer(dim=token_dim * 3 * 3, in_dim=token_dim, num_heads=1, mlp_ratio=1.0) + self.project = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif tokens_type == 'performer': + print('adopt performer encoder for tokens-to-token') + self.soft_split0 = nn.Unfold(kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + self.soft_split1 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + self.soft_split2 = nn.Unfold(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + + #self.attention1 = Token_performer(dim=token_dim, in_dim=in_chans*7*7, kernel_ratio=0.5) + #self.attention2 = Token_performer(dim=token_dim, in_dim=token_dim*3*3, kernel_ratio=0.5) + self.attention1 = Token_performer(dim=in_chans*7*7, in_dim=token_dim, kernel_ratio=0.5) + self.attention2 = Token_performer(dim=token_dim*3*3, in_dim=token_dim, kernel_ratio=0.5) + self.project = nn.Linear(token_dim * 3 * 3, embed_dim) + + elif tokens_type == 'convolution': # just for comparison with conolution, not our model + # for this tokens type, you need change forward as three convolution operation + print('adopt convolution layers for tokens-to-token') + self.soft_split0 = nn.Conv2d(3, token_dim, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) # the 1st convolution + self.soft_split1 = nn.Conv2d(token_dim, token_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 2nd convolution + self.project = nn.Conv2d(token_dim, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) # the 3rd convolution + + self.num_patches = (img_size // (4 * 2 * 2)) * (img_size // (4 * 2 * 2)) # there are 3 sfot split, stride are 4,2,2 seperately + + def forward(self, x): + # step0: soft split + x = self.soft_split0(x).transpose(1, 2) + + # iteration1: re-structurization/reconstruction + x = self.attention1(x) + B, new_HW, C = x.shape + x = x.transpose(1,2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW))) + # iteration1: soft 
split + x = self.soft_split1(x).transpose(1, 2) + + # iteration2: re-structurization/reconstruction + x = self.attention2(x) + B, new_HW, C = x.shape + x = x.transpose(1, 2).reshape(B, C, int(np.sqrt(new_HW)), int(np.sqrt(new_HW))) + # iteration2: soft split + x = self.soft_split2(x).transpose(1, 2) + + # final tokens + x = self.project(x) + + return x + +class T2T_ViT(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, token_dim=64): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim, token_dim=token_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_7(pretrained=False, **kwargs): # adopt performer for tokens to token + if pretrained: + kwargs.setdefault('qk_scale', 256 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=7, num_heads=4, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_7'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +@register_model +def t2t_vit_10(pretrained=False, **kwargs): # adopt performer for tokens to token + if pretrained: + kwargs.setdefault('qk_scale', 256 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=10, num_heads=4, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_10'] + if 
pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_12(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 256 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=256, depth=12, num_heads=4, mlp_ratio=2., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_12']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+
+@register_model
+def t2t_vit_14(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_14']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_19(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 448 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_19']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_24(pretrained=False, **kwargs):  # adopt performer for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 512 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_24']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_14(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_14']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_19(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 448 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_19']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+@register_model
+def t2t_vit_t_24(pretrained=False, **kwargs):  # adopt transformers for tokens to token
+    if pretrained:
+        kwargs.setdefault('qk_scale', 512 ** -0.5)
+    model = T2T_ViT(tokens_type='transformer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
+    model.default_cfg = default_cfgs['T2t_vit_t_24']
+    if pretrained:
+        load_pretrained(
+            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
+    return model
+
+# resnext and wide structure
+@register_model
+def t2t_vit_14_resnext(pretrained=False, **kwargs):
+    if pretrained:
+        kwargs.setdefault('qk_scale', 384 ** -0.5)
+    model = T2T_ViT(tokens_type='performer', embed_dim=384, depth=14, num_heads=32, mlp_ratio=3., **kwargs)
model.default_cfg = default_cfgs['T2t_vit_14_resnext'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model + +@register_model +def t2t_vit_14_wide(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 512 ** -0.5) + model = T2T_ViT(tokens_type='performer', embed_dim=768, depth=4, num_heads=12, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_14_wide'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py new file mode 100644 index 0000000000..5724570c39 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_dense.py @@ -0,0 +1,177 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +T2T-ViT-Dense +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model + +from .transformer_block import Mlp, Block, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + +default_cfgs = { + 't2t_vit_dense': _cfg(), +} + +class Transition(nn.Module): + def __init__(self, in_features, out_features, act_layer=nn.GELU): + super(Transition, self).__init__() + self.act = act_layer() + self.linear = nn.Linear(in_features, out_features) + def forward(self, x): + x = self.linear(x) + x = self.act(x) + + return x + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, growth_rate, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) #, out_features=growth_rate + self.dense_linear = nn.Linear(dim, growth_rate) + + def forward(self, x): + new_x = x + self.drop_path(self.attn(self.norm1(x))) + new_x = new_x + self.drop_path(self.mlp(self.norm2(new_x))) + new_x = self.dense_linear(new_x) + x = torch.cat([x, new_x], 2) # dense connection: concatenate the old features with the new features along the channel dimension + return x + +class T2T_ViT_Dense(nn.Module): + def __init__(self, growth_rate=32, tokens_type='performer', block_config=(3, 4, 6, 3), img_size=224, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList() + + start_dim = embed_dim + for i, num_layers in enumerate(block_config): + for j in range(num_layers): + new_dim = start_dim + j * growth_rate + block = Block(growth_rate=growth_rate, dim=new_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + self.blocks.append(block) + if i != len(block_config)-1: + transition = Transition(new_dim+growth_rate, (new_dim+growth_rate)//2) + self.blocks.append(transition) + start_dim = int((new_dim+growth_rate)//2) + out_dim = new_dim + growth_rate + print(f'end dim:{out_dim}') + self.norm = norm_layer(out_dim) + + # Classifier head + self.head = nn.Linear(out_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_dense(pretrained=False, 
**kwargs): + model = T2T_ViT_Dense(growth_rate=64, block_config=(3, 6, 6, 4), embed_dim=128, num_heads=8, mlp_ratio=2., **kwargs) + model.default_cfg = default_cfgs['t2t_vit_dense'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py new file mode 100644 index 0000000000..217dad772b --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_ghost.py @@ -0,0 +1,204 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +T2T-ViT-Ghost +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model + +from .transformer_block import Block, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + + +default_cfgs = { + 'T2t_vit_16_ghost': _cfg(), +} + +class Mlp_ghost(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, in_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.ratio = hidden_features//in_features + self.cheap_operation2 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False) + self.cheap_operation3 = nn.Conv1d(in_features, in_features, kernel_size=1, groups=in_features, bias=False) + + def forward(self, x): # x: [B, N, C] + x1 = self.fc1(x) # x1: [B, N, C] + x1 = self.act(x1) + + x2 = self.cheap_operation2(x1.transpose(1,2)) # x2: [B, N, C] + x2 = x2.transpose(1,2) + x2 = self.act(x2) + + x3 = self.cheap_operation3(x1.transpose(1, 2)) # x3: [B, N, C] + x3 = x3.transpose(1, 2) + x3 = self.act(x3) + + x = torch.cat((x1, x2, x3), dim=2) # x: [B, N, 3C] + x = self.drop(x) + + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention_ghost(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + half_dim = int(0.5*dim) + self.q = nn.Linear(dim, half_dim, bias=qkv_bias) + self.k = nn.Linear(dim, half_dim, bias=qkv_bias) + self.v = nn.Linear(dim, half_dim, bias=qkv_bias) + + self.cheap_operation_q = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + self.cheap_operation_k = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + self.cheap_operation_v = nn.Conv1d(half_dim, half_dim, kernel_size=1, groups=half_dim, bias=False) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = 
nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + q = self.q(x) + k = self.k(x) + v = self.v(x) + + q1 = self.cheap_operation_q(q.transpose(1,2)).transpose(1,2) + k1 = self.cheap_operation_k(k.transpose(1,2)).transpose(1,2) + v1 = self.cheap_operation_v(v.transpose(1,2)).transpose(1,2) + + q = torch.cat((q, q1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = torch.cat((k, k1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = torch.cat((v, v1), dim=2).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention_ghost( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp_ghost(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class T2T_ViT_Ghost(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = 
nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +@register_model +def t2t_vit_16_ghost(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 384 ** -0.5) + model = T2T_ViT_Ghost(tokens_type='performer', embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_16_ghost'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py new file mode 100644 index 0000000000..b43a86e39e --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/t2t_vit_se.py @@ -0,0 +1,176 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
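+# Usage sketch for the SE layer defined below (illustrative values only; the
+# channel width 384 matches the t2t_vit_14_se variant registered at the end
+# of this file):
+#   se = SELayer(channel=384)
+#   out = se(torch.randn(2, 197, 384))  # shape preserved: (2, 197, 384)
+# The layer squeezes over the token axis and re-weights channels, so it never
+# changes the sequence length or the embedding width.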
+""" +T2T-ViT-SE +""" +import torch +import torch.nn as nn + +from timm.models.helpers import load_pretrained +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model +from .transformer_block import Block, Mlp, get_sinusoid_encoding +from .t2t_vit import T2T_module, _cfg + +default_cfgs = { + 'T2t_vit_14_se': _cfg(), +} + +class SELayer(nn.Module): + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool1d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid() + ) + + def forward(self, x): # x: [B, N, C] + x = torch.transpose(x, 1, 2) # [B, C, N] + b, c, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1) + x = x * y.expand_as(x) + x = torch.transpose(x, 1, 2) # [B, N, C] + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.se_layer = SELayer(dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.se_layer(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class T2T_ViT_SE(nn.Module): + def __init__(self, img_size=224, tokens_type='performer', in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.tokens_to_token = T2T_module( + img_size=img_size, tokens_type=tokens_type, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.tokens_to_token.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim), requires_grad=False) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.tokens_to_token(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +@register_model +def t2t_vit_14_se(pretrained=False, **kwargs): + if pretrained: + kwargs.setdefault('qk_scale', 384 ** -0.5) + model = T2T_ViT_SE(tokens_type='performer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs) + model.default_cfg = default_cfgs['T2t_vit_14_se'] + if pretrained: + load_pretrained( + model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3)) + return model diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py new file mode 100644 index 0000000000..16134300a3 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_performer.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Take Performer as T2T Transformer +""" +import math +import torch +import torch.nn as nn + +class Token_performer(nn.Module): + def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1): + super().__init__() + self.emb = in_dim * head_cnt # head_cnt is 1 here, so this is simply in_dim + self.kqv = nn.Linear(dim, 3 * self.emb) + self.dp = nn.Dropout(dp1) + self.proj = nn.Linear(self.emb, self.emb) + self.head_cnt = head_cnt + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(self.emb) + self.epsilon = 1e-8 # for numerical stability in the division below + + self.mlp = nn.Sequential( + nn.Linear(self.emb, 1 * self.emb), + nn.GELU(), + nn.Linear(1 * self.emb, self.emb), + nn.Dropout(dp2), + ) + + self.m = int(self.emb * kernel_ratio) + self.w = torch.randn(self.m, self.emb) + self.w = nn.Parameter(nn.init.orthogonal_(self.w) * math.sqrt(self.m), requires_grad=False) + + def prm_exp(self, x): + # part of this function is borrowed from https://github.com/lucidrains/performer-pytorch + # and Simo Ryu (https://github.com/cloneofsimo) + # ==== positive random features for gaussian kernels ==== + # x = (B, T, hs) + # w = (m, hs) + # return : (B, T, m) + # SM(x, y) = E_w[exp(w^T x - |x|^2/2) exp(w^T y - |y|^2/2)] + # therefore return exp(w^T x - |x|^2/2)/sqrt(m) + xd = ((x * x).sum(dim=-1, keepdim=True)).repeat(1, 1, self.m) / 2 + wtx = torch.einsum('bti,mi->btm', x.float(), self.w) + + return torch.exp(wtx - xd) / math.sqrt(self.m) + + def single_attn(self, x): + k, q, v = torch.split(self.kqv(x), self.emb, dim=-1) + kp, qp = self.prm_exp(k), self.prm_exp(q) # (B, T, m), (B, T, m) + D = torch.einsum('bti,bi->bt', qp, kp.sum(dim=1)).unsqueeze(dim=2) # (B, T, m) * (B, m) -> (B, T, 1) + kptv = torch.einsum('bin,bim->bnm', v.float(), kp) # (B, emb, m) + y = torch.einsum('bti,bni->btn', qp, kptv) / (D.repeat(1, 1, self.emb) + self.epsilon) # (B, T, emb)/Diag + # skip connection + y = v + self.dp(self.proj(y)) # as in token_transformer in the T2T layer, use v as the skip connection + + return y + + def forward(self, x): + x = self.single_attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + return x + diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py new file mode 100644 index 0000000000..f9133a1d9f --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/token_transformer.py @@ -0,0 +1,68 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
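+# Shape sketch for the Token_transformer defined below (illustrative; the
+# numbers correspond to the first T2T stage at 224x224 input, where the
+# unfolded patch width is 3*7*7 = 147 and token_dim is 64):
+#   blk = Token_transformer(dim=147, in_dim=64, num_heads=1)
+#   y = blk(torch.randn(2, 56 * 56, 147))  # y.shape == (2, 3136, 64)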
+ +""" +Take the standard Transformer as T2T Transformer +""" +import torch.nn as nn +from timm.models.layers import DropPath +from .transformer_block import Mlp + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, in_dim=None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + self.in_dim = in_dim + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, in_dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(in_dim, in_dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.in_dim).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q * self.scale) @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.in_dim) + x = self.proj(x) + x = self.proj_drop(x) + + # skip connection + x = v.squeeze(1) + x # the original x differs in size from the attention output, so use v (already projected to in_dim) as the skip connection; v.squeeze(1) relies on num_heads == 1 + + return x + +class Token_transformer(nn.Module): + + def __init__(self, dim, in_dim, num_heads, mlp_ratio=1., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, in_dim=in_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(in_dim) + self.mlp = Mlp(in_features=in_dim, hidden_features=int(in_dim*mlp_ratio), out_features=in_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = self.attn(self.norm1(x)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py new file mode 100644 index 0000000000..0ba43c1421 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/transformer_block.py @@ -0,0 +1,96 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
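+# Quick reference for the helpers defined below (illustrative): Block is a
+# standard pre-norm transformer encoder layer, and get_sinusoid_encoding
+# builds the fixed positional table used as pos_embed by the T2T-ViT models,
+# e.g. get_sinusoid_encoding(n_position=197, d_hid=384) -> shape (1, 197, 384).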
+""" +Borrow from timm(https://github.com/rwightman/pytorch-image-models) +""" +import torch +import torch.nn as nn +import numpy as np +from timm.models.layers import DropPath + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +def get_sinusoid_encoding(n_position, d_hid): + ''' Sinusoid position encoding table ''' + + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py new file mode 100644 index 0000000000..b263a6f9e5 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/models/vit.py @@ -0,0 +1,674 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The original Vision Transformer (ViT) from timm, copyright belongs to / Copyright 2020 Ross Wightman +""" +import math +import logging +from functools import partial +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.models.helpers import build_model_with_cfg, overlay_external_default_cfg +from timm.models.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ +from timm.models.registry import register_model + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # patch models (my experiments) + 'vit_small_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/vit_small_p16_224-15ec54c9.pth', + ), + + # patch models (weights ported from official Google JAX impl) + 'vit_base_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + ), + 'vit_base_patch32_224': _cfg( + url='', # no official model weights for this combo, only for in21k + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_base_patch16_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_base_patch32_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p32_384-830016f5.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_large_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch32_224': _cfg( + url='', # no official model weights for this combo, only for in21k + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch16_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + 'vit_large_patch32_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth', + input_size=(3, 384, 384), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), crop_pct=1.0), + + # patch models, imagenet21k (weights ported from official Google JAX impl) + 'vit_base_patch16_224_in21k': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch16_224_in21k-e5005f0a.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_base_patch32_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_patch32_224_in21k-8db57226.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch16_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch16_224_in21k-606da67d.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_large_patch32_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + 'vit_huge_patch14_224_in21k': _cfg( + hf_hub='timm/vit_huge_patch14_224_in21k', + num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + + # deit models (FB weights) + 'vit_deit_tiny_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'), + 'vit_deit_small_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'), + 'vit_deit_base_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',), + 'vit_deit_base_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_deit_tiny_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth', + classifier=('head', 'head_dist')), + 'vit_deit_small_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth', + classifier=('head', 'head_dist')), + 'vit_deit_base_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', + classifier=('head', 'head_dist')), + 'vit_deit_base_distilled_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth', + input_size=(3, 384, 384), crop_pct=1.0, classifier=('head', 'head_dist')), + + # ViT ImageNet-21K-P pretraining + 'vit_base_patch16_224_miil_in21k': _cfg( + url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth', + mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=11221, + ), + 'vit_base_patch16_224_miil': _cfg( + url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm' + '/vit_base_patch16_224_1k_miil_84_4.pth', + mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', + ), +} + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use 
tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class VisionTransformer(nn.Module): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, distilled=False, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., embed_layer=PatchEmbed, norm_layer=None, + act_layer=None, weight_init=''): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + weight_init: (str): weight init scheme + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if distilled else None + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth 
decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Representation layer + if representation_size and not distilled: + self.num_features = representation_size + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh()) + ])) + else: + self.pre_logits = nn.Identity() + + # Classifier head(s) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = None + if distilled: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + # Weight init + assert weight_init in ('jax', 'jax_nlhb', 'nlhb', '') + head_bias = -math.log(self.num_classes) if 'nlhb' in weight_init else 0. + trunc_normal_(self.pos_embed, std=.02) + if self.dist_token is not None: + trunc_normal_(self.dist_token, std=.02) + if weight_init.startswith('jax'): + # leave cls token as zeros to match jax impl + for n, m in self.named_modules(): + _init_vit_weights(m, n, head_bias=head_bias, jax_impl=True) + else: + trunc_normal_(self.cls_token, std=.02) + self.apply(_init_vit_weights) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + _init_vit_weights(m) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token', 'dist_token'} + + def get_classifier(self): + if self.dist_token is None: + return self.head + else: + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + if self.num_tokens == 2: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + cls_token = self.cls_token.expand(x.shape[0], -1, -1) # stole cls_tokens impl from Phil Wang, thanks + if self.dist_token is None: + x = torch.cat((cls_token, x), dim=1) + else: + x = torch.cat((cls_token, self.dist_token.expand(x.shape[0], -1, -1), x), dim=1) + x = self.pos_drop(x + self.pos_embed) + x = self.blocks(x) + x = self.norm(x) + if self.dist_token is None: + return self.pre_logits(x[:, 0]) + else: + return x[:, 0], x[:, 1] + + def forward(self, x): + x = self.forward_features(x) + if self.head_dist is not None: + x, x_dist = self.head(x[0]), self.head_dist(x[1]) # x must be a tuple + if self.training and not torch.jit.is_scripting(): + # during training, return both classifier predictions; at inference the else branch returns their average + return x, x_dist + else: + return (x + x_dist) / 2 + else: + x = self.head(x) + return x + + +def _init_vit_weights(m, n: str = '', head_bias: float = 0., jax_impl: bool = False): + """ ViT weight initialization + * When called without n, head_bias, jax_impl args it will behave exactly the same + as my original init for compatibility with prev hparam / downstream use cases (ie DeiT). 
+ * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl + """ + if isinstance(m, nn.Linear): + if n.startswith('head'): + nn.init.zeros_(m.weight) + nn.init.constant_(m.bias, head_bias) + elif n.startswith('pre_logits'): + lecun_normal_(m.weight) + nn.init.zeros_(m.bias) + else: + if jax_impl: + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + if 'mlp' in n: + nn.init.normal_(m.bias, std=1e-6) + else: + nn.init.zeros_(m.bias) + else: + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif jax_impl and isinstance(m, nn.Conv2d): + # NOTE conv was left to pytorch default in my original init + lecun_normal_(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.zeros_(m.bias) + nn.init.ones_(m.weight) + + +def resize_pos_embed(posemb, posemb_new, num_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape) + ntok_new = posemb_new.shape[1] + if num_tokens: + posemb_tok, posemb_grid = posemb[:, :num_tokens], posemb[0, num_tokens:] + ntok_new -= num_tokens + else: + posemb_tok, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + _logger.info('Position embedding grid-size from %s to %s', [gs_old, gs_old], gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=gs_new, mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1) + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + return posemb + + +def checkpoint_filter_fn(state_dict, model): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k and len(v.shape) < 4: + # For old models that I trained prior to conv based patchification + O, I, H, W = model.patch_embed.proj.weight.shape + v = v.reshape(O, -1, H, W) + elif k == 'pos_embed' and v.shape != model.pos_embed.shape: + # To resize pos embedding when using model at different size from pretrained weights + v = resize_pos_embed(v, model.pos_embed, getattr(model, 'num_tokens', 1), + model.patch_embed.grid_size) + out_dict[k] = v + return out_dict + + +def _create_vision_transformer(variant, pretrained=False, default_cfg=None, **kwargs): + if default_cfg is None: + default_cfg = deepcopy(default_cfgs[variant]) + overlay_external_default_cfg(default_cfg, kwargs) + default_num_classes = default_cfg['num_classes'] + default_img_size = default_cfg['input_size'][-2:] + + num_classes = kwargs.pop('num_classes', default_num_classes) + img_size = kwargs.pop('img_size', default_img_size) + repr_size = kwargs.pop('representation_size', None) + if repr_size is not None and num_classes != default_num_classes: + # Remove representation layer if fine-tuning. This may not always be the desired action, + # but I feel better than doing nothing by default for fine-tuning. Perhaps a better interface? 
+ _logger.warning("Removing representation layer for fine-tuning.") + repr_size = None + + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + VisionTransformer, variant, pretrained, + default_cfg=default_cfg, + img_size=img_size, + num_classes=num_classes, + representation_size=repr_size, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + + return model + + +@register_model +def vit_small_patch16_224(pretrained=False, **kwargs): + """ My custom 'small' ViT model. embed_dim=768, depth=8, num_heads=8, mlp_ratio=3. + NOTE: + * this differs from the DeiT based 'small' definitions with embed_dim=384, depth=12, num_heads=6 + * this model does not have a bias for QKV (unlike the official ViT and DeiT models) + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=8, num_heads=8, mlp_ratio=3., + qkv_bias=False, norm_layer=nn.LayerNorm, **kwargs) + if pretrained: + # NOTE my scale was wrong for original weights, leaving this here until I have better ones for this model + model_kwargs.setdefault('qk_scale', 768 ** -0.5) + model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict( + patch_size=32, embed_dim=1024, depth=24, num_heads=16, representation_size=1024, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_huge_patch14_224_in21k(pretrained=False, **kwargs): + """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: converted weights not currently available, too large for github release hosting. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, representation_size=1280, **kwargs) + model = _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_tiny_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer('vit_deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_small_patch16_224(pretrained=False, **kwargs): + """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_deit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_base_patch16_224(pretrained=False, **kwargs): + """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_deit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_base_patch16_384(pretrained=False, **kwargs): + """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_deit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer( + 'vit_deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_small_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer( + 'vit_deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_base_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer( + 'vit_deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer( + 'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **model_kwargs) + return model -- Gitee From 883e1375ea828ef07d7e1e488ebff9d19566ba56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Tue, 20 Dec 2022 13:48:23 +0000 Subject: [PATCH 09/15] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Py?= =?UTF-8?q?Torch/contrib/cv/classification/T2T-ViT/models/.keep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/T2T-ViT/models/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/models/.keep diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep b/PyTorch/contrib/cv/classification/T2T-ViT/models/.keep deleted file mode 100644 index e69de29bb2..0000000000 -- Gitee From 0cc3dca85cfc02f45ecf01a4dec74604928e66ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 01:37:55 +0000 Subject: [PATCH 10/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/train_performance_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh index f024d6cc71..85e322f06f 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_8p.sh @@ -75,7 +75,7 @@ nohup python3.7 -m torch.distributed.launch --nproc_per_node=8 t2t-main.py \ --weight-decay .05 \ --amp \ --img-size 224 \ - --epochs 5 \ + --epochs 1 \ --output ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & -- Gitee From fc1f100426ae4570c05e96851dd003bf5cb7f85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 01:41:22 +0000 Subject: [PATCH 11/15] update t2t-main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../classification/T2T-ViT/{main.py => t2t-main.py} | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) rename PyTorch/contrib/cv/classification/T2T-ViT/{main.py => t2t-main.py} (99%) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py similarity index 99% rename from PyTorch/contrib/cv/classification/T2T-ViT/main.py rename to PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py index 1b3ceea782..d230656a48 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/main.py +++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py @@ -34,7 +34,7 @@ import torchvision.utils from torch.optim.optimizer import Optimizer from torch.nn.parallel import DistributedDataParallel as NativeDDP -from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset, create_loader from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model from timm.utils import * from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy @@ -42,8 +42,10 @@ from timm.optim import create_optimizer from timm.scheduler import create_scheduler from timm.utils import ApexScaler, NativeScaler -from data.myloader import create_loader + +#from data.myloader import create_loader from npu_fused_adamw import NpuFusedAdamW +from metrics import t2taccuracy torch.backends.cudnn.benchmark = True _logger = logging.getLogger('train') @@ -427,7 +429,7 @@ def main(): args, args_text = _parse_args() os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1' - os.environ['MASTER_PORT'] = '99999' # Any available port + os.environ['MASTER_PORT'] = '9999' # Any available port args.prefetcher = not args.no_prefetcher args.distributed = (args.workers > 1) @@ -681,6 +683,7 @@ def main(): lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + #exit() if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0 or args.workers == 1: _logger.info("Distributing BatchNorm running means and vars") @@ -875,7 +878,7 @@ 
def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='') target = target[0:target.size(0):reduce_factor] loss = loss_fn(output, target) - acc1, acc5 = accuracy(output, target, topk=(1, 5)) + acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) if args.distributed: reduced_loss = reduce_tensor(loss.data, args.world_size) -- Gitee From 2fba4c3aa6a75ff9bb1e712d6f35d80e770e05af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:36:36 +0000 Subject: [PATCH 12/15] update PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py index d230656a48..37dc538abe 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py +++ b/PyTorch/contrib/cv/classification/T2T-ViT/t2t-main.py @@ -45,7 +45,6 @@ from timm.utils import ApexScaler, NativeScaler #from data.myloader import create_loader from npu_fused_adamw import NpuFusedAdamW -from metrics import t2taccuracy torch.backends.cudnn.benchmark = True _logger = logging.getLogger('train') @@ -878,7 +877,7 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='') target = target[0:target.size(0):reduce_factor] loss = loss_fn(output, target) - acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) if args.distributed: reduced_loss = reduce_tensor(loss.data, args.world_size) -- Gitee From 1cd1d1eb6a291d8a01f61c2474219eeb2d47cac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:38:13 +0000 Subject: [PATCH 13/15] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../contrib/cv/classification/T2T-ViT/main.py | 913 ++++++++++++++++++ 1 file changed, 913 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/T2T-ViT/main.py diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/main.py b/PyTorch/contrib/cv/classification/T2T-ViT/main.py new file mode 100644 index 0000000000..9884699a79 --- /dev/null +++ b/PyTorch/contrib/cv/classification/T2T-ViT/main.py @@ -0,0 +1,913 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +T2T-ViT training and evaluating script +This script is modified from pytorch-image-models by Ross Wightman (https://github.com/rwightman/pytorch-image-models/) +It was started from an early version of the PyTorch ImageNet example +(https://github.com/pytorch/examples/tree/master/imagenet) +""" +import argparse +import time +import yaml +import os +import logging +from collections import OrderedDict +from contextlib import suppress +from datetime import datetime +import models +from typing import Optional + +import torch +import torch.nn as nn +import torchvision.utils +from torch.optim.optimizer import Optimizer +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset +from timm.models import load_checkpoint, create_model, resume_checkpoint, convert_splitbn_model +from timm.utils import * +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler +from timm.utils import ApexScaler, NativeScaler + +from data.myloader import create_loader +from npu_fused_adamw import NpuFusedAdamW +from metrics import t2taccuracy + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('train') + +# The first arg parser parses out only the --config argument, this argument is used to +# load a yaml file containing key-values that override the defaults for the main parser below +config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False) +parser.add_argument('-c', '--config', default='', type=str, metavar='FILE', + help='YAML config file specifying default arguments') + +parser = argparse.ArgumentParser(description='T2T-ViT Training and Evaluating') + +# Dataset / Model parameters +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--model', default='T2t_vit_14', type=str, metavar='MODEL', + help='Name of model to train (default: "countception"') +parser.add_argument('--pretrained', action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') +parser.add_argument('--initial-checkpoint', default='', type=str, metavar='PATH', + help='Initialize model from this checkpoint (default: none)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='Resume full model and optimizer state from checkpoint (default: none)') +parser.add_argument('--eval_checkpoint', default='', type=str, metavar='PATH', + help='path to eval checkpoint (default: none)') +parser.add_argument('--no-resume-opt', action='store_true', default=False, + help='prevent resume of optimizer state when resuming model') +parser.add_argument('--num-classes', type=int, default=1000, metavar='N', + help='number of label classes (default: 1000)') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). 
Model default if None.') +parser.add_argument('--img-size', type=int, default=224, metavar='N', + help='Image patch size (default: None => model default)') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') +parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + +# Optimizer parameters +parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') +parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.005 for adamw)') +parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + +# Learning rate schedule parameters +parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') +parser.add_argument('--lr', type=float, default=5e-4, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') +parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') +parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') +parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') +parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') +parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 0.0001)') +parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') +parser.add_argument('--epochs', type=int, default=300, metavar='N', + help='number of epochs to train (default: 2)') +parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') +parser.add_argument('--warmup-epochs', type=int, default=10, metavar='N', + help='epochs to warmup LR, if scheduler supports') +parser.add_argument('--cooldown-epochs', type=int, 
default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') +parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') +parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + +# Augmentation & regularization parameters +parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') +parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') +parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') +parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') +parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') +parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') +parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), +parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') +parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') +parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') +parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "const")') +parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') +parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') +parser.add_argument('--mixup', type=float, default=0.8, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix', type=float, default=1.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') +parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') +parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') +parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') +parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') +parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.0)') +parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') +parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + +# Batch norm parameters (only works with gen_efficientnet based models currently) +parser.add_argument('--bn-tf', action='store_true', default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') +parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') +parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') +parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') +parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') +parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + +# Model Exponential Moving Average +parser.add_argument('--model-ema', action='store_true', default=True, + help='Enable tracking moving average of model weights') +parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. 
Disables EMA validation.') +parser.add_argument('--model-ema-decay', type=float, default=0.99996, + help='decay factor for model weights moving average (default: 0.9998)') + +# Misc +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=50, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--recovery-interval', type=int, default=0, metavar='N', + help='how many batches to wait before writing recovery checkpoint') +parser.add_argument('-j', '--workers', type=int, default=8, metavar='N', + help='how many training processes to use (default: 1)') +parser.add_argument('--num-gpu', type=int, default=1, + help='Number of GPUS to use') +parser.add_argument('--save-images', action='store_true', default=False, + help='save images of input bathes every log interval for debugging') +parser.add_argument('--amp', action='store_true', default=False, + help='use NVIDIA Apex AMP or Native AMP for mixed precision training') +parser.add_argument('--apex-amp', action='store_true', default=False, + help='Use NVIDIA Apex AMP mixed precision') +parser.add_argument('--native-amp', action='store_true', default=False, + help='Use Native Torch AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--output', default='', type=str, metavar='PATH', + help='path to output folder (default: none, current dir)') +parser.add_argument('--eval-metric', default='top1', type=str, metavar='EVAL_METRIC', + help='Best metric (default: "top1"') +parser.add_argument('--tta', type=int, default=0, metavar='N', + help='Test/inference time augmentation (oversampling) factor. 0=None (default: 0)') +parser.add_argument("--local_rank", default=0, type=int) +parser.add_argument('--use-multi-epochs-loader', action='store_true', default=False, + help='use the multi-epochs-loader to save time at the beginning of every epoch') + +parser.add_argument("--addr", default="127.0.0.1", type=str) +parser.add_argument("--performance", action='store_true', default=False, + help='whether get the model performance') + +has_apex = True + +import apex +from apex import amp +from apex.parallel import DistributedDataParallel as ApexDDP +from apex.parallel import convert_syncbn_model + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. 
+ """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + +def add_weight_decay(model, weight_decay=1e-5, skip_list=()): + """Add weight decay + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + no_decay.append(param) + else: + decay.append(param) + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + +class Lookahead(Optimizer): + def __init__(self, base_optimizer, alpha=0.5, k=6): + # NOTE super().__init__() not called on purpose + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) + self._base_optimizer = base_optimizer + self.param_groups = base_optimizer.param_groups + self.defaults = base_optimizer.defaults + self.defaults.update(defaults) + self.state = defaultdict(dict) + # manually add our defaults to the param groups + for name, default in defaults.items(): + for group in self._base_optimizer.param_groups: + group.setdefault(name, default) + + @torch.no_grad() + def update_slow(self, group): + for fast_p in group["params"]: + if fast_p.grad is None: + continue + param_state = self._base_optimizer.state[fast_p] + if 'lookahead_slow_buff' not in param_state: + param_state['lookahead_slow_buff'] = torch.empty_like(fast_p) + param_state['lookahead_slow_buff'].copy_(fast_p) + slow = param_state['lookahead_slow_buff'] + slow.add_(fast_p - slow, alpha=group['lookahead_alpha']) + fast_p.copy_(slow) + + def sync_lookahead(self): + for group in self._base_optimizer.param_groups: + self.update_slow(group) + + @torch.no_grad() + def step(self, closure=None): + loss = self._base_optimizer.step(closure) + for group in self._base_optimizer.param_groups: + group['lookahead_step'] += 1 + if group['lookahead_step'] % group['lookahead_k'] == 0: + self.update_slow(group) + return loss + + def state_dict(self): + return self._base_optimizer.state_dict() + + def load_state_dict(self, state_dict): + self._base_optimizer.load_state_dict(state_dict) + self.param_groups = self._base_optimizer.param_groups + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + **kwargs): + """ Create an optimizer. + Only support npu fused AdamW and npu fused SGD + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + if weight_decay and filter_bias_and_bn: + skip = {} + if hasattr(model_or_params, 'no_weight_decay'): + skip = model_or_params.no_weight_decay() + parameters = add_weight_decay(model_or_params, weight_decay, skip) + weight_decay = 0. 
+ else: + parameters = model_or_params.parameters() + else: + # iterable of parameters or param groups passed in + parameters = model_or_params + + opt_lower = opt.lower() + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + # if 'fused' in opt_lower: + # assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(weight_decay=weight_decay, **kwargs) + if lr is not None: + opt_args.setdefault('lr', lr) + + # basic SGD & related + if opt_lower == 'sgd' or opt_lower == 'nesterov': + # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons + opt_args.pop('eps', None) + # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + # optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + optimizer = apex.optimizers.NpuFusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'adamw': + # optimizer = optim.AdamW(parameters, **opt_args) + optimizer = NpuFusedAdamW(parameters, **opt_args) + else: + print(opt_lower, flush=True) + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer + + + +def _parse_args(): + # Do we have a config file to parse? + args_config, remaining = config_parser.parse_known_args() + if args_config.config: + with open(args_config.config, 'r') as f: + cfg = yaml.safe_load(f) + parser.set_defaults(**cfg) + + # The main arg parser parses the rest of the args, the usual + # defaults will have been overridden if config file specified. + args = parser.parse_args(remaining) + + # Cache the args as a text string to save them in the output dir later + args_text = yaml.safe_dump(args.__dict__, default_flow_style=False) + return args, args_text + + +def main(): + setup_default_logging() + args, args_text = _parse_args() + + os.environ['MASTER_ADDR'] = args.addr # ip or '127.0.0.1' + os.environ['MASTER_PORT'] = '99999' # Any available port + + args.prefetcher = not args.no_prefetcher + args.distributed = (args.workers > 1) + + torch.npu.set_device(args.local_rank) + args.world_size = 1 + args.rank = args.local_rank # global rank + if args.distributed: + torch.npu.set_device(args.local_rank) + args.world_size = args.workers + torch.distributed.init_process_group(backend='hccl', rank=args.rank, world_size=args.world_size) + args.world_size = torch.distributed.get_world_size() + assert args.rank >= 0 + + if args.distributed: + _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' + % (args.rank, args.world_size)) + else: + _logger.info('Training with a single process on %d GPUs.' 
% args.num_gpu) + + torch.manual_seed(args.seed + args.rank) + + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + drop_rate=args.drop, + drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps, + checkpoint_path=args.initial_checkpoint, + img_size=args.img_size) + + if args.local_rank == 0 or args.workers == 1: + _logger.info('Model %s created, param count: %d' % + (args.model, sum([m.numel() for m in model.parameters()]))) + + data_config = resolve_data_config(vars(args), model=model, verbose=(args.local_rank == 0 or args.workers==1)) + + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + if args.split_bn: + assert num_aug_splits > 1 or args.resplit + model = convert_splitbn_model(model, max(num_aug_splits, 2)) + + use_amp = None + args.apex_amp = True + use_amp = 'apex' + + model.npu() + if args.channels_last: + model = model.to(memory_format=torch.channels_last) + + optimizer = create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=True, + ) + # optimizer = create_optimizer(args, model) + + amp_autocast = suppress # do nothing + loss_scaler = None + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0, combine_grad=True) + loss_scaler = ApexScaler() + if args.local_rank == 0: + _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') + + # optionally resume from a checkpoint + resume_epoch = None + if args.resume: + resume_epoch = resume_checkpoint( + model, args.resume, + optimizer=None if args.no_resume_opt else optimizer, + loss_scaler=None if args.no_resume_opt else loss_scaler, + log_info=args.local_rank == 0) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume=args.resume) + + if args.distributed: + if args.sync_bn: + assert not args.split_bn + try: + if has_apex and use_amp != 'native': + # Apex SyncBN preferred unless native amp is activated + model = convert_syncbn_model(model) + else: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + if args.local_rank == 0: + _logger.info( + 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' + 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') + except Exception as e: + _logger.error('Failed to enable Synchronized BatchNorm. 
Install Apex or Torch >= 1.1') + + model = NativeDDP(model, device_ids=[args.local_rank], broadcast_buffers=False) # can use device str in Torch >= 1.1 + # NOTE: EMA model does not need to be wrapped by DDP + + lr_scheduler, num_epochs = create_scheduler(args, optimizer) + if args.performance: + num_epochs = 1 + start_epoch = 0 + if args.start_epoch is not None: + # a specified start_epoch will always override the resume epoch + start_epoch = args.start_epoch + elif resume_epoch is not None: + start_epoch = resume_epoch + if lr_scheduler is not None and start_epoch > 0: + lr_scheduler.step(start_epoch) + + if args.local_rank == 0: + _logger.info('Scheduled epochs: {}'.format(num_epochs)) + + train_dir = os.path.join(args.data, 'train') + if not os.path.exists(train_dir): + _logger.error('Training folder does not exist at: {}'.format(train_dir)) + exit(1) + dataset_train = Dataset(train_dir) + + collate_fn = None + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None + if mixup_active: + mixup_args = dict( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.num_classes) + if args.prefetcher: + assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) + collate_fn = FastCollateMixup(**mixup_args) + else: + mixup_fn = Mixup(**mixup_args) + + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=args.batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + # re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + eval_dir = os.path.join(args.data, 'val') + if not os.path.isdir(eval_dir): + eval_dir = os.path.join(args.data, 'validation') + if not os.path.isdir(eval_dir): + _logger.error('Validation folder does not exist at: {}'.format(eval_dir)) + exit(1) + dataset_eval = Dataset(eval_dir) + + loader_eval = create_loader( + dataset_eval, + input_size=data_config['input_size'], + batch_size=args.validation_batch_size_multiplier * args.batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + + if args.jsd: + assert num_aug_splits > 1 # JSD only valid with aug splits set + train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).npu() + elif mixup_active: + # smoothing is handled with mixup target transform + train_loss_fn = SoftTargetCrossEntropy().npu() + elif args.smoothing: + train_loss_fn = 
LabelSmoothingCrossEntropy(smoothing=args.smoothing).npu() + else: + train_loss_fn = nn.CrossEntropyLoss().npu() + validate_loss_fn = nn.CrossEntropyLoss().npu() + + eval_metric = args.eval_metric + best_metric = None + best_epoch = None + + if args.eval_checkpoint: # evaluate the model + load_checkpoint(model, args.eval_checkpoint, args.model_ema) + val_metrics = validate(model, loader_eval, validate_loss_fn, args) + print(f"Top-1 accuracy of the model is: {val_metrics['top1']:.1f}%") + return + + saver = None + output_dir = '' + if args.local_rank == 0: + output_base = args.output if args.output else './output' + exp_name = '-'.join([ + datetime.now().strftime("%Y%m%d-%H%M%S"), + args.model, + str(data_config['input_size'][-1]) + ]) + output_dir = get_outdir(output_base, 'train', exp_name) + decreasing = True if eval_metric == 'loss' else False + saver = CheckpointSaver( + model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, + checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing) + with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: + f.write(args_text) + + try: # train the model + for epoch in range(start_epoch, num_epochs): + if args.distributed: + loader_train.sampler.set_epoch(epoch) + + + train_metrics = train_epoch( + epoch, model, loader_train, optimizer, train_loss_fn, args, + lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, + amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) + + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + if args.local_rank == 0 or args.workers == 1: + _logger.info("Distributing BatchNorm running means and vars") + distribute_bn(model, args.world_size, args.dist_bn == 'reduce') + + eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) + + if model_ema is not None and not args.model_ema_force_cpu: + if args.distributed and args.dist_bn in ('broadcast', 'reduce'): + distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') + ema_eval_metrics = validate( + model_ema.ema, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') + eval_metrics = ema_eval_metrics + + if lr_scheduler is not None: + # step LR for next epoch + lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) + + update_summary( + epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), + write_header=best_metric is None) + + if saver is not None: + # save proper checkpoint with eval metric + save_metric = eval_metrics[eval_metric] + best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) + + except KeyboardInterrupt: + pass + if best_metric is not None: + _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch)) + + +def train_epoch( + epoch, model, loader, optimizer, loss_fn, args, + lr_scheduler=None, saver=None, output_dir='', amp_autocast=suppress, + loss_scaler=None, model_ema=None, mixup_fn=None): + if args.mixup_off_epoch and epoch >= args.mixup_off_epoch: + if args.prefetcher and loader.mixup_enabled: + loader.mixup_enabled = False + elif mixup_fn is not None: + mixup_fn.mixup_enabled = False + + second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + batch_time_m = AverageMeter() + data_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.train() + + end = time.time() + last_idx = len(loader) - 1 + num_updates = epoch * 
len(loader) + epoch_fps = [] + prof_list = [] + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + data_time_m.update(time.time() - end) + if not args.prefetcher: + input, target = input.npu(), target.npu() + if mixup_fn is not None: + input, target = mixup_fn(input, target) + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + if batch_idx in prof_list: + with torch.autograd.profiler.profile(use_npu=True) as prof: + output = model(input) + loss = loss_fn(output, target) + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("output_{}.prof".format(str(batch_idx).zfill(4))) + sys.exit() + + else: + with amp_autocast(): + output = model(input) + loss = loss_fn(output, target) + + if not args.distributed: + losses_m.update(loss.item(), input.size(0)) + + optimizer.zero_grad() + if loss_scaler is not None: + loss_scaler( + loss, optimizer, clip_grad=args.clip_grad, parameters=model.parameters(), create_graph=second_order) + else: + loss.backward(create_graph=second_order) + if args.clip_grad is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad) + optimizer.step() + + torch.npu.synchronize() + if model_ema is not None: + model_ema.update(model) + num_updates += 1 + + batch_time_m.update(time.time() - end) + + if last_batch or batch_idx % args.log_interval == 0: + lrl = [param_group['lr'] for param_group in optimizer.param_groups] + lr = sum(lrl) / len(lrl) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + losses_m.update(reduced_loss.item(), input.size(0)) + + if args.local_rank == 0 or args.workers == 1: + _logger.info( + 'Train: {} [{:>4d}/{} ({:>3.0f}%)] ' + 'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) ' + 'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s ' + '({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'LR: {lr:.3e} ' + 'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format( + epoch, + batch_idx, len(loader), + 100. 
* batch_idx / last_idx, + loss=losses_m, + batch_time=batch_time_m, + rate=input.size(0) * args.world_size / batch_time_m.val, + rate_avg=input.size(0) * args.world_size / batch_time_m.avg, + lr=lr, + data_time=data_time_m)) + + if args.save_images and output_dir: + torchvision.utils.save_image( + input, + os.path.join(output_dir, 'train-batch-%d.jpg' % batch_idx), + padding=0, + normalize=True) + + if saver is not None and args.recovery_interval and ( + last_batch or (batch_idx + 1) % args.recovery_interval == 0): + saver.save_recovery(epoch, batch_idx=batch_idx) + + if lr_scheduler is not None: + lr_scheduler.step_update(num_updates=num_updates, metric=losses_m.avg) + + epoch_fps.append(input.shape[0] * args.workers / (time.time() - end)) + end = time.time() + # end for + + if hasattr(optimizer, 'sync_lookahead'): + optimizer.sync_lookahead() + + print('Epoch {}: {} fps'.format(epoch, sum(epoch_fps[5:]) / len(epoch_fps[5:]))) + return OrderedDict([('loss', losses_m.avg)]) + + +def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix=''): + batch_time_m = AverageMeter() + losses_m = AverageMeter() + top1_m = AverageMeter() + top5_m = AverageMeter() + + model.eval() + + end = time.time() + last_idx = len(loader) - 1 + with torch.no_grad(): + for batch_idx, (input, target) in enumerate(loader): + last_batch = batch_idx == last_idx + if not args.prefetcher: + input = input.npu() + target = target.npu() + if args.channels_last: + input = input.contiguous(memory_format=torch.channels_last) + + with amp_autocast(): + output = model(input) + if isinstance(output, (tuple, list)): + output = output[0] + + # augmentation reduction + reduce_factor = args.tta + if reduce_factor > 1: + output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2) + target = target[0:target.size(0):reduce_factor] + + loss = loss_fn(output, target) + acc1, acc5 = t2taccuracy(output, target, topk=(1, 5)) + + if args.distributed: + reduced_loss = reduce_tensor(loss.data, args.world_size) + acc1 = reduce_tensor(acc1, args.world_size) + acc5 = reduce_tensor(acc5, args.world_size) + else: + reduced_loss = loss.data + + torch.npu.synchronize() + + losses_m.update(reduced_loss.item(), input.size(0)) + top1_m.update(acc1.item(), output.size(0)) + top5_m.update(acc5.item(), output.size(0)) + + batch_time_m.update(time.time() - end) + end = time.time() + if (args.local_rank == 0 or args.workers == 1) and (last_batch or batch_idx % args.log_interval == 0): + log_name = 'Test' + log_suffix + _logger.info( + '{0}: [{1:>4d}/{2}] ' + 'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) ' + 'Acc@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format( + log_name, batch_idx, last_idx, batch_time=batch_time_m, + loss=losses_m, top1=top1_m, top5=top5_m)) + + metrics = OrderedDict([('loss', losses_m.avg), ('top1', top1_m.avg), ('top5', top5_m.avg)]) + + return metrics + + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From b7c1dce395f052f3fd10d787079fab6993c9c97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:38:52 +0000 Subject: [PATCH 14/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh index 09dd270d72..88152a40e9 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_full_1p.sh @@ -66,7 +66,7 @@ if [ x"${etp_flag}" != x"true" ];then #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' fi -nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \ +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \ ${data_path} \ --model t2t_vit_14 \ --batch-size 64 \ -- Gitee From 2f18386bf49f5ecb5fdab8f76ec24a0c5d38c14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=97=AD?= <1149693659@qq.com> Date: Wed, 21 Dec 2022 02:39:07 +0000 Subject: [PATCH 15/15] update PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王旭 <1149693659@qq.com> --- .../cv/classification/T2T-ViT/test/train_performance_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh index 6c1828793e..f6018663ff 100644 --- a/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/T2T-ViT/test/train_performance_1p.sh @@ -67,7 +67,7 @@ if [ x"${etp_flag}" != x"true" ];then #export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' fi -nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 t2t-main.py \ +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main.py \ ${data_path} \ --model t2t_vit_14 \ --batch-size 64 \ -- Gitee
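
Patches 11 and 12 swap `accuracy` (from timm.utils) and `t2taccuracy` (from the local metrics.py) inside validate(); both compute standard top-k classification accuracy over a batch of logits. metrics.py itself is not included in this part of the patch series, so the snippet below is only a minimal, self-contained sketch of what such a top-1/top-5 helper conventionally looks like (it mirrors timm's accuracy), not the actual file contents:

import torch

def topk_accuracy(output, target, topk=(1, 5)):
    # output: (batch, num_classes) logits; target: (batch,) ground-truth class indices.
    # Returns the top-k accuracies in percent, one value per k in `topk`.
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk highest-scoring classes per sample, transposed to (maxk, batch).
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    # correct[i, j] is True when the (i+1)-th ranked prediction for sample j hits the target.
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum(0) * 100.0 / batch_size for k in topk]

Any helper with this signature can stand in for the acc1, acc5 = ... call in validate() of main.py / t2t-main.py.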