From c307caae927948127a6cdbf7c23622457c469b51 Mon Sep 17 00:00:00 2001 From: zjy <1363845850@qq.com> Date: Mon, 15 Aug 2022 10:19:13 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E8=A5=BF=E5=8C=97=E5=B7=A5=E4=B8=9A?= =?UTF-8?q?=E5=A4=A7=E5=AD=A6=E3=80=91=E3=80=90=E9=AB=98=E6=A0=A1=E8=B4=A1?= =?UTF-8?q?=E7=8C=AE=E3=80=91=E3=80=90PyTorch=E3=80=91ShiftViT=E9=87=8D?= =?UTF-8?q?=E6=96=B0=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ShiftViT_for_PyTorch/.gitignore | 350 ++++++++++++ .../ShiftViT_for_PyTorch/CODE_OF_CONDUCT.md | 9 + .../ShiftViT_for_PyTorch/LICENSE | 21 + .../ShiftViT_for_PyTorch/NOTICE | 11 + .../ShiftViT_for_PyTorch/README.md | 130 +++++ .../ShiftViT_for_PyTorch/README_raw.md | 128 +++++ .../ShiftViT_for_PyTorch/SECURITY.md | 41 ++ .../ShiftViT_for_PyTorch/SUPPORT.md | 25 + .../ShiftViT_for_PyTorch/bind_pyt.py | 141 +++++ .../ShiftViT_for_PyTorch/datasets.py | 73 +++ .../ShiftViT_for_PyTorch/engine.py | 175 ++++++ .../ShiftViT_for_PyTorch/logger.py | 48 ++ .../ShiftViT_for_PyTorch/losses.py | 78 +++ .../ShiftViT_for_PyTorch/main.py | 496 ++++++++++++++++++ .../ShiftViT_for_PyTorch/mixup_nova.py | 48 ++ .../ShiftViT_for_PyTorch/models/__init__.py | 17 + .../ShiftViT_for_PyTorch/models/registry.py | 275 ++++++++++ .../ShiftViT_for_PyTorch/models/shiftvit.py | 365 +++++++++++++ .../ShiftViT_for_PyTorch/models/smlp.py | 143 +++++ .../models/spach/__init__.py | 20 + .../models/spach/layers/__init__.py | 19 + .../models/spach/layers/channel_func.py | 46 ++ .../models/spach/layers/spatial_func.py | 122 +++++ .../models/spach/layers/stem.py | 110 ++++ .../ShiftViT_for_PyTorch/models/spach/misc.py | 92 ++++ .../models/spach/spach.py | 201 +++++++ .../models/spach/spach_ms.py | 147 ++++++ .../ShiftViT_for_PyTorch/requirements.txt | 4 + .../ShiftViT_for_PyTorch/samplers.py | 73 +++ .../ShiftViT_for_PyTorch/test/env_npu.sh | 79 +++ .../test/train_full_1p.sh | 150 ++++++ .../test/train_full_8p.sh | 150 ++++++ .../test/train_performance_1p.sh | 152 ++++++ .../test/train_performance_8p.sh | 150 ++++++ .../ShiftViT_for_PyTorch/utils.py | 298 +++++++++++ 35 files changed, 4387 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/.gitignore create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/CODE_OF_CONDUCT.md create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/LICENSE create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/NOTICE create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README.md create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README_raw.md create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SECURITY.md create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SUPPORT.md create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/bind_pyt.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/datasets.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/engine.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/logger.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/losses.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/main.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/mixup_nova.py create mode 100644 
PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/__init__.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/registry.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/shiftvit.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/smlp.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/__init__.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/__init__.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/channel_func.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/spatial_func.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/stem.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/misc.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach_ms.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/requirements.txt create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/samplers.py create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_1p.sh create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_8p.sh create mode 100644 PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/utils.py diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/.gitignore b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/.gitignore new file mode 100644 index 0000000000..dfcfd56f44 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/.gitignore @@ -0,0 +1,350 @@ +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. 
+## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
+*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/CODE_OF_CONDUCT.md b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..f9ba8cf65f --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/LICENSE b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/LICENSE new file mode 100644 index 0000000000..9e841e7a26 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/NOTICE b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/NOTICE
new file mode 100644
index 0000000000..2cd7bd7638
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/NOTICE
@@ -0,0 +1,11 @@
+ NOTICES
+
+ This repository incorporates material as listed below or described in the code.
+
+ Component:
+ main.py, losses.py, datasets.py, engine.py, utils.py, logger.py, samplers.py
+
+ Open Source License/Copyright Notice:
+ MIT License
+ Copyright (c) 2015-present, Facebook, Inc.
+ All rights reserved.
diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README.md b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README.md
new file mode 100644
index 0000000000..4344a86a96
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README.md
@@ -0,0 +1,130 @@
+# ShiftViT
+
+## Model Overview
+
+ShiftViT is an image classification model developed by Microsoft.
+
+Source repository: [microsoft/SPACH](https://github.com/microsoft/SPACH).
+
+## Installing Dependencies
++ install numactl:
+
+```
+apt-get install numactl # for Ubuntu
+yum install numactl # for CentOS
+```
+
++ install requirements
+```
+pip3 install torchvision==0.6.0
+pip3 install einops==0.4.1
+pip3 install --no-deps timm==0.4.5
+
+# other recommended requirements
+apex==0.1+ascend.20220315
+torch==1.5.0+ascend.post5.20220315
+```
+
+- source the environment:
+
+```
+source test/env_npu.sh
+```
+
+- Download the ImageNet dataset from http://www.image-net.org/
+    - Then move the validation images into labeled subfolders, using [the following shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
+
+## Training
+
+The training scripts are located in the `test` folder.
+
+Run training as follows:
+
+```bash
+# 1p accuracy training
+bash ./test/train_full_1p.sh --data_path=real_data_path
+
+# 1p performance training
+bash ./test/train_performance_1p.sh --data_path=real_data_path
+
+# 8p accuracy training
+bash ./test/train_full_8p.sh --data_path=real_data_path
+
+# 8p performance training
+bash ./test/train_performance_8p.sh --data_path=real_data_path
+
+```
+
+Log paths:
+
+    test/output/devie_id/train_${device_id}.log                           # training detail log
+
+    test/output/devie_id/shiftvit_light_tiny_bs128_8p_perf.log   # 8p training performance result log
+
+    test/output/devie_id/shiftvit_light_tiny_bs128_8p_acc.log    # 8p training accuracy result log
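+
+For a quick functional check, `main.py` can also be launched directly on a single device instead of
+through the wrapper scripts. The sketch below is for reference only: the flags come from `main.py`,
+and the model name `shiftvit_light_tiny` is assumed from the log-file naming above, so confirm the
+exact registered names in `models/registry.py` before running.
+
+```bash
+source test/env_npu.sh
+python3 main.py --npu --model shiftvit_light_tiny --data-path /path/to/imagenet \
+    --batch-size 128 --epochs 1 --output_dir ./output_quick_check
+```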
+
+## ShiftViT Training Results
+
+1. GPU training
+
+   GPU training was run on the provided server.
+
+   The accuracy target is taken from the numbers reported in the source repository.
+
+   The performance baseline is taken from the results measured on that server.
+
+   Run logs (4 epochs): obs://zjy-lenet/shiftvit/gpu/
+
+   The logs are large, so it is recommended to use `awk '/Total/' stdout.txt` to check the performance records,
+
+   `awk '/Accuracy/' stdout.txt` to check the accuracy records,
+
+   and `awk '/FPS/' stdout.txt` to check the FPS.
+
+2. NPU training
+
+   NPU training was run on the provided server.
+
+   Run logs (300 epochs): obs://zjy-lenet/shiftvit/npu/
+
+   The logs are large, so it is recommended to use `awk '/Total/' train_0.log` to check the performance records,
+
+   `awk '/Accuracy/' train_0.log` to check the accuracy records,
+
+   and `awk '/FPS/' train_0.log` to check the FPS.
+
+3. Results
+
+   batch_size=128
+
+   | Acc@1 | FPS | Name | Epochs | AMP_Type |
+   | :----: | :----: | :------: | :------: | :--: |
+   | 79.4% | 1478.4869* | GPU-8p | 300 | O2 |
+   |       | 1732.51 | NPU-1p | 1 | O2 |
+   | 78.8% | 1732.8838 | NPU-8p | 300 | O2 |
+
+   Note (*): the collection windows differ slightly; the NPU numbers drop the first 3 steps while the GPU numbers drop the first 5, so the starred GPU figure may be slightly low.
+
+# Self-Verification Report
+
+    ```shell
+    # 1p train perf
+    # check that the performance log file is produced correctly
+    bash test/train_performance_1p.sh --data_path=xxx
+    # acceptance result: OK / Failed
+    # remarks: target performance 301 FPS; measured performance 163.308 FPS;
+
+    # 8p train perf
+    # check that the performance log file is produced correctly
+    bash test/train_performance_8p.sh --data_path=xxx
+    # acceptance result: OK
+    # remarks: target performance FPS; measured performance FPS;
+
+    # 8p train full
+    # check that the performance/accuracy log files are produced and that the model checkpoint is saved correctly
+    bash test/train_full_8p.sh --data_path=xxx
+    # acceptance result: OK
+    # remarks: target accuracy 79.4; measured accuracy 79.36;
+    ```
+**Special notes for your reviewers**:
+
+If environment-variable problems come up during acceptance testing, change the `source` command that runs before the python command in the test scripts so that it sources `set_env.sh` from the Ascend toolkit instead.
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README_raw.md b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README_raw.md
new file mode 100644
index 0000000000..9fc37986f5
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/README_raw.md
@@ -0,0 +1,128 @@
+This repository contains PyTorch evaluation code, training code and pretrained models for the following projects:
+
++ SPACH ([A Battle of Network Structures: An Empirical Study of CNN, Transformer, and MLP](https://arxiv.org/abs/2108.13002))
++ sMLP ([Sparse MLP for Image Recognition: Is Self-Attention Really Necessary?](https://arxiv.org/abs/2109.05422))
++ ShiftViT ([When Shift Operation Meets Vision Transformer: An Extremely Simple Alternative to Attention Mechanism](https://arxiv.org/abs/2201.10801))
+
+Other unofficial implementations:
+
++ ShiftViT
+  + [Keras](https://keras.io/examples/vision/shiftvit/) by [Aritra Roy Gosthipaty](https://twitter.com/ariG23498) and [Ritwik Raha](https://twitter.com/ritwik_raha)
+
+# Main Results on ImageNet with Pretrained Models
+
+
+| name | acc@1 | #params | FLOPs | url |
+| ------------------ | ----- | ------- | ----- | ------------------------------------------------------------ |
+| SPACH-Conv-MS-S | 81.6 | 44M | 7.2G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/spach_ms_conv_s.pth) |
+| SPACH-Trans-MS-S | 82.9 | 40M | 7.6G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/spach_ms_trans_s.pth) |
+| SPACH-MLP-MS-S | 82.1 | 46M | 8.2G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/spach_ms_mlp_s.pth) |
+| SPACH-Hybrid-MS-S | 83.7 | 63M | 11.2G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/spach_ms_hybrid_s.pth) |
+| SPACH-Hybrid-MS-S+ | 83.9 | 63M | 12.3G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/spach_ms_hybrid_s+.pth) |
+| sMLPNet-T | 81.9 | 24M | 5.0G | |
+| sMLPNet-S | 83.1 | 49M | 10.3G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/smlp_s.pth) |
+| sMLPNet-B | 83.4 | 66M | 14.0G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/smlp_b.pth) |
+| Shift-T / light | 79.4 | 20M | 3.0G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/shiftvit_tiny_light.pth) |
+| Shift-T | 81.7 | 29M | 4.5G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/shiftvit_tiny_r2.pth) |
+| Shift-S / light | 81.6 | 34M | 5.7G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/shiftvit_small_light.pth) |
+| Shift-S | 82.8 | 50M | 8.8G | [github](https://github.com/microsoft/SPACH/releases/download/v1.0/shiftvit_small_r2.pth) |
+
+# Usage
+
+## Install
+First, clone the repo and install requirements:
+
+```bash
+git clone https://github.com/microsoft/Spach
+pip install -r requirements.txt
+```
+
+## Data preparation
+
+Download and extract ImageNet train and val images from http://image-net.org/.
+The directory structure is the standard layout for the torchvision [`datasets.ImageFolder`](https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder),
+and the training and validation data is expected to be in the `train/` folder and `val/` folder respectively:
+
+```
+/path/to/imagenet/
+  train/
+    class1/
+      img1.jpeg
+    class2/
+      img2.jpeg
+  val/
+    class1/
+      img3.jpeg
+    class2/
+      img4.jpeg
+```
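+
+As an optional sanity check (assuming the standard ImageNet-1k split laid out as above), each split
+should contain exactly 1000 class folders:
+
+```bash
+ls /path/to/imagenet/train | wc -l   # expect 1000
+ls /path/to/imagenet/val | wc -l     # expect 1000
+```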
+
+## Evaluation
+
+To evaluate a pre-trained model on ImageNet val with a single GPU run:
+
+```bash
+python main.py --eval --resume <checkpoint-path> --model <model-name> --data-path <imagenet-path>
+```
+
+For example, to evaluate the SPACH-Hybrid-MS-S model, run
+
+```bash
+python main.py --eval --resume spach_ms_hybrid_s.pth --model spach_ms_s_patch4_224_hybrid --data-path <imagenet-path>
+```
+
+giving
+```bash
+* Acc@1 83.658 Acc@5 96.762 loss 0.688
+```
+
+You can find all supported models in `models/registry.py`.
+
+## Training
+
+One can simply call the following script to run the training process. Distributed training is recommended even on a single GPU node.
+
+```bash
+python -m torch.distributed.launch --nproc_per_node <num-gpus> --use_env main.py
+--model <model-name>
+--data-path <imagenet-path>
+--output_dir <output-path>
+--dist-eval
+```
+
+# Citation
+
+```
+@article{zhao2021battle,
+  title={A Battle of Network Structures: An Empirical Study of CNN, Transformer, and MLP},
+  author={Zhao, Yucheng and Wang, Guangting and Tang, Chuanxin and Luo, Chong and Zeng, Wenjun and Zha, Zheng-Jun},
+  journal={arXiv preprint arXiv:2108.13002},
+  year={2021}
+}
+
+@article{tang2021sparse,
+  title={Sparse MLP for Image Recognition: Is Self-Attention Really Necessary?},
+  author={Tang, Chuanxin and Zhao, Yucheng and Wang, Guangting and Luo, Chong and Xie, Wenxuan and Zeng, Wenjun},
+  journal={arXiv preprint arXiv:2109.05422},
+  year={2021}
+}
+
+```
+
+# Contributing
+
+This project welcomes contributions and suggestions. Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
+contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+
+# Acknowledgement
+
+Our code is built on top of [DeiT](https://github.com/facebookresearch/deit).
We test throughput following [Swin Transformer](https://github.com/microsoft/Swin-Transformer) diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SECURITY.md b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SECURITY.md new file mode 100644 index 0000000000..f7b89984f0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
+ + \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SUPPORT.md b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SUPPORT.md new file mode 100644 index 0000000000..8b05616fc9 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/SUPPORT.md @@ -0,0 +1,25 @@ +# TODO: The maintainer of this repo has not yet edited this file + +**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? + +- **No CSS support:** Fill out this template with information about how to file issues and get help. +- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). +- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. + +*Then remove this first heading from this SUPPORT.MD file before publishing your repo.* + +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE +FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER +CHANNEL. WHERE WILL YOU HELP PEOPLE?**. + +## Microsoft Support Policy + +Support for this **PROJECT or PRODUCT** is limited to the resources listed above. diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/bind_pyt.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/bind_pyt.py new file mode 100644 index 0000000000..55daf6571d --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/bind_pyt.py @@ -0,0 +1,141 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import sys +import subprocess +import os +import socket +from argparse import ArgumentParser, REMAINDER + +import torch + + +def parse_args(): + """ + Helper function parsing the command line options + @retval ArgumentParser + """ + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utilty that will spawn up " + "multiple distributed processes") + + # Optional arguments for the launch helper + parser.add_argument("--nnodes", type=int, default=1, + help="The number of nodes to use for distributed " + "training") + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--nproc_per_node", type=int, default=8, + help="The number of processes to launch on each node, " + "for GPU training, this is recommended to be set " + "to the number of GPUs in your system so that " + "each process can be bound to a single GPU.") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=29688, type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communciation during distributed " + "training") + parser.add_argument('--no_hyperthreads', action='store_true', + help='Flag to disable binding to hyperthreads') + parser.add_argument('--no_membind', action='store_true', + help='Flag to disable memory binding') + + # non-optional arguments for binding + parser.add_argument("--nsockets_per_node", type=int, required=True, + help="Number of CPU sockets on a node") + parser.add_argument("--ncores_per_socket", type=int, required=True, + help="Number of CPU cores per socket") + + # positional + parser.add_argument("training_script", type=str, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + # rest from the training program + parser.add_argument('training_script_args', nargs=REMAINDER) + parser.add_argument("--data_path", type=str, default='') + return parser.parse_args() + + +def main(): + args = parse_args() + + # variables for numactrl binding + + NSOCKETS = args.nsockets_per_node + NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + ( + 1 if (args.nproc_per_node % args.nsockets_per_node) else 0) + NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["WORLD_SIZE"] = str(dist_world_size) + current_env['NODE_RANK'] = str(args.node_rank) + + processes = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env['LOCAL_RANK'] = str(local_rank) + + # form numactrl binding command + cpu_ranges = [local_rank * NCORES_PER_GPU, + (local_rank + 1) * NCORES_PER_GPU - 1, + local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS), + (local_rank + 
1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1] + + numactlargs = [] + if args.no_hyperthreads: + numactlargs += ["--physcpubind={}-{}".format(*cpu_ranges[0:2])] + else: + numactlargs += ["--physcpubind={}-{},{}-{}".format(*cpu_ranges)] + + if not args.no_membind: + memnode = local_rank // NGPUS_PER_SOCKET + numactlargs += ["--membind={}".format(memnode)] + + # spawn the processes + cmd = ["/usr/bin/numactl"] \ + + numactlargs \ + + [sys.executable, + "-u", + args.training_script, + "--local_rank={}".format(local_rank) + ] \ + + args.training_script_args + + process = subprocess.Popen(cmd, env=current_env) + processes.append(process) + + for process in processes: + process.wait() + + +if __name__ == "__main__": + main() + diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/datasets.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/datasets.py new file mode 100644 index 0000000000..6e1033291f --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/datasets.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +import os +import json + +from torchvision import datasets, transforms +from torchvision.datasets.folder import ImageFolder, default_loader + +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data import create_transform + +from torch.utils.data import Dataset + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + if args.data_set == 'IMNET': + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, transform=transform) + nb_classes = 1000 + else: + raise NotImplementedError("Support ImageNet only.") + + return dataset, nb_classes + + +def build_transform(is_train, args): + resize_im = args.input_size > 32 + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation=args.train_interpolation, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + ) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop( + args.input_size, padding=4) + return transform + + t = [] + if resize_im: + size = int((256 / 224) * args.input_size) + t.append( + transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 
224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) + return transforms.Compose(t) diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/engine.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/engine.py new file mode 100644 index 0000000000..4ff8eb7ed4 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/engine.py @@ -0,0 +1,175 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +""" +Train and eval functions used in main.py +""" +import math +from operator import mod +import sys +from typing import Iterable, Optional +import time +import logging +import os +import torch + +from timm.data import Mixup +from timm.utils import accuracy, ModelEma + +from losses import DistillationLoss +import utils +import apex.amp + +def train_one_epoch(model: torch.nn.Module, criterion: DistillationLoss, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, + output_dir: str, batch_size: int, + max_norm: float = 0, + model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, + set_training_mode=True, logger=logging, use_npu=False): + model.train(set_training_mode) + metric_logger = utils.MetricLogger(delimiter=" ", logger=logger) + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 10 + i = 0 + + for samples, targets in metric_logger.log_every(data_loader, print_freq, batch_size, header, use_npu=use_npu): + if i == 10: + with torch.autograd.profiler.profile(use_cuda=True) as prof: + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + outputs = model(samples) + loss = criterion(samples, outputs, targets) + loss_value = loss.item() + + if not math.isfinite(loss_value): + logger.info("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + optimizer.zero_grad() + + # this attribute is added by timm on one optimizer (adahessian) + is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward(create_graph=is_second_order) + + optimizer.step() + + metric_logger.update(loss=loss_value) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + prof.export_chrome_trace("/home/zhangjiangyuan/SPACH-main/output_8p_perf.prof") + else: + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) 
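+            # Non-profiled path: forward pass, distillation-aware criterion, apex AMP backward
+            # (opt_level O2 with static loss scale 128.0, as initialised in main.py), then optimizer.step().
+            # It mirrors the profiled branch above (i == 10), which additionally dumps a Chrome trace
+            # to a hard-coded output path that should be adjusted before running on another machine.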
+ + outputs = model(samples) + loss = criterion(samples, outputs, targets) + loss_value = loss.item() + + if not math.isfinite(loss_value): + logger.info("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + optimizer.zero_grad() + + # this attribute is added by timm on one optimizer (adahessian) + is_second_order = hasattr(optimizer, 'is_second_order') and optimizer.is_second_order + + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward(create_graph=is_second_order) + + optimizer.step() + + metric_logger.update(loss=loss_value) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + i += 1 + # gather the stats from all processes + metric_logger.synchronize_between_processes(use_npu=use_npu, device=device) + logger.info(f"Averaged stats: {metric_logger}") + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(data_loader, model, device, batch_size, logger=logging, use_npu=False): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + + for images, target in metric_logger.log_every(data_loader, 10, batch_size, header, use_npu=use_npu): + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + output = model(images) + loss = criterion(output, target) + + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes(use_npu=use_npu, device=device) + logger.info('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def throughput(data_loader, model, logger=logging, use_npu=False): + model.eval() + + if use_npu: + for idx, (images, _) in enumerate(data_loader): + images = images.npu(non_blocking=True) + batch_size = images.shape[0] + for i in range(50): + model(images) + logger.info(f"throughput averaged with 30 times") + tic1 = time.time() + for i in range(30): + model(images) + tic2 = time.time() + logger.info(f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}") + return + else: + for idx, (images, _) in enumerate(data_loader): + images = images.cuda(non_blocking=True) + batch_size = images.shape[0] + for i in range(50): + model(images) + logger.info(f"throughput averaged with 30 times") + tic1 = time.time() + for i in range(30): + model(images) + tic2 = time.time() + logger.info(f"batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}") + return diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/logger.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/logger.py new file mode 100644 index 0000000000..52f4ac9d5f --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/logger.py @@ -0,0 +1,48 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +import os +import sys +import logging + + +# @functools.lru_cache() +def create_logger(output_dir, dist_rank=0, name=''): + # create logger + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.propagate = False + + # create formatter + fmt = '[%(asctime)s %(name)s] (%(filename)s %(lineno)d): %(levelname)s %(message)s' + + # create console handlers for master process + if dist_rank == 0: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.DEBUG) + console_handler.setFormatter( + logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(console_handler) + + # create file handlers + if len(output_dir) > 0: + file_handler = logging.FileHandler(os.path.join(output_dir, f'log_rank{dist_rank}.txt'), mode='a') + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S')) + logger.addHandler(file_handler) + file_handler.flush() + + return logger diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/losses.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/losses.py new file mode 100644 index 0000000000..2b3fe0ee63 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/losses.py @@ -0,0 +1,78 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +""" +Implements the knowledge distillation loss +""" +import torch +from torch.nn import functional as F + + +class DistillationLoss(torch.nn.Module): + """ + This module wraps a standard criterion and adds an extra knowledge distillation loss by + taking a teacher model prediction and using it as additional supervision. + """ + def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, + distillation_type: str, alpha: float, tau: float): + super().__init__() + self.base_criterion = base_criterion + self.teacher_model = teacher_model + assert distillation_type in ['none', 'soft', 'hard'] + self.distillation_type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, labels): + """ + Args: + inputs: The original inputs that are feed to the teacher model + outputs: the outputs of the model to be trained. 
It is expected to be + either a Tensor, or a Tuple[Tensor, Tensor], with the original output + in the first position and the distillation predictions as the second output + labels: the labels for the base criterion + """ + outputs_kd = None + if not isinstance(outputs, torch.Tensor): + # assume that the model outputs a tuple of [outputs, outputs_kd] + outputs, outputs_kd = outputs + base_loss = self.base_criterion(outputs, labels) + if self.distillation_type == 'none': + return base_loss + + if outputs_kd is None: + raise ValueError("When knowledge distillation is enabled, the model is " + "expected to return a Tuple[Tensor, Tensor] with the output of the " + "class_token and the dist_token") + # don't backprop throught the teacher + with torch.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.distillation_type == 'soft': + T = self.tau + # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # with slight modifications + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / T, dim=1), + F.log_softmax(teacher_outputs / T, dim=1), + reduction='sum', + log_target=True + ) * (T * T) / outputs_kd.numel() + elif self.distillation_type == 'hard': + distillation_loss = F.cross_entropy(outputs_kd, teacher_outputs.argmax(dim=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/main.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/main.py new file mode 100644 index 0000000000..f69304df1c --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/main.py @@ -0,0 +1,496 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. 
+import argparse +import datetime +import numpy as np +import time +import torch +import torch.backends.cudnn as cudnn +import json +import os +import apex +from pathlib import Path + +from mixup_nova import Mixup_nova as Mixup +# from timm.data import Mixup +from timm.models import create_model +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.scheduler import create_scheduler +from timm.optim import create_optimizer +from timm.utils import NativeScaler, get_state_dict, ModelEma + +from datasets import build_dataset +from engine import train_one_epoch, evaluate, throughput +from losses import DistillationLoss +from samplers import RASampler +import models +import utils +from logger import create_logger + + +def get_args_parser(): + parser = argparse.ArgumentParser('Training and evaluation script', add_help=False) + parser.add_argument('--batch-size', default=128, type=int) + parser.add_argument('--epochs', default=300, type=int) + + # Model parameters + parser.add_argument('--model', default='smlpnet_tiny', type=str, metavar='MODEL', + help='Name of model to train') + parser.add_argument('--input-size', default=224, type=int, help='images input size') + + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: 0.1)') + + parser.add_argument('--model-ema', action='store_true') + parser.add_argument('--no-model-ema', action='store_false', dest='model_ema') + parser.set_defaults(model_ema=True) + parser.add_argument('--model-ema-decay', type=float, default=0.99996, help='') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, help='') + + # Optimizer parameters + parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') + parser.add_argument('--opt-eps', default=1e-8, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: 1e-8)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='SGD momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + # Learning rate schedule parameters + parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') + parser.add_argument('--lr', type=float, default=5e-4, metavar='LR', + help='learning rate (default: 5e-4)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 1e-6)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + + 
parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=20, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation parameters + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". " + \ + "(default: rand-m9-mstd0.5-inc1)'), + parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='bicubic', + help='Training interpolation (random, bilinear, bicubic default: "bicubic")') + + parser.add_argument('--repeated-aug', action='store_true') + parser.add_argument('--no-repeated-aug', action='store_false', dest='repeated_aug') + parser.set_defaults(repeated_aug=True) + + # * Random Erase params + parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') + parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + + # * Mixup params + parser.add_argument('--mixup', type=float, default=0.8, + help='mixup alpha, mixup enabled if > 0. (default: 0.8)') + parser.add_argument('--cutmix', type=float, default=1.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + + # Distillation parameters + parser.add_argument('--teacher-model', default='regnety_160', type=str, metavar='MODEL', + help='Name of teacher model to train (default: "regnety_160"') + parser.add_argument('--teacher-path', type=str, default='') + parser.add_argument('--distillation-type', default='none', choices=['none', 'soft', 'hard'], type=str, help="") + parser.add_argument('--distillation-alpha', default=0.5, type=float, help="") + parser.add_argument('--distillation-tau', default=1.0, type=float, help="") + + # * Finetuning params + parser.add_argument('--finetune', default='', help='finetune from checkpoint') + + # Dataset parameters + parser.add_argument('--data-path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--data-set', default='IMNET', choices=['CIFAR', 'IMNET', 'INAT', 'INAT19'], + type=str, help='Image Net dataset path') + parser.add_argument('--inat-category', default='name', + choices=['kingdom', 'phylum', 'class', 'order', 'supercategory', 'family', 'genus', 'name'], + type=str, help='semantic granularity') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='npu', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', help='Perform evaluation only') + parser.add_argument('--dist-eval', action='store_true', default=False, help='Enabling distributed evaluation') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin-mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no-pin-mem', action='store_false', dest='pin_mem', + help='') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + parser.add_argument("--local_rank", type=int, default=0) + # parameters for training on preemptible clusters + parser.add_argument('--auto-resume', action='store_true') + parser.add_argument('--no-auto-resume', action='store_false', dest='auto_resume') + parser.set_defaults(auto_resume=True) + + # spach parameters + parser.add_argument('--stem-type', default='conv1', type=str, choices=['conv1', 'conv4']) + parser.add_argument('--shared-spatial-func', action='store_true') + # npu parameters + parser.add_argument('--npu', action='store_true', default=False, help='Enabling npu training') + # parameters for benchmark + parser.add_argument('--throughput', action='store_true') + + return parser + + +def parse_model_args(args): + model = args.model + model_args = [] + if model.startswith('spach'): + model_args = ['stem_type', 'shared_spatial_func'] + args = vars(args) + model_args = {_: args[_] for _ in model_args} + return model_args + + +def main(args): + + utils.init_distributed_mode(args) + logger = create_logger(args.output_dir, utils.get_rank(), args.model) + logger.info(args) + + if args.distillation_type != 'none' and args.finetune and not args.eval: + raise NotImplementedError("Finetuning with 
distillation not yet supported") + + if args.npu: + device = f'npu:{str(utils.get_rank())}' + else: + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + # random.seed(seed) + cudnn.benchmark = True + + dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) + dataset_val, _ = build_dataset(is_train=False, args=args) + + if args.distributed: # args.distributed: + num_tasks = utils.get_world_size() + global_rank = utils.get_rank() + if args.repeated_aug: + sampler_train = RASampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + else: + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + logger.info('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=int(1.5 * args.batch_size), + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_fn = Mixup( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.nb_classes) + + logger.info(f"Creating model: {args.model}") + model = create_model( + args.model, + pretrained=False, + num_classes=args.nb_classes, + drop_rate=args.drop, + drop_path_rate=args.drop_path, + drop_block_rate=None, + **parse_model_args(args) + ) + + if args.finetune: + if args.finetune.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.finetune, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.finetune, map_location='cpu') + + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + logger.info(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed + + model.load_state_dict(checkpoint_model, strict=False) + + print(device, flush=True) + model.to(device) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume='') + + model_without_ddp = model + linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0 + args.lr = linear_scaled_lr + optimizer = apex.optimizers.NpuFusedAdamW(model.parameters(), args.lr, + weight_decay=args.weight_decay) + lr_scheduler, _ = create_scheduler(args, optimizer) + model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0, combine_grad=True) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + model_without_ddp = model.module + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f'number of params: {n_parameters}') + if hasattr(model_without_ddp, 'flops'): + try: + flops = model_without_ddp.flops() + logger.info(f"number of GFLOPs: {flops / 1e9}") + except Exception as e: + logger.exception(e) + + criterion = LabelSmoothingCrossEntropy() + + if args.mixup > 0.: + # smoothing 
is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif args.smoothing: + criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + criterion = torch.nn.CrossEntropyLoss() + + teacher_model = None + if args.distillation_type != 'none': + assert args.teacher_path, 'need to specify teacher-path when using distillation' + logger.info(f"Creating teacher model: {args.teacher_model}") + teacher_model = create_model( + args.teacher_model, + pretrained=False, + num_classes=args.nb_classes, + global_pool='avg', + ) + if args.teacher_path.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.teacher_path, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.teacher_path, map_location='cpu') + teacher_model.load_state_dict(checkpoint['model']) + teacher_model.to(device) + teacher_model.eval() + + # wrap the criterion in our custom DistillationLoss, which + # just dispatches to the original criterion if args.distillation_type is 'none' + criterion = DistillationLoss( + criterion, teacher_model, args.distillation_type, args.distillation_alpha, args.distillation_tau + ) + + output_dir = Path(args.output_dir) + if args.auto_resume: + _resume = str((output_dir / 'checkpoint.pth').absolute()) + if os.path.exists(_resume): + logger.info(f'auto resume from {output_dir}/checkpoint.pth') + args.resume = _resume + + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + if args.model_ema: + utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema']) + + if args.eval: + test_stats = evaluate(data_loader_val, model, device, args.batch_size, logger=logger, use_npu=args.npu) + logger.info(f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + return + + if args.throughput: + throughput(data_loader_val, model, logger=logger, use_npu=args.npu) + return + + criterion = criterion.to(device) + logger.info(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, args.output_dir, args.batch_size, + args.clip_grad, model_ema, mixup_fn, + set_training_mode=args.finetune == '', # keep in eval mode during finetuning + logger=logger, + use_npu=args.npu + ) + + lr_scheduler.step(epoch) + if args.output_dir and epoch % 5==0: + checkpoint_paths = [output_dir / f'checkpoint_{str(epoch)}.pth'] + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'model_ema': get_state_dict(model_ema), + 'amp': apex.amp.state_dict(), + 'args': args, + }, checkpoint_path) + + test_stats = evaluate(data_loader_val, model, device, args.batch_size, logger=logger, use_npu=args.npu) + logger.info(f"Accuracy of the network 
on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + + if test_stats["acc1"] > max_accuracy: + best_checkpoint_path = output_dir / 'best.pth' + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'model_ema': get_state_dict(model_ema), + 'amp': apex.amp.state_dict(), + 'args': args, + }, best_checkpoint_path) + max_accuracy = max(max_accuracy, test_stats["acc1"]) + logger.info(f'Max accuracy: {max_accuracy:.2f}%') + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DeiT training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/mixup_nova.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/mixup_nova.py new file mode 100644 index 0000000000..a86506ae5c --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/mixup_nova.py @@ -0,0 +1,48 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import timm.data.mixup as mixup +import torch + +def one_hot(x, num_classes, on_value=1., off_value=0.): + x = x.long().view(-1, 1) + device = x.device + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0): + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + device = target.device + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value) + return y1 * lam + y2 * (1. 
- lam) + +class Mixup_nova(mixup.Mixup): + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000): + super().__init__(mixup_alpha=mixup_alpha, cutmix_alpha=cutmix_alpha, cutmix_minmax=cutmix_minmax, prob=prob, switch_prob=switch_prob, + mode=mode, correct_lam=correct_lam, label_smoothing=label_smoothing, num_classes=num_classes) + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing) + return x, target + diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/__init__.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/__init__.py new file mode 100644 index 0000000000..a1c231acf2 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from .registry import * diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/registry.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/registry.py new file mode 100644 index 0000000000..f094e30d3a --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/registry.py @@ -0,0 +1,275 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
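+# NOTE: every constructor below is registered through timm's @register_model decorator,
+# so main.py can build a network purely from its string name via timm's create_model
+# factory, e.g. (illustrative call only):
+#     model = create_model('shiftvit_light_tiny', pretrained=False)
+# The `pretrained` argument is accepted for interface compatibility, but none of the
+# factories in this file load pretrained weights.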
+from timm.models.registry import register_model +from .smlp import sMLPNet +from .spach import Spach, SpachMS +from .shiftvit import ShiftViT + + +# sMLP +@register_model +def smlpnet_tiny(pretrained=False, **kwargs): + model = sMLPNet(dim=80, alpha=3, patch_size=4, depths=[2,8,14,2], dp_rate=0.0, **kwargs) + return model + + +@register_model +def smlpnet_small(pretrained=False, **kwargs): + model = sMLPNet(dim=96, alpha=3, patch_size=4, depths=[2,10,24,2], dp_rate=0.2, **kwargs) + return model + + +@register_model +def smlpnet_base(pretrained=False, **kwargs): + model = sMLPNet(dim=112, alpha=3, patch_size=4, depths=[2,10,24,2], dp_rate=0.3, **kwargs) + return model + + +# SPACH +@register_model +def spach_xxs_patch16_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=384, token_ratio=0.5, num_heads=12, channel_ratio=2.0) + cfgs['net_arch'] = [('mlp', 12)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_xxs_patch16_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=384, token_ratio=0.5, num_heads=12, channel_ratio=2.0) + cfgs['net_arch'] = [('pass', 12)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_xxs_patch16_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=192, token_ratio=0.5, num_heads=6, channel_ratio=2.0) + cfgs['net_arch'] = [('attn', 12)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_xs_patch16_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=384, token_ratio=0.5, num_heads=12, channel_ratio=2.0) + cfgs['net_arch'] = [('mlp', 24)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_xs_patch16_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=384, token_ratio=0.5, num_heads=12, channel_ratio=2.0) + cfgs['net_arch'] = [('pass', 24)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_xs_patch16_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=384, token_ratio=0.5, num_heads=12, channel_ratio=2.0) + cfgs['net_arch'] = [('attn', 12)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_s_patch16_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=512, token_ratio=0.5, num_heads=16, channel_ratio=3.0) + cfgs['net_arch'] = [('mlp', 24)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_s_patch16_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=512, token_ratio=0.5, num_heads=16, channel_ratio=3.0) + cfgs['net_arch'] = [('pass', 24)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_s_patch16_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=16, hidden_dim=512, token_ratio=0.5, num_heads=16, channel_ratio=3.0) + cfgs['net_arch'] = [('attn', 12)] + cfgs.update(kwargs) + model = Spach(**cfgs) + return model + + +@register_model +def spach_ms_xxs_patch4_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=64, token_ratio=0.5, num_heads=2, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 2)], [('pass', 2)], [('pass', 6)], [('pass', 2)]] + 
cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xxs_patch4_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=64, token_ratio=0.5, num_heads=2, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 2)], [('mlp', 2)], [('mlp', 6)], [('mlp', 2)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xxs_patch4_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=32, token_ratio=0.5, num_heads=1, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 2)], [('attn', 2)], [('attn', 6)], [('attn', 2)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xs_patch4_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=96, token_ratio=0.5, num_heads=3, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 3)], [('pass', 4)], [('pass', 12)], [('pass', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xs_patch4_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=96, token_ratio=0.5, num_heads=3, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 3)], [('mlp', 4)], [('mlp', 12)], [('mlp', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xs_patch4_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=64, token_ratio=0.5, num_heads=2, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 3)], [('attn', 4)], [('attn', 12)], [('attn', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_s_patch4_224_conv(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=128, token_ratio=0.5, num_heads=4, channel_ratio=3.0) + cfgs['net_arch'] = [[('pass', 3)], [('pass', 4)], [('pass', 12)], [('pass', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_s_patch4_224_mlp(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=128, token_ratio=0.5, num_heads=4, channel_ratio=3.0) + cfgs['net_arch'] = [[('pass', 3)], [('mlp', 4)], [('mlp', 12)], [('mlp', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_s_patch4_224_attn(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=96, token_ratio=0.5, num_heads=3, channel_ratio=3.0) + cfgs['net_arch'] = [[('pass', 3)], [('attn', 4)], [('attn', 12)], [('attn', 3)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_xs_patch4_224_hybrid(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=96, token_ratio=0.5, num_heads=3, channel_ratio=2.0) + cfgs['net_arch'] = [[('pass', 3)], [('pass', 4)], [('pass', 2), ('attn', 10)], [('pass', 1), ('attn', 2)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +@register_model +def spach_ms_s_patch4_224_hybrid(pretrained=False, **kwargs): + cfgs = dict(img_size=224, patch_size=4, hidden_dim=128, token_ratio=0.5, num_heads=4, channel_ratio=3.0) + cfgs['net_arch'] = [[('pass', 3)], [('pass', 2), ('attn', 2)], [('pass', 2), ('attn', 10)], [('pass', 1), ('attn', 2)]] + cfgs.update(kwargs) + model = SpachMS(**cfgs) + return model + + +# shift vit +@register_model +def shiftvit_light_tiny(**kwargs): + model = 
ShiftViT(embed_dim=96, depths=(2, 2, 6, 2), mlp_ratio=4, drop_path_rate=0.2, n_div=12) + return model + + +@register_model +def shiftvit_r4_tiny(**kwargs): + model = ShiftViT(embed_dim=96, depths=(2, 2, 12, 3), mlp_ratio=4, drop_path_rate=0.2, n_div=12) + return model + + +@register_model +def shiftvit_r2_tiny(**kwargs): + model = ShiftViT(embed_dim=96, depths=(6, 8, 18, 6), mlp_ratio=2, drop_path_rate=0.2, n_div=12) + return model + + +@register_model +def shiftvit_light_small(**kwargs): + model = ShiftViT(embed_dim=96, depths=(2, 2, 18, 2), mlp_ratio=4, drop_path_rate=0.4, n_div=12) + return model + + +@register_model +def shiftvit_r4_small(**kwargs): + model = ShiftViT(embed_dim=96, depths=(2, 6, 24, 4), mlp_ratio=4, drop_path_rate=0.4, n_div=12) + return model + + +@register_model +def shiftvit_r2_small(**kwargs): + model = ShiftViT(embed_dim=96, depths=(10, 18, 36, 10), mlp_ratio=2, drop_path_rate=0.4, n_div=12) + return model + + +@register_model +def shiftvit_light_base(**kwargs): + model = ShiftViT(embed_dim=128, depths=(2, 2, 18, 2), mlp_ratio=4, drop_path_rate=0.5, n_div=16) + return model + + +@register_model +def shiftvit_r4_base(**kwargs): + model = ShiftViT(embed_dim=128, depths=(4, 6, 22, 4), mlp_ratio=4, drop_path_rate=0.5, n_div=16) + return model + + +@register_model +def shiftvit_r2_base(**kwargs): + model = ShiftViT(embed_dim=128, depths=(10, 18, 36, 10), mlp_ratio=2, drop_path_rate=0.6, n_div=16) + return model diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/shiftvit.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/shiftvit.py new file mode 100644 index 0000000000..6b243f2802 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/shiftvit.py @@ -0,0 +1,365 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from functools import partial + + +class GroupNorm(nn.GroupNorm): + + def __init__(self, num_channels, num_groups=1): + """ We use GroupNorm (group = 1) to approximate LayerNorm + for [N, C, H, W] layout""" + super(GroupNorm, self).__init__(num_groups, num_channels) + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + """ MLP network in FFN. By default, the MLP is implemented by + nn.Linear. However, in our implementation, the data layout is + in format of [N, C, H, W], therefore we use 1x1 convolution to + implement fully-connected MLP layers. 
+ + Args: + in_features (int): input channels + hidden_features (int): hidden channels, if None, set to in_features + out_features (int): out channels, if None, set to in_features + act_layer (callable): activation function class type + drop (float): drop out probability + """ + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ShiftViTBlock(nn.Module): + + def __init__(self, + dim, + n_div=12, + mlp_ratio=4., + drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + input_resolution=None): + """ The building block of Shift-ViT network. + + Args: + dim (int): feature dimension + n_div (int): how many divisions are used. Totally, 4/n_div of + channels will be shifted. + mlp_ratio (float): expand ratio of MLP network. + drop (float): drop out prob. + drop_path (float): drop path prob. + act_layer (callable): activation function class type. + norm_layer (callable): normalization layer class type. + input_resolution (tuple): input resolution. This optional variable + is used to calculate the flops. + + """ + super(ShiftViTBlock, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.mlp_ratio = mlp_ratio + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.n_div = n_div + + def forward(self, x): + x = self.shift_feat(x, self.n_div) + shortcut = x + x = shortcut + self.drop_path(self.mlp(self.norm2(x))) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}," \ + f"input_resolution={self.input_resolution}," \ + f"shift percentage={4.0 / self.n_div * 100}%." 
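+    # shift_feat below is the parameter-free "partial shift" that replaces attention in
+    # ShiftViT: the channels are split into n_div groups and only the first four groups
+    # are shifted by a single pixel (left, right, up, down respectively); all remaining
+    # channels are copied through unchanged. Vacated border positions stay zero because
+    # `out` starts from torch.zeros_like(x).
+    # Minimal sketch (hypothetical toy input, n_div=4 so every channel group is shifted):
+    #     x = torch.arange(16.).reshape(1, 4, 2, 2)
+    #     y = ShiftViTBlock.shift_feat(x, n_div=4)
+    #     # y[:, 0] equals x[:, 0] moved one column to the left, with the freed column zeroed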
+ + @staticmethod + def shift_feat(x, n_div): + B, C, H, W = x.shape + g = C // n_div + out = torch.zeros_like(x) + + out[:, g * 0:g * 1, :, :-1] = x[:, g * 0:g * 1, :, 1:] # shift left + out[:, g * 1:g * 2, :, 1:] = x[:, g * 1:g * 2, :, :-1] # shift right + out[:, g * 2:g * 3, :-1, :] = x[:, g * 2:g * 3, 1:, :] # shift up + out[:, g * 3:g * 4, 1:, :] = x[:, g * 3:g * 4, :-1, :] # shift down + + out[:, g * 4:, :, :] = x[:, g * 4:, :, :] # no shift + return out + + +class PatchMerging(nn.Module): + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Conv2d(dim, 2 * dim, (2, 2), stride=2, bias=False) + self.norm = norm_layer(dim) + + def forward(self, x): + x = self.norm(x) + x = self.reduction(x) + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + +class BasicLayer(nn.Module): + + def __init__(self, + dim, + input_resolution, + depth, + n_div=12, + mlp_ratio=4., + drop=0., + drop_path=None, + norm_layer=None, + downsample=None, + use_checkpoint=False, + act_layer=nn.GELU): + + super(BasicLayer, self).__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + ShiftViTBlock(dim=dim, + n_div=n_div, + mlp_ratio=mlp_ratio, + drop=drop, + drop_path=drop_path[i], + norm_layer=norm_layer, + act_layer=act_layer, + input_resolution=input_resolution) + for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, + dim=dim, + norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}," \ + f"input_resolution={self.input_resolution}," \ + f"depth={self.depth}" + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int, tuple): Image size. + patch_size (int, tuple): Patch token size. + in_chans (int): Number of input image channels. + embed_dim (int): Number of linear projection output channels. + norm_layer (nn.Module, optional): Normalization layer. 
+ """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], + img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, + kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + x = self.proj(x) + if self.norm is not None: + x = self.norm(x) + return x + + +class ShiftViT(nn.Module): + + def __init__(self, + n_div=12, + img_size=224, + patch_size=4, + in_chans=3, + num_classes=1000, + embed_dim=96, + depths=(2, 2, 6, 2), + mlp_ratio=4., + drop_rate=0., + drop_path_rate=0.1, + norm_layer='GN1', + act_layer='GELU', + patch_norm=True, + use_checkpoint=False, + **kwargs): + super().__init__() + assert norm_layer in ('GN1', 'BN') + if norm_layer == 'BN': + norm_layer = nn.BatchNorm2d + elif norm_layer == 'GN1': + norm_layer = partial(GroupNorm, num_groups=1) + else: + raise NotImplementedError + + if act_layer == 'GELU': + act_layer = nn.GELU + elif act_layer == 'RELU': + act_layer = partial(nn.ReLU, inplace=False) + else: + raise NotImplementedError + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = [x.item() + for x in torch.linspace(0, drop_path_rate, sum(depths))] + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + n_div=n_div, + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + mlp_ratio=self.mlp_ratio, + drop=drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + act_layer=act_layer) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.head = nn.Linear(self.num_features, num_classes) \ + if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.Conv1d, nn.Conv2d)): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward_features(self, x): + x = 
self.patch_embed(x) + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + x = self.avgpool(x) # B C 1 + x = torch.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/smlp.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/smlp.py new file mode 100644 index 0000000000..17330cdcba --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/smlp.py @@ -0,0 +1,143 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import torch +from torch import nn +from einops.layers.torch import Rearrange +from timm.models.layers import DropPath + + +class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim, dropout=0.): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, dim), + nn.Dropout(dropout) + ) + + def forward(self, x): + return self.net(x) + + +class BN_Activ_Conv(nn.Module): + def __init__(self, in_channels, activation, out_channels, kernel_size, stride=(1, 1), dilation=(1, 1), groups=1): + super(BN_Activ_Conv, self).__init__() + self.BN = nn.BatchNorm2d(out_channels) + self.Activation = activation + padding = [int((dilation[j] * (kernel_size[j] - 1) - stride[j] + 1) / 2) for j in range(2)] # Same padding + self.Conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups=groups, bias=False) + + def forward(self, img): + img = self.BN(img) + img = self.Activation(img) + img = self.Conv(img) + return img + + +class sMLPBlock(nn.Module): + def __init__(self, W, H, channels): + super().__init__() + assert W == H + self.channels = channels + self.activation = nn.GELU() + self.BN = nn.BatchNorm2d(channels) + self.proj_h = nn.Conv2d(H, H, (1, 1)) + self.proh_w = nn.Conv2d(W, W, (1, 1)) + self.fuse = nn.Conv2d(channels*3, channels, (1,1), (1,1), bias=False) + + def forward(self, x): + x = self.activation(self.BN(x)) + x_h = self.proj_h(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + x_w = self.proh_w(x.permute(0, 2, 1, 3)).permute(0, 2, 1, 3) + x = self.fuse(torch.cat([x, x_h, x_w], dim=1)) + return x + + +class DWConvBlock(nn.Module): + def __init__(self, channels): + super().__init__() + self.conv_merge = BN_Activ_Conv(channels, nn.GELU(), channels, (3, 3), groups=channels) + + def forward(self, img): + img = self.conv_merge(img) + return img + + +class sMLPNet(nn.Module): + + def __init__(self, in_chans=3, dim=80, alpha=3, num_classes=1000, patch_size=4, image_size=224, depths=[2,8,14,2], dp_rate=0., + **kwargs): + super(sMLPNet, self).__init__() + ''' + (B,H,W,C): (B,(image_size// patch_size)**2,dim) + ''' + + assert image_size % patch_size == 0, 'Image dimensions must 
be divisible by the patch size.' + self.num_patch = image_size // patch_size + self.depths = depths + + self.to_patch_embedding = nn.ModuleList([]) + self.token_mix = nn.ModuleList([]) + self.channel_mix = nn.ModuleList([]) + self.drop_path = nn.ModuleList([]) + + net_num_blocks = sum(self.depths) + net_block_idx = 0 + for i in range(len(self.depths)): + ratio = 2 ** i + if i == 0: + self.to_patch_embedding.append(nn.Sequential(nn.Conv2d(in_chans, dim, patch_size, patch_size, bias=False))) + else: + self.to_patch_embedding.append(nn.Sequential(nn.Conv2d(dim * ratio // 2, dim * ratio, 2, 2, bias=False))) + + for j in range(self.depths[i]): + block_dpr = dp_rate * net_block_idx / (net_num_blocks - 1) # stochastic depth linear decay rule + self.drop_path.append(DropPath(block_dpr) if block_dpr > 0. else nn.Identity()) + net_block_idx += 1 + + self.channel_mix.append(nn.Sequential( + Rearrange('b c h w -> b h w c'), + nn.LayerNorm(dim*ratio), + FeedForward(dim*ratio,dim*ratio*alpha), + Rearrange('b h w c -> b c h w')) + ) + + self.token_mix.append(nn.Sequential(DWConvBlock(dim*ratio), sMLPBlock(self.num_patch//ratio, self.num_patch//ratio, dim * ratio))) + + self.batch_norm = nn.BatchNorm2d(dim*2**(len(self.depths)-1)) + + self.mlp_head = nn.Sequential( + nn.Linear(dim * 2**(len(self.depths)-1), num_classes) + ) + + def forward(self, x): + + shift = 0 + for i in range(len(self.depths)): + x = self.to_patch_embedding[i](x) + for j in range(self.depths[i]): + x = x + self.drop_path[j+shift](self.token_mix[j+shift](x)) + x = x + self.drop_path[j+shift](self.channel_mix[j+shift](x)) + shift += self.depths[i] + + x = self.batch_norm(x) + + x = x.mean(dim=[2,3]).flatten(1) + + return self.mlp_head(x) diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/__init__.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/__init__.py new file mode 100644 index 0000000000..a5cd47f745 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from .spach import Spach +from .spach_ms import SpachMS + +__all__ = ['Spach', 'SpachMS'] diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/__init__.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/__init__.py new file mode 100644 index 0000000000..b882ec93e1 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from .channel_func import ChannelMLP +from .spatial_func import DWConv, SPATIAL_FUNC +from .stem import STEM_LAYER + +__all__ = ['ChannelMLP', 'DWConv', 'SPATIAL_FUNC', 'STEM_LAYER'] diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/channel_func.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/channel_func.py new file mode 100644 index 0000000000..b27dee16d0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/channel_func.py @@ -0,0 +1,46 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from torch import nn + + +class ChannelMLP(nn.Module): + """Channel MLP""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., **kwargs): + super(ChannelMLP, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.hidden_features = hidden_features + self.out_features = out_features + + def forward(self, x): + B, N, C = x.shape + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + def flops(self, input_shape): + _, N, C = input_shape + flops = 0 + flops += (C + 1) * self.hidden_features * N + flops += (self.hidden_features + 1) * self.out_features * N + return flops \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/spatial_func.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/spatial_func.py new file mode 100644 index 0000000000..259c56abdb --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/spatial_func.py @@ -0,0 +1,122 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from torch import nn +from einops import rearrange + +from ..misc import Reshape2HW, Reshape2N + + +class SpatialAttention(nn.Module): + """Spatial Attention""" + def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., **kwargs): + super(SpatialAttention, self).__init__() + head_dim = dim // num_heads + + self.num_heads = num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = rearrange(qkv, "b n (three heads head_c) -> three b heads n head_c", three=3, heads=self.num_heads) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) # B, head, N, N + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + out = (attn @ v) # B, head, N, C + out = rearrange(out, "b heads n head_c -> b n (heads head_c)") + + out = self.proj(out) + out = self.proj_drop(out) + + return out + + def flops(self, input_shape): + _, N, C = input_shape + flops = 0 + # qkv + flops += 3 * C * C * N + # q@k + flops += N ** 2 * C + # attn@v + flops += N ** 2 * C + # proj + flops += C * C * N + return flops + +class SpatialMLP(nn.Module): + """Spatial MLP""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., **kwargs): + super(SpatialMLP, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + self.hidden_features = hidden_features + self.out_features = out_features + + def forward(self, x): + B, N, C = x.shape + x = x.transpose(1, 2) + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + x = x.transpose(1, 2) + return x + + def flops(self, input_shape): + _, N, C = input_shape + flops = 0 + flops += (N + 1) * self.hidden_features * C + flops += (self.hidden_features + 1) * self.out_features * C + return flops + + +class DWConv(nn.Module): + def __init__(self, dim, kernel_size=3): + super(DWConv, self).__init__() + self.dim = dim + self.kernel_size = kernel_size + + padding = (kernel_size - 1) // 2 + self.net = nn.Sequential(Reshape2HW(), + nn.Conv2d(dim, dim, kernel_size, 1, padding, groups=dim), + Reshape2N()) + + + def forward(self, x): + x = self.net(x) + return x + + def flops(self, input_shape): + _, N, C = input_shape + flops = N * self.dim * (3 * 3 + 1) + return flops + + +SPATIAL_FUNC = {'attn': SpatialAttention, 'mlp': SpatialMLP, 'pass': None} diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/stem.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/stem.py new file mode 100644 index 0000000000..056a7b7e8c --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/layers/stem.py @@ -0,0 +1,110 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from torch import nn + +from timm.models.layers import to_2tuple + +from ..misc import check_upstream_shape + + +class PatchEmbed(nn.Module): + """1-conv patch embedding layer""" + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, downstream=False): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.downstream = downstream + self.img_size = img_size + self.patch_size = patch_size + self.stem_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.stem_shape[0] * self.stem_shape[1] + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.out_size = None + + # for flops + self.in_chans = in_chans + self.embed_dim = embed_dim + + def forward(self, x): + if not self.downstream: + check_upstream_shape(x, self.img_size) + x = self.proj(x) + return x + + def flops(self, input_shape=None): + flops = self.num_patches * self.embed_dim * (sum(self.patch_size) * self.in_chans + 1) # Ho*Wo*Co*(K^2*Ci+1) + return flops + + +class Conv4PatchEmbed(nn.Module): + """4-conv patch embedding layer""" + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, downstream=False, hidden_chans=64): + super(Conv4PatchEmbed, self).__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.downstream = downstream + self.img_size = img_size + self.patch_size = patch_size + self.stem_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.stem_shape[0] * self.stem_shape[1] + + sub_patch_size = (patch_size[0]//2, patch_size[1]//2) + + self.proj = nn.Sequential( + nn.Conv2d(in_chans, hidden_chans, kernel_size=7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(hidden_chans), + nn.ReLU(), + nn.Conv2d(hidden_chans, hidden_chans, 3, 1, 1, bias=False), + nn.BatchNorm2d(hidden_chans), + nn.ReLU(), + nn.Conv2d(hidden_chans, hidden_chans, 3, 1, 1, bias=False), + nn.BatchNorm2d(hidden_chans), + nn.ReLU(), + nn.Conv2d(hidden_chans, embed_dim, kernel_size=sub_patch_size, stride=sub_patch_size) + ) + + # for flops + self.inside_num_patches = self.num_patches * sum(sub_patch_size) + self.in_chans = in_chans + self.new_patch_size = sub_patch_size + self.embed_dim = embed_dim + self.hidden_chans = hidden_chans + + def forward(self, x): + if not self.downstream: + check_upstream_shape(x, self.img_size) + x = self.proj(x) + return x + + def flops(self, input_shape=None): + flops = 0 + flops += self.inside_num_patches * self.hidden_chans * self.in_chans * 7 * 7 # Ho*Wo*Co*K^2*Ci+1 + flops += self.inside_num_patches * self.hidden_chans + + flops += self.inside_num_patches * self.hidden_chans * self.hidden_chans * 3 * 3 + flops += self.inside_num_patches * self.hidden_chans + + flops += self.inside_num_patches * self.hidden_chans * self.hidden_chans * 3 * 3 + flops += self.inside_num_patches * self.hidden_chans + + flops += self.num_patches * self.embed_dim * (sum(self.new_patch_size)*self.hidden_chans + 1) # 
Ho*Wo*Co*(K^2*Ci+1) + + return flops + + +STEM_LAYER = {'conv1': PatchEmbed, 'conv4': Conv4PatchEmbed} diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/misc.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/misc.py new file mode 100644 index 0000000000..4c28bca9c8 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/misc.py @@ -0,0 +1,92 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from functools import partial + +from torch import nn +from einops import rearrange + +from timm.models.layers import to_2tuple + + +def check_upstream_shape(x, img_size=(224, 224)): + _, _, H, W = x.shape + assert H == img_size[0] and W == img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({img_size[0]}*{img_size[1]})." + + +def reshape2n(x): + return rearrange(x, 'b c h w -> b (h w) c') + + +def reshape2hw(x, hw=None): + n = x.shape[1] + if hw is None: + hw = to_2tuple(int(n ** 0.5)) + assert n == hw[0] * hw[1], f"N={n} is not equal to H={hw[0]}*W={hw[1]}" + return rearrange(x, 'b (h w) c -> b c h w', h=hw[0]) + + +def downsample_conv(in_channels, out_channels, kernel_size=2, stride=2, padding=0, dilation=1, norm_layer=None): + assert norm_layer is None, "only support default normalization" + norm_layer = norm_layer or partial(nn.GroupNorm, num_groups=1, num_channels=out_channels) + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + dilation = dilation if kernel_size > 1 else 1 + return nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, + dilation=dilation, bias=False), + norm_layer() + ) + + +class Reshape2N(nn.Module): + def __init__(self): + super(Reshape2N, self).__init__() + + def forward(self, x): + return reshape2n(x) + + +class Reshape2HW(nn.Module): + def __init__(self, hw=None): + super(Reshape2HW, self).__init__() + self.hw = hw + + def forward(self, x): + return reshape2hw(x, self.hw) + + +class DownsampleConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=2, stride=2, padding=0, dilation=1, norm_layer=None): + super(DownsampleConv, self).__init__() + self.net = nn.Sequential( + Reshape2HW(), + downsample_conv(in_channels, out_channels, kernel_size, stride, padding, dilation, norm_layer), + Reshape2N() + ) + + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + + def forward(self, x): + return self.net(x) + + def flops(self, input_shape): + _, N, C = input_shape # C == out_channels + flops = 0 + flops += N * self.out_channels * self.in_channels * self.kernel_size**2 + flops += N * self.out_channels + return flops diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach.py 
b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach.py new file mode 100644 index 0000000000..63874d69a4 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach.py @@ -0,0 +1,201 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from functools import partial + +import torch +from torch import nn +from timm.models.layers import DropPath +from einops.layers.torch import Reduce + +from .layers import DWConv, SPATIAL_FUNC, ChannelMLP, STEM_LAYER +from .misc import reshape2n + + +class MixingBlock(nn.Module): + def __init__(self, dim, + spatial_func=None, scaled=True, init_values=1e-4, shared_spatial_func=False, + norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop_path=0., cpe=True, + num_heads=None, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., # attn + in_features=None, hidden_features=None, drop=0., # mlp + channel_ratio=2.0 + ): + super(MixingBlock, self).__init__() + + spatial_kwargs = dict(act_layer=act_layer, + in_features=in_features, hidden_features=hidden_features, drop=drop, # mlp + dim=dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=proj_drop # attn + ) + + self.valid_spatial_func = True + + if spatial_func is not None: + if shared_spatial_func: + self.spatial_func = spatial_func + else: + self.spatial_func = spatial_func(**spatial_kwargs) + self.norm1 = norm_layer(dim) + if scaled: + self.gamma_1 = nn.Parameter(init_values * torch.ones(1, 1, dim), requires_grad=True) + else: + self.gamma_1 = 1. + else: + self.valid_spatial_func = False + + self.channel_func = ChannelMLP(in_features=dim, hidden_features=int(dim*channel_ratio), act_layer=act_layer, + drop=drop) + + self.norm2 = norm_layer(dim) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + + self.cpe = cpe + if cpe: + self.cpe_net = DWConv(dim) + + + def forward(self, x): + in_x = x + if self.valid_spatial_func: + x = x + self.drop_path(self.gamma_1 * self.spatial_func(self.norm1(in_x))) + if self.cpe: + x = x + self.cpe_net(in_x) + + x = x + self.drop_path(self.channel_func(self.norm2(x))) + + return x + + def flops(self, input_shape): + _, N, C = input_shape + flops = 0 + if self.valid_spatial_func: + flops += self.spatial_func.flops(input_shape) + flops += N * C * 2 # norm + skip + if self.cpe: + flops += self.cpe_net.flops(input_shape) + + flops += self.channel_func.flops(input_shape) + flops += N * C * 2 + return flops + + +class Spach(nn.Module): + def __init__(self, + num_classes=1000, + img_size=224, + in_chans=3, + hidden_dim=384, + patch_size=16, + net_arch=None, + act_layer=nn.GELU, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + stem_type='conv1', + scaled=True, init_values=1e-4, drop_path_rate=0., cpe=True, shared_spatial_func=False, # mixing block + num_heads=12, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., # attn + token_ratio=0.5, channel_ratio=2.0, drop_rate=0., # mlp + downstream=False, + **kwargs + ): + super(Spach, self).__init__() + self.num_classes = num_classes + self.hidden_dim = hidden_dim + self.downstream = downstream + + self.stem = STEM_LAYER[stem_type]( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=hidden_dim, downstream=downstream) + self.norm1 = norm_layer(hidden_dim) + + block_kwargs = dict(dim=hidden_dim, scaled=scaled, init_values=init_values, cpe=cpe, + shared_spatial_func=shared_spatial_func, norm_layer=norm_layer, act_layer=act_layer, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=proj_drop, # attn + in_features=self.stem.num_patches, hidden_features=int(self.stem.num_patches * token_ratio), channel_ratio=channel_ratio, drop=drop_rate) # mlp + + self.blocks = self.make_blocks(net_arch, block_kwargs, drop_path_rate, shared_spatial_func) + self.norm2 = norm_layer(hidden_dim) + + if not downstream: + self.pool = Reduce('b n c -> b c', reduction='mean') + self.head = nn.Linear(hidden_dim, self.num_classes) + + self.init_weights() + + def make_blocks(self, net_arch, block_kwargs, drop_path, shared_spatial_func): + if shared_spatial_func: + assert len(net_arch) == 1, '`shared_spatial_func` only support unitary spatial function' + assert net_arch[0][0] != 'pass', '`shared_spatial_func` do not support pass' + spatial_func = SPATIAL_FUNC[net_arch[0][0]](**block_kwargs) + else: + spatial_func = None + blocks = [] + for func_type, depth in net_arch: + for i in range(depth): + blocks.append(MixingBlock(spatial_func=spatial_func or SPATIAL_FUNC[func_type], drop_path=drop_path, + **block_kwargs)) + return nn.Sequential(*blocks) + + def init_weights(self): + for n, m in self.named_modules(): + _init_weights(m, n) + + def forward_features(self, x): + x = self.stem(x) + x = reshape2n(x) + x = self.norm1(x) + + x = self.blocks(x) + x = self.norm2(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.pool(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + shape = (1, self.stem.num_patches, self.hidden_dim) + # stem + flops += self.stem.flops() + flops += sum(shape) + # blocks + flops += sum([i.flops(shape) for i in self.blocks]) + flops += sum(shape) + # head + flops += self.hidden_dim * self.num_classes + return flops + + +def _init_weights(m, n: str): + if isinstance(m, nn.Linear): + if 
n.startswith('head'): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + else: + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + if 'mlp' in n: + nn.init.normal_(m.bias, std=1e-6) + else: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach_ms.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach_ms.py new file mode 100644 index 0000000000..d4d769dd84 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/models/spach/spach_ms.py @@ -0,0 +1,147 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from functools import partial + +from torch import nn +from einops.layers.torch import Reduce + +from .spach import MixingBlock, _init_weights +from .layers import STEM_LAYER, SPATIAL_FUNC +from .misc import DownsampleConv, reshape2n + + +class SpachMS(nn.Module): + def __init__(self, + num_classes=1000, + img_size=224, + in_chans=3, + hidden_dim=384, + patch_size=16, + net_arch=None, + act_layer=nn.GELU, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + stem_type='conv1', + scaled=True, init_values=1e-4, drop_path_rate=0., cpe=True, shared_spatial_func=False, # mixing block + num_heads=12, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., # attn + token_ratio=0.5, channel_ratio=2.0, drop_rate=0., # mlp + downstream=False, + **kwargs + ): + super(SpachMS, self).__init__() + assert len(net_arch) == 4 + self.num_classes = num_classes + self.hidden_dim = hidden_dim + self.downstream = downstream + + self.stem = STEM_LAYER[stem_type]( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=hidden_dim, downstream=downstream) + self.norm1 = norm_layer(hidden_dim) + + block_kwargs = dict(scaled=scaled, init_values=init_values, cpe=cpe, + shared_spatial_func=shared_spatial_func, norm_layer=norm_layer, act_layer=act_layer, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=proj_drop, # attn + channel_ratio=channel_ratio, drop=drop_rate) # mlp + + stage_modules = self.make_blocks(hidden_dim, self.stem.num_patches, net_arch, block_kwargs, drop_path_rate, + shared_spatial_func, token_ratio) + for stage in stage_modules: + self.add_module(*stage) + hidden_dim = hidden_dim * 8 + self.norm2 = norm_layer(hidden_dim) + + if not downstream: + self.pool = Reduce('b n c -> b c', reduction='mean') + self.head = nn.Linear(hidden_dim, self.num_classes) + + self.init_weights() + + def make_blocks(self, dim, seq_len, net_arch, block_kwargs, drop_path, 
shared_spatial_func, token_ratio): + stages = [] + num_blocks = sum(sum([depth for _, depth in stage_arch]) for stage_arch in net_arch) + block_idx = 0 + + for stage_idx, stage_arch in enumerate(net_arch): + stage_name = f'layer{stage_idx + 1}' + blocks = [] + if stage_idx > 0: + down_kwargs = dict(in_channels=dim, out_channels=dim * 2) + downsample = DownsampleConv(**down_kwargs) + blocks.append(downsample) + dim = dim * 2 + seq_len = seq_len // 4 + + block_kwargs.update(dict(dim=dim, in_features=seq_len, hidden_features=int(seq_len * token_ratio))) + + if stage_idx > 0 and shared_spatial_func: + assert len(stage_arch) == 1, '`shared_spatial_func` only support unitary spatial function' + assert stage_arch[0][0] != 'pass', '`shared_spatial_func` do not support pass' + spatial_func = SPATIAL_FUNC[stage_arch[0][0]](**block_kwargs) + else: + spatial_func = None + + for func_type, depth in stage_arch: + for i in range(depth): + block_dpr = drop_path * block_idx / (num_blocks - 1) # stochastic depth linear decay rule + blocks.append(MixingBlock(spatial_func=spatial_func or SPATIAL_FUNC[func_type], drop_path=block_dpr, + **block_kwargs)) + block_idx += 1 + stages.append((stage_name, nn.Sequential(*blocks))) + + return stages + + def init_weights(self): + for n, m in self.named_modules(): + _init_weights(m, n) + + def forward_features(self, x): + x = self.stem(x) + x = reshape2n(x) + x = self.norm1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.norm2(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.pool(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + shape = (1, self.stem.num_patches, self.hidden_dim) + # stem + flops += self.stem.flops() + flops += sum(shape) + # layer1,2,3,4 + flops += sum([i.flops(shape) for i in self.layer1]) + shape = (1, self.stem.num_patches//4, self.hidden_dim*2) + flops += sum([i.flops(shape) for i in self.layer2]) + shape = (1, self.stem.num_patches//16, self.hidden_dim*4) + flops += sum([i.flops(shape) for i in self.layer3]) + shape = (1, self.stem.num_patches//64, self.hidden_dim*8) + flops += sum([i.flops(shape) for i in self.layer4]) + flops += sum(shape) + # head + flops += self.hidden_dim * 8 * self.num_classes + return flops \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/requirements.txt b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/requirements.txt new file mode 100644 index 0000000000..825d68d5d0 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/requirements.txt @@ -0,0 +1,4 @@ +torch==1.7.0 +torchvision==0.8.1 +timm==0.3.2 +einops==0.3.2 diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/samplers.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/samplers.py new file mode 100644 index 0000000000..8ab355d572 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/samplers.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +import torch +import torch.distributed as dist +import math + + +class RASampler(torch.utils.data.Sampler): + """Sampler that restricts data loading to a subset of the dataset for distributed, + with repeated augmentation. + It ensures that different each augmented version of a sample will be visible to a + different process (GPU) + Heavily based on torch.utils.data.DistributedSampler + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) + self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices = [ele for ele in indices for i in range(3)] + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices[:self.num_selected_samples]) + + def __len__(self): + return self.num_selected_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/env_npu.sh b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000..a975f7978c --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/env_npu.sh @@ -0,0 +1,79 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export 
LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启2个非连续combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置是否开启3个非连续combined标志,0-关闭/1-开启 +export TRI_COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +# HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +# HCCL默认超时时间120s较少,修改为1800s对齐PyTorch默认设置 +export HCCL_CONNECT_TIMEOUT=1800 + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if 
match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_1p.sh new file mode 100644 index 0000000000..e9d7657cec --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_1p.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="shiftvit_light_tiny" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/dataset/imagenet/" + +# 训练最大iter数 +max_iter=10010 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
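+    # after stepping out of test/, re-resolve cur_path so it points at the directory that contains test/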
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +echo ${pwd} + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID} +else + mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID} +fi + +# 变量 +export SPACH_DATASETS=${data_path} +export PYTHONPATH=./:$PYTHONPATH + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh + export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' +fi + +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 0) +do + if [ $(uname -m) = "aarch64" ] + then + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END \ + python3.7 -u main.py \ + --model shiftvit_light_tiny \ + --data-path ${data_path} \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + --npu \ + --num_workers 16\ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_full_1p_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -u main.py \ + --model shiftvit_light_tiny \ + --data-path ${data_path} \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + --npu \ + --num_workers 16\ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_full_1p_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'FPS:'| awk '{sum+=$10} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance FPS : ${FPS}" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +AvgFPS=${FPS} + +#最后一个迭代loss值 +MinLoss=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Averaged stats:' | awk 'BEGIN {min = 65536} {if ($12+0 < min+0) min=$12} END {print min}'` +MaxAccuracy=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Max accuracy' | awk 'BEGIN {max = 0} {if ($9+0 > max+0) max=$9} END {print max}'` +echo "MaxAccuracy = ${MaxAccuracy}" +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000..24e0a5cdde --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="shiftvit_light_tiny" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/dataset/imagenet/" + +# 训练最大iter数 +max_iter=1210 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +echo ${pwd} + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +# 变量 +export SPACH_DATASETS=${data_path} +export PYTHONPATH=./:$PYTHONPATH + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh + # source /home/wangchy/SpanBERT/code/env.sh + export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' +fi + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} + +lscpu_out=$(lscpu) +n_sockets=4 +n_cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "num_sockets = ${n_sockets} cores_per_socket=${n_cores_per_socket}" + +export PYTHONPATH=../:$PYTHONPATH + +python3.7 -u -m bind_pyt \ + --nsockets_per_node ${n_sockets} \ + --ncores_per_socket ${n_cores_per_socket} \ + --master_addr $(hostname -I |awk '{print $1}') \ + --no_hyperthreads \ + --no_membind "$@" main.py \ + --model shiftvit_light_tiny \ + --npu \ + --data-path ${data_path} \ + --pin-mem \ + --dist-eval \ + --num_workers 16 \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_full_8p_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'FPS:'| awk '{sum+=$10} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance FPS : ${FPS}" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +AvgFPS=${FPS} + +#最后一个迭代loss值 +MinLoss=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Averaged stats:' | awk 'BEGIN {min = 65536} {if ($12+0 < min+0) min=$12} END {print min}'` +MaxAccuracy=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Max accuracy' | awk 'BEGIN {max = 0} {if ($9+0 > max+0) max=$9} END {print max}'` +echo "MaxAccuracy = ${MaxAccuracy}" +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000..592efbc030 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="shiftvit_light_tiny" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/dataset/imagenet/" + +# 训练最大iter数 +max_iter=10010 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +echo ${pwd} + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID} +else + mkdir -p ${test_path_dir}/output/${ASCEND_DEVICE_ID} +fi + +# 变量 +export SPACH_DATASETS=${data_path} +export PYTHONPATH=./:$PYTHONPATH + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh + export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' +fi + +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 0) +do + if [ $(uname -m) = "aarch64" ] + then + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END \ + python3.7 -u main.py \ + --model shiftvit_light_tiny \ + --data-path ${data_path} \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + --npu \ + --num_workers 16\ + --epochs 1 \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_perf_1p_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -u main.py \ + --model shiftvit_light_tiny \ + --data-path ${data_path} \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + --npu \ + --num_workers 16\ + --epochs 1 \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_perf_1p_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'FPS:'| awk '{sum+=$10} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance FPS : ${FPS}" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +AvgFPS=${FPS} + +#最后一个迭代loss值 +MinLoss=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Averaged stats:' | awk 'BEGIN {min = 65536} {if ($12+0 < min+0) min=$12} END {print min}'` +MaxAccuracy=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Max accuracy' | awk 'BEGIN {max = 0} {if ($9+0 > max+0) max=$9} END {print max}'` +echo "MaxAccuracy = ${MaxAccuracy}" +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000..db64bb23d2 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="shiftvit_light_tiny" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/opt/npu/dataset/imagenet/" + +# 训练最大iter数 +max_iter=1210 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +echo ${pwd} + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +# 变量 +export SPACH_DATASETS=${data_path} +export PYTHONPATH=./:$PYTHONPATH + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh + export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' +fi + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} + +lscpu_out=$(lscpu) +n_sockets=4 +n_cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "num_sockets = ${n_sockets} cores_per_socket=${n_cores_per_socket}" + +export PYTHONPATH=../:$PYTHONPATH + +python3.7 -u -m bind_pyt \ + --nsockets_per_node ${n_sockets} \ + --ncores_per_socket ${n_cores_per_socket} \ + --master_addr $(hostname -I |awk '{print $1}') \ + --no_hyperthreads \ + --no_membind "$@" main.py \ + --model shiftvit_light_tiny \ + --npu \ + --data-path ${data_path} \ + --pin-mem \ + --dist-eval \ + --num_workers 16 \ + --epochs 5 \ + --output_dir ${test_path_dir}/output/${ASCEND_DEVICE_ID} \ + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_perf_8p_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'FPS:'| awk '{sum+=$10} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance FPS : ${FPS}" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +AvgFPS=${FPS} + +#最后一个迭代loss值 +MinLoss=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Averaged stats:' | awk 'BEGIN {min = 65536} {if ($12+0 < min+0) min=$12} END {print min}'` +MaxAccuracy=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep 'Max accuracy' | awk 'BEGIN {max = 0} {if ($9+0 > max+0) max=$9} END {print max}'` +echo "MaxAccuracy = ${MaxAccuracy}" +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "AvgFPS = ${AvgFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MinLoss = ${MinLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "MaxAccuracy = ${MaxAccuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/utils.py b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/utils.py new file mode 100644 index 0000000000..e117062cfb --- /dev/null +++ b/PyTorch/contrib/cv/classification/ShiftViT_for_PyTorch/utils.py @@ -0,0 +1,298 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +""" +Misc functions, including distributed helpers. + +Mostly copy-paste from torchvision references. +""" +import io +import os +import time +from collections import defaultdict, deque +import datetime +import logging + +import torch +import torch.distributed as dist + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self, use_npu=False, device='cuda'): + """ + Warning: does not synchronize the deque! 
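+        Note: with dist.all_reduce left commented out below, count/total stay process-local.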
+ """ + if not is_dist_avail_and_initialized(): + return + if use_npu: + t = torch.tensor([self.count, self.total], dtype=torch.float64, device=device) + else: + t = torch.tensor([self.count, self.total], dtype=torch.float64, device=device) + + dist.barrier() + # dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t", logger=logging): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + self.logger = logger + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self, use_npu=False, device='cuda'): + for meter in self.meters.values(): + meter.synchronize_between_processes(use_npu=use_npu, device=device) + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, batch_size, header=None, use_npu=False): + i = 0 + if not header: + header = '' + start_time = time.time() + skip_pre_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if use_npu: + if torch.npu.is_available(): + log_msg.append('max mem: {memory:.0f}') + else: + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for i, obj in enumerate(iterable): + if i == 3: + skip_pre_time = time.time() + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if use_npu: + if torch.npu.is_available(): + self.logger.info(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.npu.max_memory_allocated() / MB)) + else: + self.logger.info(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + else: + if torch.cuda.is_available(): + self.logger.info(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + 
memory=torch.cuda.max_memory_allocated() / MB)) + else: + self.logger.info(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + FPS_valid_time = time.time() - skip_pre_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + self.logger.info('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + self.logger.info('iters num: {}, batch_size: {}, world_size: {}'.format( + len(iterable), batch_size, get_world_size())) + self.logger.info('{} FPS: {} ({:.4f} s / it)'.format( + header, float(len(iterable) * batch_size * get_world_size()) / float(FPS_valid_time), float(FPS_valid_time) / float(len(iterable)))) + + +def _load_checkpoint_for_ema(model_ema, checkpoint): + """ + Workaround for ModelEma._load_checkpoint to accept an already-loaded object + """ + mem_file = io.BytesIO() + torch.save(checkpoint, mem_file) + mem_file.seek(0) + model_ema._load_checkpoint(mem_file) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + if args.npu: + args.gpu = args.rank % torch.npu.device_count() + else: + args.gpu = args.rank % torch.cuda.device_count() + elif 'OMPI_COMM_WORLD_SIZE' in os.environ and 'OMPI_COMM_WORLD_RANK' in os.environ: + args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + args.rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT']) + print(f'dist train on amlk8s| word_size {args.world_size} | rank {args.rank} | gpu {args.gpu} | dist_url {args.dist_url}') + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + if args.npu: + torch.distributed.init_process_group(backend='hccl', + world_size=args.world_size, rank=args.rank) + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + print('| distributed init (rank {}): {}'.format(args.rank, args.dist_url), flush=True) + else: + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + -- Gitee
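
Usage note (not part of the patch): the sketch below shows one way the RASampler added in samplers.py could be wired into a DataLoader when the process group is not initialized, so num_replicas and rank are passed explicitly. The FakeData dataset, world size of 8, rank 0 and batch size of 128 are placeholder assumptions for illustration only, and the snippet is assumed to run from the ShiftViT_for_PyTorch directory so that samplers is importable.

    import torch
    from torchvision import datasets, transforms

    from samplers import RASampler

    # stand-in dataset for illustration; the real scripts point --data-path at ImageNet
    dataset = datasets.FakeData(size=1024, transform=transforms.ToTensor())

    # pass num_replicas/rank explicitly so no torch.distributed init is needed here
    sampler = RASampler(dataset, num_replicas=8, rank=0, shuffle=True)
    loader = torch.utils.data.DataLoader(dataset, batch_size=128, sampler=sampler, num_workers=4)

    for epoch in range(2):
        sampler.set_epoch(epoch)        # reseeds the deterministic per-epoch shuffle
        for images, targets in loader:  # each sample is repeated 3x and split across ranks
            pass                        # forward/backward would go here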