From fd4fa5440ec0fc826acf5601b17355456f057d38 Mon Sep 17 00:00:00 2001 From: zhangyanmin <2716635239@qq.com> Date: Wed, 21 Sep 2022 09:34:03 +0000 Subject: [PATCH 1/3] Update cla MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit put Evo-Levit_256_384 into correct file_path update Readme and test updata performance 1P and 8P sh 删除文件 Evo-Levit_256_384 push into one commit delete custom_tune_bank and modified test/env_npu.sh support 1P obtain the fps of 1000 steps print 1000 fps then exit env_npu add FPS in full.sh update Readme update right Readme and fps --- .../classification/Evo-Levit_256_384/LICENSE | 13 + .../Evo-Levit_256_384/README.md | 154 ++++ .../Evo-Levit_256_384/benchmark.py | 486 +++++++++++ .../Evo-Levit_256_384/datasets.py | 185 ++++ .../Evo-Levit_256_384/engine_levit.py | 214 +++++ .../Evo-Levit_256_384/fix_timm/mixup.py | 319 +++++++ .../fix_timm/optim_factory.py | 187 ++++ .../Evo-Levit_256_384/levit/evo_levit.py | 788 +++++++++++++++++ .../Evo-Levit_256_384/levit/evo_levit_384.py | 813 ++++++++++++++++++ .../Evo-Levit_256_384/levit/losses_levit.py | 127 +++ .../Evo-Levit_256_384/main_levit.py | 506 +++++++++++ .../Evo-Levit_256_384/samplers.py | 74 ++ .../Evo-Levit_256_384/test/env_npu.sh | 68 ++ .../Evo-Levit_256_384/test/train_full_1P.sh | 116 +++ .../Evo-Levit_256_384/test/train_full_8P.sh | 115 +++ .../test/train_performance_1P.sh | 113 +++ .../test/train_performance_8P.sh | 127 +++ .../classification/Evo-Levit_256_384/utils.py | 277 ++++++ .../Evo-Levit_256_384/visualize.py | 599 +++++++++++++ 19 files changed, 5281 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/LICENSE create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/benchmark.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/datasets.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/engine_levit.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/mixup.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/optim_factory.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit_384.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/losses_levit.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/main_levit.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/samplers.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_1P.sh create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_8P.sh create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_1P.sh create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_8P.sh create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/utils.py create mode 100644 PyTorch/contrib/cv/classification/Evo-Levit_256_384/visualize.py diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/LICENSE b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/LICENSE new file mode 100644 index 0000000000..325b1ea4fa --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/LICENSE
@@ -0,0 +1,13 @@
+Copyright 2021 Huawei Technologies Co., Ltd
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md
new file mode 100644
index 0000000000..51c8512781
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md
@@ -0,0 +1,154 @@
+# Evo-Levit for PyTorch
+
+[TOC]
+
+# Overview
+
+## Summary
+
+Evo-ViT consists of two modules: token selection based on global class attention, and slow-fast two-stream token updating. Tokens are ranked by global class attention and split into informative and uninformative ones; the uninformative tokens are merged into a single summary token, which is fed together with the informative tokens into the original multi-head self-attention (MSA) and feed-forward network (FFN) modules for fine-grained updating. The updated summary token is then used to quickly update the uninformative tokens, and the global class attention is refreshed during the fine-grained update as well.
+
+- Reference implementation
+
+```
+url = https://github.com/YifanXu74/Evo-ViT.git
+```
+
+- Implementation adapted for Ascend AI Processors
+- Obtain the code through Git as follows
+
+```
+git clone {url}        # clone the repository
+cd {code_path}         # switch to the directory containing the model code
+```
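+
+The sketch below is a minimal illustration of the token selection and merging idea described above. It is not the code shipped in this repository (see `levit/evo_levit.py` for the real `CatModule`/`StageModule` implementation); the function and variable names are made up for the example.
+
+```
+import torch
+
+def select_and_merge_tokens(x, cls_attn, keep_ratio=0.5):
+    # x: (B, N, C) patch tokens; cls_attn: (B, N) global class attention per token
+    B, N, C = x.shape
+    n_keep = int(N * keep_ratio)
+    order = torch.argsort(cls_attn, dim=1, descending=True)
+    x_sorted = torch.gather(x, 1, order.unsqueeze(-1).expand(-1, -1, C))
+    attn_sorted = torch.gather(cls_attn, 1, order)
+    informative, uninformative = x_sorted[:, :n_keep], x_sorted[:, n_keep:]
+    # merge the low-information tokens into one summary token, weighted by their attention
+    w = attn_sorted[:, n_keep:]
+    w = w / w.sum(dim=1, keepdim=True)
+    summary = (w.unsqueeze(-1) * uninformative).sum(dim=1, keepdim=True)
+    return informative, summary, uninformative
+
+# The informative tokens plus the summary token are refined by MSA/FFN (slow path);
+# the updated summary token is then broadcast back onto the dropped tokens (fast path).
+x = torch.randn(2, 196, 384)
+cls_attn = torch.rand(2, 196)
+kept, summary, dropped = select_and_merge_tokens(x, cls_attn)
+```
+
+In the actual model (`CatModule` in `levit/evo_levit.py`), the summary token also feeds residual updates from the attention and FFN blocks back to the dropped tokens through a skip connection.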
+
+# Preparing the Training Environment
+
+## Prepare the Environment
+
+- The firmware/driver, CANN, and PyTorch versions supported by this model are listed in the table below
+
+  **Table 1** Version compatibility
+
+| Component | Version |
+| ---------- | ------------------------------------------------------------ |
+| Firmware & Driver | [1.0.12](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+| CANN | [5.0.3](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+| PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) |
+
+- Install dependencies
+
+```
+pip install timm==0.4.12
+pip install torchvision==0.9.1
+pip install torch_npu-1.8.1rc2.20220607-cp37-cp37m-linux_aarch64.whl
+pip install torch-1.8.1+ascend.rc2.20220607-cp37-cp37m-linux_aarch64.whl
+pip install apex-0.1+ascend.20220607-cp37-cp37m-linux_aarch64.whl
+```
+
+- NPU optimization patches for the timm package
+
+```
+# cd to this directory first; timm is usually installed under /usr/local/lib/python3.7/dist-packages/timm/
+# generate the patches, then apply them to the installed package
+diff -uN {timm_path}/data/mixup.py {code_path}/fix_timm/mixup.py >mixup.patch
+diff -uN {timm_path}/optim/optim_factory.py {code_path}/fix_timm/optim_factory.py >optim.patch
+patch -p0 {timm_path}/data/mixup.py mixup.patch
+patch -p0 {timm_path}/optim/optim_factory.py optim.patch
+```
+
+## Dataset
+
+1. Obtain the dataset
+
+   The dataset used is ImageNet. Obtain it yourself, upload it to any path on the server, and extract it.
+
+   The expected ImageNet directory structure is as follows
+
+```
+├── ImageNet2012
+    ├──train
+        ├──class 1
+        │──image 1
+        │──image 2
+        │   ...
+        ├──class 2
+        │──image 1
+        │──image 2
+        │   ...
+        ├──...
+    ├──val
+        ├──class 1
+        │──image 1
+        │──image 2
+        │   ...
+        ├──class 2
+        │──image 1
+        │──image 2
+        │   ...
+```
+
+## Obtain the Teacher Checkpoint
+
+Evo-ViT training requires a teacher model. The checkpoint can be downloaded from the GitHub repository [Evo-ViT](https://github.com/YifanXu74/Evo-ViT), or directly from the following URL:
+https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth
+
+# Starting Training
+
+## Train the Model
+
+1. Enter the root directory of the extracted source package
+
+```
+cd /Evo-Levit_256_384
+```
+
+2. Run the training script
+
+The model supports single-node single-card and single-node 8-card training. Before starting, set the data_path argument to the actual dataset path.
+
+- Single-node single-card training
+
+   Start 1-card training
+
+```
+bash ./test/train_full_1P.sh --data_path=/home/zym/imagenet/
+```
+
+- Single-node 8-card training
+
+   Start 8-card training
+
+```
+bash ./test/train_full_8P.sh --data_path=/home/zym/imagenet/
+```
+
+After training completes, the checkpoint files are saved under the path given in the script's parameter settings, and the training accuracy and performance are printed.
+
+# Training Results
+
+**Table 2** Training results
+
+| NAME | PT Version | Accuracy | FPS | Epochs | AMP_Type |
+| ------ | ------ | ----- | ---- | ------ | -------- |
+| 1P-GPU | 1.8.1 | - | 51 | 1 | O1 |
+| 1P-NPU | 1.8.1 | - | 59 | 1 | O1 |
+| 8P-GPU | 1.8.1 | 73.54 | 487 | 100 | O1 |
+| 8P-NPU | 1.8.1 | 74.11 | 496 | 100 | O1 |
+
+
+
+# Release Notes
+
+## Changes
+
+2022.09.17: first release
+
+2022.10.21: added the teacher checkpoint URL and updated the bash commands
+
+2022.11.09: updated the 1P NPU FPS to 59 to match the device
+
+# Known Issues
+
+
+
diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/benchmark.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/benchmark.py
new file mode 100644
index 0000000000..16ec6a68b3
--- /dev/null
+++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/benchmark.py
@@ -0,0 +1,486 @@
+# encoding=utf-8
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import csv
+import json
+import time
+import logging
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+from collections import OrderedDict
+from contextlib import suppress
+from functools import partial
+
+from timm.models import create_model, is_model, list_models
+from timm.optim import create_optimizer
+from timm.data import resolve_data_config
+from timm.utils import AverageMeter, setup_default_logging
+
+from deit import evo_deit
+from levit import evo_levit
+from levit import evo_levit_384
+
+has_apex = False
+try:
+    from apex import amp
+
+    has_apex = True
+except ImportError:
+    pass
+
+has_native_amp = False
+try:
+    if getattr(torch.cuda.amp, 'autocast') is not None:
+        has_native_amp = True
+except AttributeError:
+    pass
+
+torch.backends.cudnn.benchmark = True
+_logger = logging.getLogger('validate')
+
+parser = argparse.ArgumentParser(description='PyTorch Benchmark')
+
+# benchmark specific args
+parser.add_argument('--model_list', metavar='NAME', default='',
+                    help='txt file based list of model names to benchmark')
+parser.add_argument('--bench', default='inference', type=str,
+                    help="Benchmark mode. One of 'inference', 'train', 'both'. Defaults to 'inference'")
+parser.add_argument('--detail', action='store_true', default=False,
+                    help='Provide train fwd/bwd/opt breakdown detail if True. 
Defaults to False') +parser.add_argument('--results_file', default='', type=str, metavar='FILENAME', + help='Output csv file for validation results (summary)') +parser.add_argument('--num_warm_iter', default=10, type=int, + metavar='N', help='Number of warmup iterations (default: 10)') +parser.add_argument('--num_bench_iter', default=40, type=int, + metavar='N', help='Number of benchmark iterations (default: 40)') + +# common inference / train args +parser.add_argument('--model', '-m', metavar='NAME', default='resnet50', + help='model architecture (default: resnet50)') +parser.add_argument('-b', '--batch_size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--img_size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input_size', default=None, nargs=3, type=int, + metavar='N N N', + help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty') +parser.add_argument('--num_classes', type=int, default=1000, + help='Number classes in dataset') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') +parser.add_argument('--channels_last', action='store_true', default=False, + help='Use channels_last memory layout') +parser.add_argument('--amp', action='store_true', default=False, + help='use PyTorch Native AMP for mixed precision training. Overrides --precision arg.') +parser.add_argument('--precision', default='float32', type=str, + help='Numeric precision. One of (amp, float32, float16, bfloat16, tf32)') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model torchscript for inference') + +# train optimizer parameters +parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') +parser.add_argument('--opt_eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') +parser.add_argument('--opt_betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') +parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') +parser.add_argument('--weight_decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') +parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') +parser.add_argument('--clip_mode', type=str, default='norm', + help='Gradient clipping mode. 
One of ("norm", "value", "agc")') + +# model regularization / loss params that impact model or loss fn +parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') +parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') +parser.add_argument('--drop_path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') +parser.add_argument('--drop_block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + +def timestamp(sync=False): + return time.perf_counter() + + +def cuda_timestamp(sync=False, device=None): + if sync: + torch.cuda.synchronize(device=device) + return time.perf_counter() + + +def count_params(model: nn.Module): + return sum([m.numel() for m in model.parameters()]) + + +def resolve_precision(precision: str): + assert precision in ('amp', 'float16', 'bfloat16', 'float32') + use_amp = False + model_dtype = torch.float32 + data_dtype = torch.float32 + if precision == 'amp': + use_amp = True + elif precision == 'float16': + model_dtype = torch.float16 + data_dtype = torch.float16 + elif precision == 'bfloat16': + model_dtype = torch.bfloat16 + data_dtype = torch.bfloat16 + return use_amp, model_dtype, data_dtype + + +class BenchmarkRunner: + def __init__( + self, model_name, detail=False, device='cuda', torchscript=False, precision='float32', + num_warm_iter=10, num_bench_iter=50, **kwargs): + self.model_name = model_name + self.detail = detail + self.device = device + self.use_amp, self.model_dtype, self.data_dtype = resolve_precision(precision) + self.channels_last = kwargs.pop('channels_last', False) + self.amp_autocast = torch.cuda.amp.autocast if self.use_amp else suppress + + self.model = evo_deit.evo_deit_base_patch16_224() + self.model.to( + device=self.device, + dtype=self.model_dtype, + memory_format=torch.channels_last if self.channels_last else None) + self.num_classes = self.model.num_classes + self.param_count = count_params(self.model) + _logger.info('Model %s created, param count: %d' % (model_name, self.param_count)) + if torchscript: + self.model = torch.jit.script(self.model) + + data_config = resolve_data_config(kwargs, model=self.model, use_test_size=True) + self.input_size = data_config['input_size'] + self.batch_size = kwargs.pop('batch_size', 256) + + self.example_inputs = None + self.num_warm_iter = num_warm_iter + self.num_bench_iter = num_bench_iter + self.log_freq = num_bench_iter // 5 + if 'cuda' in self.device: + self.time_fn = partial(cuda_timestamp, device=self.device) + else: + self.time_fn = timestamp + + def _init_input(self): + self.example_inputs = torch.randn( + (self.batch_size,) + self.input_size, device=self.device, dtype=self.data_dtype) + if self.channels_last: + self.example_inputs = self.example_inputs.contiguous(memory_format=torch.channels_last) + + +class InferenceBenchmarkRunner(BenchmarkRunner): + + def __init__(self, model_name, device='cuda', torchscript=False, **kwargs): + super().__init__(model_name=model_name, device=device, torchscript=torchscript, **kwargs) + self.model.eval() + + def run(self): + def _step(): + t_step_start = self.time_fn() + with self.amp_autocast(): + output = self.model(self.example_inputs) + t_step_end = self.time_fn(True) + return t_step_end - t_step_start + + _logger.info( + f'Running inference benchmark on {self.model_name} for {self.num_bench_iter} steps w/ ' + f'input size {self.input_size} and batch size {self.batch_size}.') + + with torch.no_grad(): 
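+            # Measurement protocol: build one random input batch, run num_warm_iter
+            # un-timed warm-up steps, then time num_bench_iter forward passes and
+            # report samples/sec and ms/step from the accumulated step times.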
+ self._init_input() + + for _ in range(self.num_warm_iter): + _step() + + total_step = 0. + num_samples = 0 + t_run_start = self.time_fn() + for i in range(self.num_bench_iter): + delta_fwd = _step() + total_step += delta_fwd + num_samples += self.batch_size + num_steps = i + 1 + if num_steps % self.log_freq == 0: + _logger.info( + f"Infer [{num_steps}/{self.num_bench_iter}]." + f" {num_samples / total_step:0.2f} samples/sec." + f" {1000 * total_step / num_steps:0.3f} ms/step.") + t_run_end = self.time_fn(True) + t_run_elapsed = t_run_end - t_run_start + + results = dict( + samples_per_sec=round(num_samples / t_run_elapsed, 2), + step_time=round(1000 * total_step / self.num_bench_iter, 3), + batch_size=self.batch_size, + img_size=self.input_size[-1], + param_count=round(self.param_count / 1e6, 2), + ) + + _logger.info( + f"Inference benchmark of {self.model_name} done. " + f"{results['samples_per_sec']:.2f} samples/sec, {results['step_time']:.2f} ms/step") + + return results + + +class TrainBenchmarkRunner(BenchmarkRunner): + + def __init__(self, model_name, device='cuda', torchscript=False, **kwargs): + super().__init__(model_name=model_name, device=device, torchscript=torchscript, **kwargs) + self.model.train() + + if kwargs.pop('smoothing', 0) > 0: + self.loss = nn.CrossEntropyLoss().to(self.device) + else: + self.loss = nn.CrossEntropyLoss().to(self.device) + self.target_shape = tuple() + + self.optimizer = create_optimizer( + self.model, + optimizer_name=kwargs.pop('opt', 'sgd'), + learning_rate=kwargs.pop('lr', 1e-4)) + + def _gen_target(self, batch_size): + return torch.empty( + (batch_size,) + self.target_shape, device=self.device, dtype=torch.long).random_(self.num_classes) + + def run(self): + def _step(detail=False): + self.optimizer.zero_grad() # can this be ignored? + t_start = self.time_fn() + t_fwd_end = t_start + t_bwd_end = t_start + with self.amp_autocast(): + output = self.model(self.example_inputs) + if isinstance(output, tuple): + output = output[0] + if detail: + t_fwd_end = self.time_fn(True) + target = self._gen_target(output.shape[0]) + self.loss(output, target).backward() + if detail: + t_bwd_end = self.time_fn(True) + self.optimizer.step() + t_end = self.time_fn(True) + if detail: + delta_fwd = t_fwd_end - t_start + delta_bwd = t_bwd_end - t_fwd_end + delta_opt = t_end - t_bwd_end + return delta_fwd, delta_bwd, delta_opt + else: + delta_step = t_end - t_start + return delta_step + + _logger.info( + f'Running train benchmark on {self.model_name} for {self.num_bench_iter} steps w/ ' + f'input size {self.input_size} and batch size {self.batch_size}.') + + self._init_input() + + for _ in range(self.num_warm_iter): + _step() + + t_run_start = self.time_fn() + if self.detail: + total_fwd = 0. + total_bwd = 0. + total_opt = 0. + num_samples = 0 + for i in range(self.num_bench_iter): + delta_fwd, delta_bwd, delta_opt = _step(True) + num_samples += self.batch_size + total_fwd += delta_fwd + total_bwd += delta_bwd + total_opt += delta_opt + num_steps = (i + 1) + if num_steps % self.log_freq == 0: + total_step = total_fwd + total_bwd + total_opt + _logger.info( + f"Train [{num_steps}/{self.num_bench_iter}]." + f" {num_samples / total_step:0.2f} samples/sec." + f" {1000 * total_fwd / num_steps:0.3f} ms/step fwd," + f" {1000 * total_bwd / num_steps:0.3f} ms/step bwd," + f" {1000 * total_opt / num_steps:0.3f} ms/step opt." 
+ ) + total_step = total_fwd + total_bwd + total_opt + t_run_elapsed = self.time_fn() - t_run_start + results = dict( + samples_per_sec=round(num_samples / t_run_elapsed, 2), + step_time=round(1000 * total_step / self.num_bench_iter, 3), + fwd_time=round(1000 * total_fwd / self.num_bench_iter, 3), + bwd_time=round(1000 * total_bwd / self.num_bench_iter, 3), + opt_time=round(1000 * total_opt / self.num_bench_iter, 3), + batch_size=self.batch_size, + img_size=self.input_size[-1], + param_count=round(self.param_count / 1e6, 2), + ) + else: + total_step = 0. + num_samples = 0 + for i in range(self.num_bench_iter): + delta_step = _step(False) + num_samples += self.batch_size + total_step += delta_step + num_steps = (i + 1) + if num_steps % self.log_freq == 0: + _logger.info( + f"Train [{num_steps}/{self.num_bench_iter}]." + f" {num_samples / total_step:0.2f} samples/sec." + f" {1000 * total_step / num_steps:0.3f} ms/step.") + t_run_elapsed = self.time_fn() - t_run_start + results = dict( + samples_per_sec=round(num_samples / t_run_elapsed, 2), + step_time=round(1000 * total_step / self.num_bench_iter, 3), + batch_size=self.batch_size, + img_size=self.input_size[-1], + param_count=round(self.param_count / 1e6, 2), + ) + + _logger.info( + f"Train benchmark of {self.model_name} done. " + f"{results['samples_per_sec']:.2f} samples/sec, {results['step_time']:.2f} ms/sample") + + return results + + +def decay_batch_exp(batch_size, factor=0.5, divisor=16): + out_batch_size = batch_size * factor + if out_batch_size > divisor: + out_batch_size = (out_batch_size + 1) // divisor * divisor + else: + out_batch_size = batch_size - 1 + return max(0, int(out_batch_size)) + + +def _try_run(model_name, bench_fn, initial_batch_size, bench_kwargs): + batch_size = initial_batch_size + results = dict() + while batch_size >= 1: + try: + bench = bench_fn(model_name=model_name, batch_size=batch_size, **bench_kwargs) + results = bench.run() + return results + except RuntimeError as e: + torch.cuda.empty_cache() + batch_size = decay_batch_exp(batch_size) + print(f'Error: {str(e)} while running benchmark. Reducing batch size to {batch_size} for retry.') + return results + + +def benchmark(args): + if args.amp: + _logger.warning("Overriding precision to 'amp' since --amp flag set.") + args.precision = 'amp' + _logger.info(f'Benchmarking in {args.precision} precision. ' + f'{"NHWC" if args.channels_last else "NCHW"} layout. 
' + f'torchscript {"enabled" if args.torchscript else "disabled"}') + + bench_kwargs = vars(args).copy() + bench_kwargs.pop('amp') + model = bench_kwargs.pop('model') + batch_size = bench_kwargs.pop('batch_size') + + bench_fns = (InferenceBenchmarkRunner,) + prefixes = ('infer',) + if args.bench == 'both': + bench_fns = ( + InferenceBenchmarkRunner, + TrainBenchmarkRunner + ) + prefixes = ('infer', 'train') + elif args.bench == 'train': + bench_fns = TrainBenchmarkRunner, + prefixes = 'train', + + model_results = OrderedDict(model=model) + for prefix, bench_fn in zip(prefixes, bench_fns): + run_results = _try_run(model, bench_fn, initial_batch_size=batch_size, bench_kwargs=bench_kwargs) + if prefix: + run_results = {'_'.join([prefix, k]): v for k, v in run_results.items()} + model_results.update(run_results) + param_count = model_results.pop('infer_param_count', model_results.pop('train_param_count', 0)) + model_results.setdefault('param_count', param_count) + model_results.pop('train_param_count', 0) + return model_results + + +def main(): + setup_default_logging() + args = parser.parse_args() + model_cfgs = [] + model_names = [] + + if args.model_list: + args.model = '' + with open(args.model_list) as f: + model_names = [line.rstrip() for line in f] + model_cfgs = [(n, None) for n in model_names] + elif args.model == 'all': + # validate all models in a list of names with pretrained checkpoints + args.pretrained = True + model_names = list_models(pretrained=True, exclude_filters=['*in21k']) + model_cfgs = [(n, None) for n in model_names] + elif not is_model(args.model): + # model name doesn't exist, try as wildcard filter + model_names = list_models(args.model) + model_cfgs = [(n, None) for n in model_names] + + if len(model_cfgs): + results_file = args.results_file or './benchmark.csv' + _logger.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names))) + results = [] + try: + for m, _ in model_cfgs: + if not m: + continue + args.model = m + r = benchmark(args) + results.append(r) + except KeyboardInterrupt as e: + pass + sort_key = 'train_samples_per_sec' if 'train' in args.bench else 'infer_samples_per_sec' + results = sorted(results, key=lambda x: x[sort_key], reverse=True) + if len(results): + write_results(results_file, results) + + import json + json_str = json.dumps(results, indent=4) + print(json_str) + else: + benchmark(args) + + +def write_results(results_file, results): + with open(results_file, mode='w') as cf: + dw = csv.DictWriter(cf, fieldnames=results[0].keys()) + dw.writeheader() + for r in results: + dw.writerow(r) + cf.flush() + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/datasets.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/datasets.py new file mode 100644 index 0000000000..6482601780 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/datasets.py @@ -0,0 +1,185 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json + +from torchvision import datasets, transforms +from torchvision.datasets.folder import ImageFolder, default_loader + +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.data import create_transform + + +class INatDataset(ImageFolder): + def __init__(self, root, train=True, year=2018, transform=None, target_transform=None, + category='name', loader=default_loader): + self.transform = transform + self.loader = loader + self.target_transform = target_transform + self.year = year + # assert category in ['kingdom','phylum','class','order','supercategory','family','genus','name'] + path_json = os.path.join(root, f'{"train" if train else "val"}{year}.json') + with open(path_json) as json_file: + data = json.load(json_file) + + with open(os.path.join(root, 'categories.json')) as json_file: + data_catg = json.load(json_file) + + path_json_for_targeter = os.path.join(root, f"train{year}.json") + + with open(path_json_for_targeter) as json_file: + data_for_targeter = json.load(json_file) + + targeter = {} + indexer = 0 + for elem in data_for_targeter['annotations']: + king = [] + king.append(data_catg[int(elem['category_id'])][category]) + if king[0] not in targeter.keys(): + targeter[king[0]] = indexer + indexer += 1 + self.nb_classes = len(targeter) + + self.samples = [] + for elem in data['images']: + cut = elem['file_name'].split('/') + target_current = int(cut[2]) + path_current = os.path.join(root, cut[0], cut[2], cut[3]) + + categors = data_catg[target_current] + target_current_true = targeter[categors[category]] + self.samples.append((path_current, target_current_true)) + + # __getitem__ and __len__ inherited from ImageFolder + + +def build_dataset(is_train, args): + transform = build_transform(is_train, args) + + if args.data_set == 'CIFAR': + dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) + nb_classes = 100 + elif args.data_set == 'IMNET': + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, transform=transform) + nb_classes = 1000 + elif args.data_set == 'INAT': + dataset = INatDataset(args.data_path, train=is_train, year=2018, + category=args.inat_category, transform=transform) + nb_classes = dataset.nb_classes + elif args.data_set == 'INAT19': + dataset = INatDataset(args.data_path, train=is_train, year=2019, + category=args.inat_category, transform=transform) + nb_classes = dataset.nb_classes + + return dataset, nb_classes + + +def build_transform(is_train, args): + resize_im = args.input_size > 32 + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation=args.train_interpolation, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + ) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop( + args.input_size, padding=4) + return transform + + t = [] + if resize_im: + size = int((256 / 224) * args.input_size) + t.append( + transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 
224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + + t.append(transforms.ToTensor()) + t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) + return transforms.Compose(t) + + +def build_dataset2(is_train, args): + transform = build_transform2(is_train, args) + + if args.data_set == 'CIFAR': + dataset = datasets.CIFAR100(args.data_path, train=is_train, transform=transform) + nb_classes = 100 + elif args.data_set == 'IMNET': + root = os.path.join(args.data_path, 'train' if is_train else 'val') + dataset = datasets.ImageFolder(root, transform=transform) + nb_classes = 1000 + elif args.data_set == 'INAT': + dataset = INatDataset(args.data_path, train=is_train, year=2018, + category=args.inat_category, transform=transform) + nb_classes = dataset.nb_classes + elif args.data_set == 'INAT19': + dataset = INatDataset(args.data_path, train=is_train, year=2019, + category=args.inat_category, transform=transform) + nb_classes = dataset.nb_classes + + return dataset, nb_classes + + + + +def build_transform2(is_train, args): + resize_im = args.input_size > 32 + if is_train: + # this should always dispatch to transforms_imagenet_train + transform = create_transform( + input_size=args.input_size, + is_training=True, + color_jitter=args.color_jitter, + auto_augment=args.aa, + interpolation=args.train_interpolation, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + ) + if not resize_im: + # replace RandomResizedCropAndInterpolation with + # RandomCrop + transform.transforms[0] = transforms.RandomCrop( + args.input_size, padding=4) + return transform + + t = [] + if resize_im: + size = int((256 / 224) * args.input_size) + t.append( + transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + t.append(transforms.ToTensor()) + + return transforms.Compose(t) + + +def get_post_process(): + t = [] + t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)) + return transforms.Compose(t) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/engine_levit.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/engine_levit.py new file mode 100644 index 0000000000..7f6eab9332 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/engine_levit.py @@ -0,0 +1,214 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
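+
+# Training and evaluation loops for Evo-LeViT on Ascend NPU. train_one_epoch() drives the
+# loss scaler and the optional model EMA and reports FPS per epoch; when args.train_type
+# is 'fps' it prints the FPS measured over a 1000-step window and exits early.
+# evaluate() reports top-1 / top-5 accuracy on the validation set.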
+ +import math +import sys +import time +from typing import Iterable, Optional +import os + +import torch + +from timm.data import Mixup +from timm.utils import accuracy, ModelEma + +from levit.losses_levit import DistillationLoss +import utils +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + +import torch_npu + +#获取Iterable的长度: +def count(iterable): + c=0 + for el in iterable: c+=1 + return c + + +def train_one_epoch(args, + model: torch.nn.Module, criterion: DistillationLoss, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + clip_grad: float = 0, + clip_mode: str = 'norm', + model_ema: Optional[ModelEma] = None, mixup_fn: Optional[Mixup] = None, + set_training_mode=True): + model.train(set_training_mode) + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue( + window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 100 + mode1 = False + mode2 = False # True + mode4 = False + CTPEP = False + num_steps = len(data_loader) + start = time.time() + step_n = 0 + if epoch < 200: + model.module.stage_wise_prune = False + model.module.set_learn_tradeoff(False) + else: + model.module.stage_wise_prune = True + model.module.set_learn_tradeoff(True) + + for samples, targets in metric_logger.log_every( + data_loader, print_freq, header): + samples = samples.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) + + # 不计算前5个epoch时间内,若需要生成prof的时候,把下面这一段注释掉 + step_n += 1 + if step_n < 5: + start = time.time() + + #节约时间,计算1000个step的fps,然后输出 + if step_n % 1000 == 999 and args.train_type == 'fps': + timm_999=time.time()-start + Fps_step = 995 * args.batch_size * utils.get_world_size() / float(timm_999) + print("fps:", Fps_step) + sys.exit() + + + + if mixup_fn is not None: + samples, targets = mixup_fn(samples, targets) + + + if True: # with torch.cuda.amp.autocast(): + # outputs = model(samples) + if (mode1 or mode4): + outputs = model(samples, epoch) + loss = criterion(samples, outputs, targets) # net1distill + elif (mode2 or CTPEP): + outputs = model(samples, epoch) + loss = criterion(samples, outputs, targets) + else: + outputs = model(samples) + loss = criterion(samples, outputs, targets) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + optimizer.zero_grad() + + # this attribute is added by timm on one optimizer (adahessian) + is_second_order = hasattr( + optimizer, 'is_second_order') and optimizer.is_second_order + + loss_scaler(loss, optimizer, clip_grad=clip_grad, clip_mode=clip_mode, + parameters=model.parameters(), create_graph=is_second_order) + torch_npu.npu.synchronize() + + if model_ema is not None: + model_ema.update(model) + + metric_logger.update(loss=loss_value) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + # 不用prof生成时的代码结束位置 + epoch_time = time.time() - start + # gather the stats from all processes + metric_logger.synchronize_between_processes() + # 计算每个epoch的fps + Fps_epoch = (num_steps - 5) * args.batch_size * utils.get_world_size() / float(epoch_time) + print("Averaged stats:", metric_logger,"fps:",Fps_epoch) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()},Fps_epoch + + +@torch.no_grad() +def evaluate(data_loader, model, device, epoch=None): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = 
utils.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + mode1 = False + mode2 = False + mutual = False # True + cls = True + mode4 = False + CTPEP = False # False + + for images, target in metric_logger.log_every(data_loader, 10, header): + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + if True: #with torch.cuda.amp.autocast():这里添加了if TRUE,考虑到NPU不能用CUDA的混合精度训练 + if mode1: + output = model(images, epoch) + acc1, acc5 = accuracy(output[1], target, topk=(1, 5)) + acc12, acc52 = accuracy(output[2], target, topk=(1, 5)) + acc13, acc53 = accuracy(output[3], target, topk=(1, 5)) + print("net 2 accuracy: {}, {}, net 3 accuracy: {}, {}, net 4 accuracy: {}, {}".format(acc1.item(), + acc5.item(), + acc12.item(), + acc52.item(), + acc13.item(), + acc53.item())) + output = output[0] + elif mode2: + output = model(images, epoch) + acc1, acc5 = accuracy(output[1], target, topk=(1, 5)) + acc12, acc52 = accuracy(output[2], target, topk=(1, 5)) + acc13, acc53 = accuracy(output[3], target, topk=(1, 5)) + acc14, acc54 = accuracy(output[4], target, topk=(1, 5)) + print( + "net 2 accuracy: {}, {}, net 3 accuracy: {}, {}, net 4 accuracy: {}, {}, net merge accuracy: {}, {}".format( + acc1.item(), acc5.item(), acc12.item(), acc52.item(), acc13.item(), acc53.item(), acc14.item(), + acc54.item())) + output = output[0] + elif mode4: + output = model(images, epoch) + acc1, acc5 = accuracy(output[1], target, topk=(1, 5)) + acc12, acc52 = accuracy(output[2], target, topk=(1, 5)) + print("net 2 accuracy: {}, {}, net 3 accuracy: {}, {}".format(acc1.item(), acc5.item(), acc12.item(), + acc52.item())) + output = output[0] + elif mutual: + output = model(images) + acc1, acc5 = accuracy(output[1], target, topk=(1, 5)) + print("net depth accuracy: {}, {}".format(acc1, acc5)) + output = output[0] + elif cls: + if CTPEP: + output = model(images, epoch) + else: + output = model(images) + else: + output = model(images) + loss = criterion(output, target) + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + batch_size = images.shape[0] + metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + print(output.mean().item(), output.std().item()) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/mixup.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/mixup.py new file mode 100644 index 0000000000..a475602ad7 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/mixup.py @@ -0,0 +1,319 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import torch_npu + +def one_hot(x, num_classes, on_value=1., off_value=0., device='npu'): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='npu'): + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) + return y1 * lam + y2 * (1. - lam) + + +def rand_bbox(img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. + + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): + """ Generate bbox and apply lambda correction. + """ + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class Mixup: + """ Mixup/Cutmix that applies different params to each element or whole batch + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. 
+ prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + """ + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." 
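+            # lam_mix above was drawn from Beta(alpha, alpha) for either mixup or cutmix;
+            # cast it to a plain float so the bbox and blending code receive a scalar lambda.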
+ lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1. - lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing) + return x, target + + +class FastCollateMixup(Mixup): + """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch + + A Mixup impl that's performed while collating the batches. 
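+
+    Each batch element is expected to be a (numpy uint8 image, target) pair: mixing is done
+    in numpy on the raw uint8 data, the result is accumulated into a pre-allocated uint8
+    output tensor, and the mixed one-hot targets are built on the CPU.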
+ """ + + def _mix_elem_collate(self, output, batch, half=False): + batch_size = len(batch) + num_elem = batch_size // 2 if half else batch_size + assert len(output) == num_elem + lam_batch, use_cutmix = self._params_per_elem(num_elem) + for i in range(num_elem): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed = batch[i][0] + if lam != 1.: + if use_cutmix[i]: + if not half: + mixed = mixed.copy() + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + if half: + lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_pair_collate(self, output, batch): + batch_size = len(batch) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed_i = batch[i][0] + mixed_j = batch[j][0] + assert 0 <= lam <= 1.0 + if lam < 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + patch_i = mixed_i[:, yl:yh, xl:xh].copy() + mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] + mixed_j[:, yl:yh, xl:xh] = patch_i + lam_batch[i] = lam + else: + mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) + mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) + mixed_i = mixed_temp + np.rint(mixed_j, out=mixed_j) + np.rint(mixed_i, out=mixed_i) + output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) + output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_batch_collate(self, output, batch): + batch_size = len(batch) + lam, use_cutmix = self._params_per_batch() + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + for i in range(batch_size): + j = batch_size - i - 1 + mixed = batch[i][0] + if lam != 1.: + if use_cutmix: + mixed = mixed.copy() # don't want to modify the original while iterating + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + return lam + + def __call__(self, batch, _=None): + batch_size = len(batch) + assert batch_size % 2 == 0, 'Batch size should be even when using this' + half = 'half' in self.mode + if half: + batch_size //= 2 + output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + if self.mode == 'elem' or self.mode == 'half': + lam = self._mix_elem_collate(output, batch, half=half) + elif self.mode == 'pair': + lam = self._mix_pair_collate(output, batch) + else: + lam = self._mix_batch_collate(output, batch) + target = torch.tensor([b[1] for b in batch], dtype=torch.int64) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu') + target = target[:batch_size] + return output, target + diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/optim_factory.py 
b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/optim_factory.py new file mode 100644 index 0000000000..f70cd95d92 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/fix_timm/optim_factory.py @@ -0,0 +1,187 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import torch +import torch.nn as nn +import torch.optim as optim + +from .adafactor import Adafactor +from .adahessian import Adahessian +from .adamp import AdamP +from .lookahead import Lookahead +from .nadam import Nadam +from .novograd import NovoGrad +from .nvnovograd import NvNovoGrad +from .radam import RAdam +from .rmsprop_tf import RMSpropTF +from .sgdp import SGDP +from .adabelief import AdaBelief + +try: + from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD,NpuFusedAdamW + has_apex = True +except ImportError: + has_apex = False + + +def add_weight_decay(model, weight_decay=1e-5, skip_list=()): + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + no_decay.append(param) + else: + decay.append(param) + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. + """ + kwargs = dict( + optimizer_name=cfg.opt, + learning_rate=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + + +def create_optimizer(args, model, filter_bias_and_bn=True): + """ Legacy optimizer factory for backwards compatibility. + NOTE: Use create_optimizer_v2 for new code. + """ + return create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=filter_bias_and_bn, + ) + + +def create_optimizer_v2( + model: nn.Module, + optimizer_name: str = 'sgd', + learning_rate: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + **kwargs): + """ Create an optimizer. + + TODO currently the model is passed in and all parameters are selected for optimization. 
+ For more general use an interface that allows selection of parameters to optimize and lr groups, one of: + * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion + * expose the parameters interface and leave it up to caller + + Args: + model (nn.Module): model containing parameters to optimize + optimizer_name: name of optimizer to create + learning_rate: initial learning rate + weight_decay: weight decay to apply in optimizer + momentum: momentum for momentum based optimizers (others may use betas via kwargs) + filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay + **kwargs: extra optimizer specific kwargs to pass through + + Returns: + Optimizer + """ + opt_lower = optimizer_name.lower() + if weight_decay and filter_bias_and_bn: + skip = {} + if hasattr(model, 'no_weight_decay'): + skip = model.no_weight_decay() + parameters = add_weight_decay(model, weight_decay, skip) + weight_decay = 0. + else: + parameters = model.parameters() + if 'fused' in opt_lower: + assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(lr=learning_rate, weight_decay=weight_decay, **kwargs) + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + if opt_lower == 'sgd' or opt_lower == 'nesterov': + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'adam': + optimizer = optim.Adam(parameters, **opt_args) + elif opt_lower == 'adabelief': + optimizer = AdaBelief(parameters, rectify=False, **opt_args) + elif opt_lower == 'adamw': + # optimizer = optim.AdamW(parameters, **opt_args) + optimizer = NpuFusedAdamW(parameters, **opt_args) + elif opt_lower == 'nadam': + optimizer = Nadam(parameters, **opt_args) + elif opt_lower == 'radam': + optimizer = RAdam(parameters, **opt_args) + elif opt_lower == 'adamp': + optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) + elif opt_lower == 'sgdp': + optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'adadelta': + optimizer = optim.Adadelta(parameters, **opt_args) + elif opt_lower == 'adafactor': + if not learning_rate: + opt_args['lr'] = None + optimizer = Adafactor(parameters, **opt_args) + elif opt_lower == 'adahessian': + optimizer = Adahessian(parameters, **opt_args) + elif opt_lower == 'rmsprop': + optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) + elif opt_lower == 'rmsproptf': + optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) + elif opt_lower == 'novograd': + optimizer = NovoGrad(parameters, **opt_args) + elif opt_lower == 'nvnovograd': + optimizer = NvNovoGrad(parameters, **opt_args) + elif opt_lower == 'fusedsgd': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'fusedmomentum': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'fusedadam': + optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) + elif opt_lower == 'fusedadamw': + optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) + elif opt_lower == 'fusedlamb': + optimizer = FusedLAMB(parameters, **opt_args) + elif opt_lower == 'fusednovograd': + 
opt_args.setdefault('betas', (0.95, 0.98)) + optimizer = FusedNovoGrad(parameters, **opt_args) + else: + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit.py new file mode 100644 index 0000000000..7041bfd5bb --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit.py @@ -0,0 +1,788 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import utils +import torch.nn as nn + +from timm.models.vision_transformer import trunc_normal_ +from timm.models.registry import register_model + +specification = { + 'EvoLeViT_128S': { + 'C': '128_256_384', 'D': 16, 'N': '4_6_8', 'X': '2_3_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-128S-96703c44.pth'}, + 'EvoLeViT_128': { + 'C': '128_256_384', 'D': 16, 'N': '4_8_12', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-128-b88c2750.pth'}, + 'EvoLeViT_192': { + 'C': '192_288_384', 'D': 32, 'N': '3_5_6', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-192-92712e41.pth'}, + 'EvoLeViT_256': { + 'C': '256_384_512', 'D': 32, 'N': '4_6_8', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-256-13b5763e.pth'}, + 'EvoLeViT_384': { + 'C': '384_512_768', 'D': 32, 'N': '6_9_12', 'X': '4_4_4', 'drop_path': 0.1, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-384-9bdaf2e2.pth'}, +} + +prune_ratio_list = { + 'EvoLeViT_128S': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], + 'EvoLeViT_128': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_192': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_256': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], +} + +__all__ = [specification.keys()] + + +@register_model +def EvoLeViT_128S(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_128S'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_128S']) + + +@register_model +def EvoLeViT_128(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_128'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_128']) + + +@register_model +def EvoLeViT_192(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return 
model_factory(**specification['EvoLeViT_192'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_192']) + + +@register_model +def EvoLeViT_256(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_256'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_256']) + + +@register_model +def EvoLeViT_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_384']) + + +global_attn = 0 +ori_indices = None +learn_tradeoff_mode = True + + +def easy_gather(x, indices): + # x: B,N,C; indices: B,N + B, N, C = x.shape + N_new = indices.shape[1] + offset = torch.arange(B, dtype=torch.long, device=x.device).view(B, 1) * N + indices = indices + offset + out = x.reshape(B * N, C)[indices.view(-1)].reshape(B, N_new, C) + return out + + +def merge_tokens(x_drop, score): + # score B,N + # scale + weight = score / torch.sum(score, dim=1, keepdim=True) + x_drop = weight.unsqueeze(-1) * x_drop + return torch.sum(x_drop, dim=1, keepdim=True) + + +class CatModule(torch.nn.Module): + def __init__(self, m1, m2, prune_ratio, N): + super().__init__() + self.m1 = m1 + self.m2 = m2 + self.prune_ratio = prune_ratio + # self.i = i + if prune_ratio < 1.0: + N_ = N - int(N * prune_ratio) + self.drop_fc = nn.AdaptiveAvgPool1d(1) + # self.recover_fc=nn.Linear(1,N_) + + def set_prune_ratio(self, prune_ratio): + self.prune_ratio = prune_ratio + + def forward(self, x_): + global global_attn # ga + global ori_indices # oi + if self.prune_ratio < 1: + x = x_[:, 1:] # split out cls token + + N = x.shape[1] + N_ = int(N * self.prune_ratio) + indices = torch.argsort(global_attn, dim=1, descending=True) + + x_ga_oi = torch.cat((x, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1)), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_sorted, global_attn, ori_indices = x_ga_oi[:, :, :-2], x_ga_oi[:, :, -2], x_ga_oi[:, :, -1] + + if self.training: + x_ = torch.cat((x_[:, :1], x_sorted), dim=1) + else: + x_[:, 1:] = x_sorted + x = x_[:, :N_ + 1] + x_drop = x_[:, N_ + 1:] + + add_token = merge_tokens(x_drop, global_attn[:, N_:]) # B,1,C + x = torch.cat((x, add_token), dim=1) # B,N+1,C + + x, raw_x1 = self.m1(x) + x, raw_x2 = self.m2(x) + x = x[:, :-1] + + # fast update via skip connection + add_token1 = raw_x1[:, -1:] + add_token2 = raw_x2[:, -1:] + x_drop = x_drop + add_token1.expand(-1, x_drop.shape[1], -1) + add_token2.expand(-1, x_drop.shape[1], -1) + + x_ = torch.cat((x, x_drop), dim=1) + # x_[:, N_ + 1:] = x_drop + # x_[:, :N_ + 1] = x + else: + x_, _ = self.m1(x_) + x_, _ = self.m2(x_) + return x_ + + +class StageModule(torch.nn.Module): + def __init__(self, m, prune_ratio): + super().__init__() + self.m = m + self.prune_ratio = prune_ratio + + def forward(self, x_): + global global_attn # ga + global ori_indices # oi + + if isinstance(x_, tuple): + x_ = x_[0] + + if self.prune_ratio < 1: + x = x_[:, 1:] # split out cls token + + N = x.shape[1] + N_ = int(N * self.prune_ratio) + indices = torch.argsort(global_attn, dim=1, descending=True) + + x_ga_oi = torch.cat((x, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1)), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_sorted, global_attn, ori_indices = 
x_ga_oi[:, :, :-2], x_ga_oi[:, :, -2], x_ga_oi[:, :, -1] + + if self.training: + x_ = torch.cat((x_[:, :1], x_sorted), dim=1) + else: + x_[:, 1:] = x_sorted + + x = x_[:, :N_ + 1] + x_drop = x_[:, N_ + 1:] + + merge_weight = global_attn[:, N_:] + add_token = merge_tokens(x_drop, merge_weight) # B,1,C + x = torch.cat((x, add_token), dim=1) # B,N+1,C + + raw_total = 0 + for blk in self.m: + x, raw = blk(x) + raw_total = raw_total + raw[:, -1:] + + x_drop = x_drop + raw_total.expand(-1, x_drop.shape[1], -1) + + x = x[:, :-1] + if self.training: + x_ = torch.cat((x, x_drop), dim=1) + else: + x_[:, N_ + 1:] = x_drop + x_[:, :N_ + 1] = x + else: + x_ = self.m(x_) + return x_ + + +class Conv2d_BN(torch.nn.Sequential): + def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, + groups=1, bn_weight_init=1, resolution=-10000): + super().__init__() + self.add_module('c', torch.nn.Conv2d( + a, b, ks, stride, pad, dilation, groups, bias=False)) + bn = torch.nn.BatchNorm2d(b) + torch.nn.init.constant_(bn.weight, bn_weight_init) + torch.nn.init.constant_(bn.bias, 0) + self.add_module('bn', bn) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / \ + (bn.running_var + bn.eps) ** 0.5 + m = torch.nn.Conv2d(w.size(1), w.size( + 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, + groups=self.c.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +class Linear_BN(torch.nn.Sequential): + def __init__(self, a, b, bn_weight_init=1, resolution=-100000): + super().__init__() + self.add_module('c', torch.nn.Linear(a, b, bias=False)) + bn = torch.nn.BatchNorm1d(b) + torch.nn.init.constant_(bn.weight, bn_weight_init) + torch.nn.init.constant_(bn.bias, 0) + self.add_module('bn', bn) + + @torch.no_grad() + def fuse(self): + l, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[:, None] + b = bn.bias - bn.running_mean * bn.weight / \ + (bn.running_var + bn.eps) ** 0.5 + m = torch.nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + def forward(self, x): + l, bn = self._modules.values() + x = l(x) + return bn(x.flatten(0, 1)).reshape_as(x) + + +class BN_Linear(torch.nn.Sequential): + def __init__(self, a, b, bias=True, std=0.02): + super().__init__() + self.add_module('bn', torch.nn.BatchNorm1d(a)) + l = torch.nn.Linear(a, b, bias=bias) + trunc_normal_(l.weight, std=std) + if bias: + torch.nn.init.constant_(l.bias, 0) + self.add_module('l', l) + + @torch.no_grad() + def fuse(self): + bn, l = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + b = bn.bias - self.bn.running_mean * \ + self.bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[None, :] + if l.bias is None: + b = b @ self.l.weight.T + else: + b = (l.weight @ b[:, None]).view(-1) + self.l.bias + m = torch.nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +def b16(n, activation, resolution=224): + return torch.nn.Sequential( + Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution), + activation(), + Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4), + activation(), + Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8)) + + +class Residual(torch.nn.Module): + def __init__(self, m, drop, 
out_raw=False): + super().__init__() + self.m = m + self.drop = drop + self.out_raw = out_raw + + def set_prune_ratio(self, prune_ratio): + pass + + def forward(self, x): + if isinstance(x, tuple): + x = x[0] + if self.training and self.drop > 0: + raw = self.m(x) * torch.rand(x.size(0), 1, 1, + device=x.device).ge_(self.drop).div(1 - self.drop).detach() + else: + raw = self.m(x) + if self.out_raw: + return x + raw, raw + else: + return x + raw + + +class Attention(torch.nn.Module): + def __init__(self, dim, key_dim, num_heads=8, + attn_ratio=4, + activation=None, + resolution=14, posembed=False, global_attn_tradeoff=0.5): + super().__init__() + self.tradeoff = global_attn_tradeoff + + self.learn_tradeoff = torch.nn.Parameter(torch.Tensor([0])) + self.sigmoid = torch.nn.Sigmoid() + + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + h = self.dh + nh_kd * 2 + self.qkv = Linear_BN(dim, h, resolution=resolution) + self.proj = torch.nn.Sequential(activation(), Linear_BN( + self.dh, dim, bn_weight_init=0, resolution=resolution)) + + self.pos_embed = posembed + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + + def forward(self, x): # x (B,N,C) + global global_attn + global learn_tradeoff_mode + + B, N, C = x.shape + qkv = self.qkv(x) + q, k, v = qkv.view(B, N, self.num_heads, - + 1).split([self.key_dim, self.key_dim, self.d], dim=3) + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + attn_raw = (q @ k.transpose(-2, -1)) * self.scale + + attn = attn_raw.softmax(dim=-1) + + # update global attn + if learn_tradeoff_mode: + tradeoff = self.sigmoid(self.learn_tradeoff) + else: + tradeoff = self.tradeoff + + if isinstance(global_attn, int): + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) # B,N + global_attn = cls_attn + else: + if global_attn.shape[1] - N + 2 == 1: + # no additional token and no pruning + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) + global_attn = (1 - tradeoff) * global_attn + tradeoff * cls_attn + else: + cls_attn = torch.mean(attn[:, :, 0, 1:-1], dim=1) + + if self.training: + temp_attn = (1 - tradeoff) * global_attn[:, :N - 2] + tradeoff * cls_attn + global_attn = torch.cat((temp_attn, global_attn[:, N - 2:]), dim=1) + else: + global_attn[:, :N - 2] = (1 - tradeoff) * global_attn[:, :N - 2] + tradeoff * cls_attn + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) + x = self.proj(x) + return x + + +class Subsample(torch.nn.Module): + def __init__(self, stride, resolution): + super().__init__() + self.stride = stride + self.resolution = resolution + + def forward(self, x, with_cls=True): + if with_cls: + B, N, C = x.shape + x1 = x[:, 1:, :] + x1 = x1.view(B, self.resolution, self.resolution, C)[ + :, ::self.stride, ::self.stride].reshape(B, -1, C) + x = torch.cat((x[:, :1, :], x1), dim=1) + else: + B, N, C = x.shape + x = x.view(B, self.resolution, self.resolution, C)[ + :, ::self.stride, ::self.stride].reshape(B, -1, C) + return x + + +class AttentionSubsample(torch.nn.Module): + def __init__(self, in_dim, out_dim, key_dim, num_heads=8, + attn_ratio=2, + activation=None, + stride=2, + resolution=14, resolution_=7, posembed=False, global_attn_tradeoff=0.5): + super().__init__() + self.tradeoff = global_attn_tradeoff + + self.learn_tradeoff = 
torch.nn.Parameter(torch.Tensor([0])) + self.sigmoid = torch.nn.Sigmoid() + + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * self.num_heads + self.attn_ratio = attn_ratio + self.resolution_ = resolution_ + self.resolution_2 = resolution_ ** 2 + h = self.dh + nh_kd + self.kv = Linear_BN(in_dim, h, resolution=resolution) + + self.q = torch.nn.Sequential( + Subsample(stride, resolution), + Linear_BN(in_dim, nh_kd, resolution=resolution_)) + self.proj = torch.nn.Sequential(activation(), Linear_BN( + self.dh, out_dim, resolution=resolution_)) + + self.pos_embed = posembed + if posembed: + self.poss = nn.Parameter(torch.zeros(1, resolution ** 2 + 1, in_dim)) + trunc_normal_(self.poss, std=.02) + + self.stride = stride + self.resolution = resolution + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + + def set_prune_ratio(self, prune_ratio): + pass + + def forward(self, x): + global global_attn # ga + global ori_indices # oi + global learn_tradeoff_mode + + if isinstance(x, tuple): + x = x[0] + + # recover sequence + old_global_scale = torch.sum(global_attn, dim=1, keepdim=True) + + x_patch = x[:, 1:] + indices = torch.argsort(ori_indices, dim=1) + x_ga_oi = torch.cat((x_patch, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1)), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_patch, ga_oi = x_ga_oi[:, :, :-2], x_ga_oi[:, :, -2:] + + # subsample global attn and ori indices + ga_oi = self.q[0](ga_oi, False) + global_attn, ori_indices = ga_oi[:, :, 0], ga_oi[:, :, 1] + + # global_attn, ori_indices = ga_oi[:, :, 0], ga_oi[:, :, 1] + + if self.training: + x = torch.cat((x[:, :1], x_patch), dim=1) + else: + x[:, 1:] = x_patch + + x = x + self.poss + B, N, C = x.shape + k, v = self.kv(x).view(B, N, self.num_heads, - + 1).split([self.key_dim, self.d], dim=3) + k = k.permute(0, 2, 1, 3) # BHNC + v = v.permute(0, 2, 1, 3) # BHNC + q = self.q(x).view(B, self.resolution_2 + 1, self.num_heads, + self.key_dim).permute(0, 2, 1, 3) + + attn_raw = (q @ k.transpose(-2, -1)) * self.scale + + attn = attn_raw.softmax(dim=-1) + + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) # B,N + cls_attn = self.q[0](cls_attn.unsqueeze(-1), False).squeeze(-1) + + if learn_tradeoff_mode: + tradeoff = self.sigmoid(self.learn_tradeoff) + else: + tradeoff = self.tradeoff + + global_attn = (1 - tradeoff) * global_attn + tradeoff * cls_attn + + # normalize global attention + new_global_scale = torch.sum(global_attn, dim=1, keepdim=True) + scale = old_global_scale / new_global_scale + global_attn = global_attn * scale + + x = (attn @ v).transpose(1, 2).reshape(B, -1, self.dh) + x = self.proj(x) + return x + + +class LeViT(torch.nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=[192], + key_dim=[64], + depth=[12], + num_heads=[3], + attn_ratio=[2], + mlp_ratio=[2], + hybrid_backbone=None, + down_ops=[], + attention_activation=torch.nn.Hardswish, + mlp_activation=torch.nn.Hardswish, + distillation=True, + drop_path=0, prune_ratio=None): + super().__init__() + + self.stage_wise_prune = True + + self.num_classes = num_classes + self.num_features = embed_dim[-1] + self.embed_dim = embed_dim + self.distillation = distillation + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim[0])) + + self.patch_embed = hybrid_backbone + + self.pos_embed = True + + self.blocks = [] + self.stage_blocks = [] + + down_ops.append(['']) + resolution = img_size // patch_size + if self.pos_embed: + self.poss = nn.Parameter(torch.zeros(1, resolution ** 2 + 1, embed_dim[0])) + trunc_normal_(self.poss, std=.02) + + self.prune_ratio = prune_ratio[0] + self.stage_prune_ratio = prune_ratio[1] + + layer_index = -1 + n = 14 + j = 0 + + for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( + zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)): + stage_subblocks = [] + for _ in range(dpth): + layer_index += 1 + + m1 = Residual(Attention( + ed, kd, nh, + attn_ratio=ar, + activation=attention_activation, + resolution=resolution, + posembed=self.pos_embed + ), drop_path, out_raw=True) + if self.prune_ratio[layer_index] == 1: + self.stage_blocks.append(m1) + else: + stage_subblocks.append(m1) + + if mr > 0: + h = int(ed * mr) + m2 = Residual(torch.nn.Sequential( + Linear_BN(ed, h, resolution=resolution), + mlp_activation(), + Linear_BN(h, ed, bn_weight_init=0, + resolution=resolution), + ), drop_path, out_raw=True) + else: + m2 = torch.nn.Identity() + + if self.prune_ratio[layer_index] == 1: + self.stage_blocks.append(m2) + else: + stage_subblocks.append(m2) + + self.blocks.append(CatModule(m1, m2, prune_ratio=self.prune_ratio[layer_index], N=n ** 2)) + if self.prune_ratio[layer_index] < 1: + j = j + 1 + + if len(stage_subblocks) != 0: + stage_subblocks = torch.nn.ModuleList(stage_subblocks) + self.stage_blocks.append(StageModule(stage_subblocks, prune_ratio=self.stage_prune_ratio[i])) + + if do[0] == 'Subsample': + n = int((n + 1) / 2) + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_ = (resolution - 1) // do[5] + 1 + subsample = AttentionSubsample( + *embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2], + attn_ratio=do[3], + activation=attention_activation, + stride=do[5], + resolution=resolution, + resolution_=resolution_, + posembed=self.pos_embed) + self.blocks.append(subsample) + self.stage_blocks.append(subsample) + + resolution = resolution_ + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * do[4]) + ffn = Residual(torch.nn.Sequential( + Linear_BN(embed_dim[i + 1], h, + resolution=resolution), + mlp_activation(), + Linear_BN( + h, embed_dim[i + 1], bn_weight_init=0, resolution=resolution), + ), drop_path) + self.blocks.append(ffn) + self.stage_blocks.append(ffn) + + self.blocks = torch.nn.Sequential(*self.blocks) + self.stage_blocks = torch.nn.Sequential(*self.stage_blocks) + + # Classifier head + self.head = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + if distillation: + self.head_dist = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + self.clsc = True + if self.clsc: + self.head_cls = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + if distillation: + self.head_cls_dist = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + + @torch.jit.ignore + def no_weight_decay(self): + return {x for x in self.state_dict().keys() if 'poss' in x} + + def set_learn_tradeoff(self, mode): + global learn_tradeoff_mode + learn_tradeoff_mode = mode + + def set_prune_ratio(self, mode): + pass + + def remove_cls(self): + if hasattr(self, 'head_cls'): + del self.head_cls + if hasattr(self, 'head_cls_dist'): + del self.head_cls_dist + + def forward(self, x): + 
global global_attn + global ori_indices + global learn_tradeoff_mode + + global_attn = 0 + + x = self.patch_embed(x) + x = x.flatten(2).transpose(1, 2) + + ori_indices = torch.arange(x.shape[1], dtype=torch.long, device=x.device).unsqueeze(0) + ori_indices = ori_indices.expand(x.shape[0], -1) + + cls_token = self.cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), 1) + if self.pos_embed: + x = x + self.poss + + if self.stage_wise_prune: + x = self.stage_blocks(x) + else: + x = self.blocks(x) + + cls = x[:, 0, :] + x = x[:, 1:, :] + x = x.mean(1) + if self.distillation: + x = self.head(x), self.head_dist(x) + if self.clsc: + if self.training: + xcls = self.head_cls(cls) + xcls_dist = self.head_cls_dist(cls) + return x[0], x[1], xcls, xcls_dist + else: + return (x[0] + x[1]) / 2 + if not self.training: + x = (x[0] + x[1]) / 2 + + else: + x = self.head(x) + return x + + +def model_factory(C, D, X, N, drop_path, weights, + num_classes, distillation, pretrained, fuse, prune_ratio): + embed_dim = [int(x) for x in C.split('_')] + num_heads = [int(x) for x in N.split('_')] + depth = [int(x) for x in X.split('_')] + act = torch.nn.Hardswish + model = LeViT( + patch_size=16, + embed_dim=embed_dim, + num_heads=num_heads, + key_dim=[D] * 3, + depth=depth, + attn_ratio=[2, 2, 2], + mlp_ratio=[2, 2, 2], + down_ops=[ + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ['Subsample', D, embed_dim[0] // D, 4, 2, 2], + ['Subsample', D, embed_dim[1] // D, 4, 2, 2], + ], + attention_activation=act, + mlp_activation=act, + hybrid_backbone=b16(embed_dim[0], activation=act), + num_classes=num_classes, + drop_path=drop_path, + distillation=distillation, + prune_ratio=prune_ratio + ) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + weights, map_location='cpu') + model.load_state_dict(checkpoint['model']) + if fuse: + utils.replace_batchnorm(model) + + return model + + +if __name__ == '__main__': + for name in specification: + net = globals()[name](fuse=False, pretrained=False) + net.eval() + net.remove_cls() + net(torch.randn(4, 3, 224, 224)) + print(name, 'Parameters:', sum(p.numel() for p in net.parameters() if p.requires_grad)) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit_384.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit_384.py new file mode 100644 index 0000000000..f6ffff25de --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/evo_levit_384.py @@ -0,0 +1,813 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
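+
+# NOTE: this file is the 384-resolution counterpart of levit/evo_levit.py, adapted for the Ascend
+# NPU (LeViT here defaults to img_size=384). The token-selection logic is unchanged; the
+# differences are that the attention matmuls are routed through a custom autograd function
+# (MatmulApply / matmul_transpose, built on torch.npu_bmmV2), the attention output reshape in
+# Attention uses npu_confusion_transpose, and several sliced views are made contiguous()/clone()d,
+# apparently to avoid in-place update issues on NPU.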
+ +import torch +import utils +import torch.nn as nn + +from timm.models.vision_transformer import trunc_normal_ +from timm.models.registry import register_model + +specification = { + 'EvoLeViT_128S_384': { + 'C': '128_256_384', 'D': 16, 'N': '4_6_8', 'X': '2_3_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-128S-96703c44.pth'}, + 'EvoLeViT_128_384': { + 'C': '128_256_384', 'D': 16, 'N': '4_8_12', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-128-b88c2750.pth'}, + 'EvoLeViT_192_384': { + 'C': '192_288_384', 'D': 32, 'N': '3_5_6', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-192-92712e41.pth'}, + 'EvoLeViT_256_384': { + 'C': '256_384_512', 'D': 32, 'N': '4_6_8', 'X': '4_4_4', 'drop_path': 0, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-256-13b5763e.pth'}, + 'EvoLeViT_384_384': { + 'C': '384_512_768', 'D': 32, 'N': '6_9_12', 'X': '4_4_4', 'drop_path': 0.1, + 'weights': 'https://dl.fbaipublicfiles.com/LeViT/LeViT-384-9bdaf2e2.pth'}, +} + +prune_ratio_list = { + 'EvoLeViT_128S_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], + 'EvoLeViT_128_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_192_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_256_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], + 'EvoLeViT_384_384': [[1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]], +} + +__all__ = [specification.keys()] + + +@register_model +def EvoLeViT_128S_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_128S_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_128S_384']) + + +@register_model +def EvoLeViT_128_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_128_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_128_384']) + + +@register_model +def EvoLeViT_192_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_192_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_192_384']) + + +@register_model +def EvoLeViT_256_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_256_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_256_384']) + + +@register_model +def EvoLeViT_384_384(num_classes=1000, distillation=True, + pretrained=False, fuse=False): + return model_factory(**specification['EvoLeViT_384_384'], num_classes=num_classes, + distillation=distillation, pretrained=pretrained, fuse=fuse, + prune_ratio=prune_ratio_list['EvoLeViT_384_384']) + + +global_attn = 0 +ori_indices = None +learn_tradeoff_mode = True + + +def easy_gather(x, indices): + # x: B,N,C; indices: B,N + B, N, C = x.shape + N_new = indices.shape[1] + offset = torch.arange(B, dtype=torch.long, device=x.device).view(B, 1) * N + indices = indices + offset + out = x.reshape(B * N, C)[indices.view(-1)].reshape(B, N_new, C) + return out 
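+
+# NOTE: easy_gather above reorders tokens per batch element. Adding b * N to each row of `indices`
+# lets the flattened (B*N, C) view be indexed in a single shot, which gives the same result as
+# torch.gather(x, 1, indices.unsqueeze(-1).expand(-1, -1, C)); the flat-index form was presumably
+# preferred here for NPU performance. For example, with B=2 and N=3, indices [[2, 0, 1], [2, 0, 1]]
+# become [[2, 0, 1], [5, 3, 4]] after the offset and pick the corresponding rows of the flat view.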
+ + +def merge_tokens(x_drop, score): + # score B,N + # scale + weight = score / torch.sum(score, dim=1, keepdim=True) + x_drop = weight.unsqueeze(-1) * x_drop + return torch.sum(x_drop, dim=1, keepdim=True) + + +class CatModule(torch.nn.Module): + def __init__(self, m1, m2, prune_ratio, N): + super().__init__() + self.m1 = m1 + self.m2 = m2 + self.prune_ratio = prune_ratio + # self.i = i + if prune_ratio < 1.0: + N_ = N - int(N * prune_ratio) + self.drop_fc = nn.AdaptiveAvgPool1d(1) + # self.recover_fc=nn.Linear(1,N_) + + def set_prune_ratio(self, prune_ratio): + self.prune_ratio = prune_ratio + + def forward(self, x_): + global global_attn # ga + global ori_indices # oi + if self.prune_ratio < 1: + x = x_[:, 1:] # split out cls token + + N = x.shape[1] + N_ = int(N * self.prune_ratio) + global_attn = global_attn.clone() + indices = torch.argsort(global_attn, dim=1, descending=True) + + x_ga_oi = torch.cat((x, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1).half()), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_ga_oi = x_ga_oi.contiguous() # todo + x_sorted, global_attn, ori_indices = x_ga_oi[:, :, :-2].clone(), x_ga_oi[:, :, -2].clone(), x_ga_oi[:, :, -1].clone() # todo + + if self.training: + x_ = torch.cat((x_[:, :1], x_sorted), dim=1) + else: + x_[:, 1:] = x_sorted + x = x_[:, :N_ + 1] + x_drop = x_[:, N_ + 1:] + + add_token = merge_tokens(x_drop, global_attn[:, N_:]) # B,1,C + x = torch.cat((x, add_token), dim=1) # B,N+1,C + + x, raw_x1 = self.m1(x) + x, raw_x2 = self.m2(x) + x = x[:, :-1] + + # fast update via skip connection + add_token1 = raw_x1[:, -1:] + add_token2 = raw_x2[:, -1:] + x_drop = x_drop + add_token1.expand(-1, x_drop.shape[1], -1) + add_token2.expand(-1, x_drop.shape[1], -1) + + x_ = torch.cat((x, x_drop), dim=1) + # x_[:, N_ + 1:] = x_drop + # x_[:, :N_ + 1] = x + else: + x_, _ = self.m1(x_) + x_, _ = self.m2(x_) + return x_ + + +class StageModule(torch.nn.Module): + def __init__(self, m, prune_ratio): + super().__init__() + self.m = m + self.prune_ratio = prune_ratio + + def forward(self, x_): + global global_attn # ga + global ori_indices # oi + + if isinstance(x_, tuple): + x_ = x_[0] + + if self.prune_ratio < 1: + x = x_[:, 1:] # split out cls token + + N = x.shape[1] + N_ = int(N * self.prune_ratio) + indices = torch.argsort(global_attn, dim=1, descending=True) + x_ga_oi = torch.cat((x, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1).half()), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_ga_oi = x_ga_oi.contiguous() # todo + x_sorted, global_attn, ori_indices = x_ga_oi[:, :, :-2].clone(), x_ga_oi[:, :, -2].clone(), x_ga_oi[:, :, -1].clone() # todo + + if self.training: + x_ = torch.cat((x_[:, :1], x_sorted), dim=1) + else: + x_[:, 1:] = x_sorted + + x = x_[:, :N_ + 1].clone() # todo + x_drop = x_[:, N_ + 1:].clone() # todo + + merge_weight = global_attn[:, N_:].clone() # todo + add_token = merge_tokens(x_drop, merge_weight) # B,1,C + x = torch.cat((x, add_token), dim=1) # B,N+1,C + + raw_total = 0 + for blk in self.m: + x, raw = blk(x) + raw_total = raw_total + raw[:, -1:].clone() # todo + + x_drop = x_drop + raw_total.expand(-1, x_drop.shape[1], -1) + + x = x[:, :-1] + if self.training: + x_ = torch.cat((x, x_drop), dim=1) + else: + x_[:, N_ + 1:] = x_drop + x_[:, :N_ + 1] = x + else: + x_ = self.m(x_) + return x_ + + +class Conv2d_BN(torch.nn.Sequential): + def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, + groups=1, bn_weight_init=1, resolution=-10000): + super().__init__() + self.add_module('c', torch.nn.Conv2d( + a, b, 
ks, stride, pad, dilation, groups, bias=False)) + bn = torch.nn.BatchNorm2d(b) + torch.nn.init.constant_(bn.weight, bn_weight_init) + torch.nn.init.constant_(bn.bias, 0) + self.add_module('bn', bn) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / \ + (bn.running_var + bn.eps) ** 0.5 + m = torch.nn.Conv2d(w.size(1), w.size( + 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, + groups=self.c.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +class Linear_BN(torch.nn.Sequential): + def __init__(self, a, b, bn_weight_init=1, resolution=-100000): + super().__init__() + self.add_module('c', torch.nn.Linear(a, b, bias=False)) + bn = torch.nn.BatchNorm1d(b) + torch.nn.init.constant_(bn.weight, bn_weight_init) + torch.nn.init.constant_(bn.bias, 0) + self.add_module('bn', bn) + + @torch.no_grad() + def fuse(self): + l, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[:, None] + b = bn.bias - bn.running_mean * bn.weight / \ + (bn.running_var + bn.eps) ** 0.5 + m = torch.nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + def forward(self, x): + l, bn = self._modules.values() + x = l(x) + return bn(x.flatten(0, 1)).reshape_as(x) + + +class BN_Linear(torch.nn.Sequential): + def __init__(self, a, b, bias=True, std=0.02): + super().__init__() + self.add_module('bn', torch.nn.BatchNorm1d(a)) + l = torch.nn.Linear(a, b, bias=bias) + trunc_normal_(l.weight, std=std) + if bias: + torch.nn.init.constant_(l.bias, 0) + self.add_module('l', l) + + @torch.no_grad() + def fuse(self): + bn, l = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + b = bn.bias - self.bn.running_mean * \ + self.bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[None, :] + if l.bias is None: + b = b @ self.l.weight.T + else: + b = (l.weight @ b[:, None]).view(-1) + self.l.bias + m = torch.nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +def b16(n, activation, resolution=224): + return torch.nn.Sequential( + Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution), + activation(), + Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4), + activation(), + Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8)) + + +class Residual(torch.nn.Module): + def __init__(self, m, drop, out_raw=False): + super().__init__() + self.m = m + self.drop = drop + self.out_raw = out_raw + + def set_prune_ratio(self, prune_ratio): + pass + + def forward(self, x): + if isinstance(x, tuple): + x = x[0] + if self.training and self.drop > 0: + raw = self.m(x) * torch.rand(x.size(0), 1, 1, + device=x.device).ge_(self.drop).div(1 - self.drop).detach() + else: + raw = self.m(x) + if self.out_raw: + return x + raw, raw + else: + return x + raw + + +class MatmulApply(torch.autograd.Function): + @staticmethod + def forward(ctx, self, mat2): + ctx.save_for_backward(self, mat2) + result = torch.matmul(self, mat2.transpose(-2, -1)) + return result + @staticmethod + def backward(ctx, grad): + self, mat2 = ctx.saved_tensors + self_grad = torch.npu_bmmV2(grad, mat2, []) + mat2_grad = torch.npu_bmmV2(grad.transpose(-2, -1), self, []) + return self_grad, mat2_grad + +matmul_transpose = 
MatmulApply.apply + + +class Attention(torch.nn.Module): + def __init__(self, dim, key_dim, num_heads=8, + attn_ratio=4, + activation=None, + resolution=14, posembed=False, global_attn_tradeoff=0.5): + super().__init__() + self.tradeoff = global_attn_tradeoff + + self.learn_tradeoff = torch.nn.Parameter(torch.Tensor([0])) + self.sigmoid = torch.nn.Sigmoid() + + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + h = self.dh + nh_kd * 2 + self.qkv = Linear_BN(dim, h, resolution=resolution) + self.proj = torch.nn.Sequential(activation(), Linear_BN( + self.dh, dim, bn_weight_init=0, resolution=resolution)) + + self.pos_embed = posembed + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + + def forward(self, x): # x (B,N,C) + global global_attn + global learn_tradeoff_mode + + B, N, C = x.shape + qkv = self.qkv(x) + q, k, v = qkv.view(B, N, self.num_heads, - + 1).split([self.key_dim, self.key_dim, self.d], dim=3) + + q = q.permute(0, 2, 1, 3).contiguous() + k = k.permute(0, 2, 1, 3).contiguous() + v = v.permute(0, 2, 1, 3).contiguous() + + # attn_raw = (q @ k.transpose(-2, -1)) * self.scale + attn_raw = matmul_transpose(q, k) * self.scale # todo + + attn = attn_raw.softmax(dim=-1) + + # update global attn + if learn_tradeoff_mode: + tradeoff = self.sigmoid(self.learn_tradeoff) + else: + tradeoff = self.tradeoff + + if isinstance(global_attn, int): + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) # B,N + global_attn = cls_attn + else: + if global_attn.shape[1] - N + 2 == 1: + # no additional token and no pruning + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) + global_attn = (1 - tradeoff) * global_attn + tradeoff * cls_attn + else: + cls_attn = torch.mean(attn[:, :, 0, 1:-1], dim=1) + + if self.training: + temp_attn = (1 - tradeoff) * global_attn[:, :N - 2] + tradeoff * cls_attn + global_attn = torch.cat((temp_attn, global_attn[:, N - 2:]), dim=1) + else: + global_attn[:, :N - 2] = (1 - tradeoff) * global_attn[:, :N - 2] + tradeoff * cls_attn + + # x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) + x = (attn @ v).npu_confusion_transpose([0, 2, 1, 3], (B, N, self.dh), True) # todo + x = self.proj(x) + return x + + +class Subsample(torch.nn.Module): + def __init__(self, stride, resolution): + super().__init__() + self.stride = stride + self.resolution = resolution + + def forward(self, x, with_cls=True): + if with_cls: + B, N, C = x.shape + x1 = x[:, 1:, :] + x1 = x1.view(B, self.resolution, self.resolution, C)[ + :, ::self.stride, ::self.stride].reshape(B, -1, C).contiguous() + x = torch.cat((x[:, :1, :], x1), dim=1) + else: + B, N, C = x.shape + x = x.view(B, self.resolution, self.resolution, C)[ + :, ::self.stride, ::self.stride].reshape(B, -1, C).contiguous() + return x + + +class AttentionSubsample(torch.nn.Module): + def __init__(self, in_dim, out_dim, key_dim, num_heads=8, + attn_ratio=2, + activation=None, + stride=2, + resolution=14, resolution_=7, posembed=False, global_attn_tradeoff=0.5): + super().__init__() + self.tradeoff = global_attn_tradeoff + + self.learn_tradeoff = torch.nn.Parameter(torch.Tensor([0])) + self.sigmoid = torch.nn.Sigmoid() + + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + 
self.dh = int(attn_ratio * key_dim) * self.num_heads + self.attn_ratio = attn_ratio + self.resolution_ = resolution_ + self.resolution_2 = resolution_ ** 2 + h = self.dh + nh_kd + self.kv = Linear_BN(in_dim, h, resolution=resolution) + + self.q = torch.nn.Sequential( + Subsample(stride, resolution), + Linear_BN(in_dim, nh_kd, resolution=resolution_)) + self.proj = torch.nn.Sequential(activation(), Linear_BN( + self.dh, out_dim, resolution=resolution_)) + + self.pos_embed = posembed + if posembed: + self.poss = nn.Parameter(torch.zeros(1, resolution ** 2 + 1, in_dim)) + trunc_normal_(self.poss, std=.02) + + self.stride = stride + self.resolution = resolution + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, 'ab'): + del self.ab + + def set_prune_ratio(self, prune_ratio): + pass + + def forward(self, x): + global global_attn # ga + global ori_indices # oi + global learn_tradeoff_mode + + if isinstance(x, tuple): + x = x[0] + + # recover sequence + old_global_scale = torch.sum(global_attn, dim=1, keepdim=True) + + x_patch = x[:, 1:] + ori_indices = ori_indices.clone() + indices = torch.argsort(ori_indices, dim=1) + x_ga_oi = torch.cat((x_patch, global_attn.unsqueeze(-1), ori_indices.unsqueeze(-1)), dim=-1) + x_ga_oi = easy_gather(x_ga_oi, indices) + x_ga_oi = x_ga_oi.contiguous() # todo + x_patch, ga_oi = x_ga_oi[:, :, :-2].clone(), x_ga_oi[:, :, -2:].clone() # todo + + # subsample global attn and ori indices + ga_oi = self.q[0](ga_oi, False) + global_attn, ori_indices = ga_oi[:, :, 0].clone(), ga_oi[:, :, 1].clone() # todo + + # global_attn, ori_indices = ga_oi[:, :, 0], ga_oi[:, :, 1] + + if self.training: + x = torch.cat((x[:, :1], x_patch), dim=1) + else: + x[:, 1:] = x_patch + + x = x + self.poss + B, N, C = x.shape + k, v = self.kv(x).view(B, N, self.num_heads, - + 1).split([self.key_dim, self.d], dim=3) + k = k.permute(0, 2, 1, 3).contiguous() # BHNC + v = v.permute(0, 2, 1, 3).contiguous() # BHNC + q = self.q(x).view(B, self.resolution_2 + 1, self.num_heads, + self.key_dim).permute(0, 2, 1, 3) + + # attn_raw = (q @ k.transpose(-2, -1)) * self.scale + attn_raw = matmul_transpose(q, k) * self.scale # todo + + attn = attn_raw.softmax(dim=-1) + + cls_attn = torch.mean(attn[:, :, 0, 1:], dim=1) # B,N + cls_attn = self.q[0](cls_attn.unsqueeze(-1), False).squeeze(-1) + + if learn_tradeoff_mode: + tradeoff = self.sigmoid(self.learn_tradeoff) + else: + tradeoff = self.tradeoff + + global_attn = (1 - tradeoff) * global_attn + tradeoff * cls_attn + + # normalize global attention + new_global_scale = torch.sum(global_attn, dim=1, keepdim=True) + scale = old_global_scale / new_global_scale + global_attn = global_attn * scale + + x = (attn @ v).transpose(1, 2).reshape(B, -1, self.dh) + x = self.proj(x) + return x + + +class LeViT(torch.nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, img_size=384, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=[192], + key_dim=[64], + depth=[12], + num_heads=[3], + attn_ratio=[2], + mlp_ratio=[2], + hybrid_backbone=None, + down_ops=[], + attention_activation=torch.nn.Hardswish, + mlp_activation=torch.nn.Hardswish, + distillation=True, + drop_path=0, prune_ratio=None): + super().__init__() + + self.stage_wise_prune = True + + self.num_classes = num_classes + self.num_features = embed_dim[-1] + self.embed_dim = embed_dim + self.distillation = distillation + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim[0])) + + 
self.patch_embed = hybrid_backbone + + self.pos_embed = True + + self.blocks = [] + self.stage_blocks = [] + + down_ops.append(['']) + resolution = img_size // patch_size + if self.pos_embed: + self.poss = nn.Parameter(torch.zeros(1, resolution ** 2 + 1, embed_dim[0])) + trunc_normal_(self.poss, std=.02) + + self.prune_ratio = prune_ratio[0] + self.stage_prune_ratio = prune_ratio[1] + + layer_index = -1 + n = 14 + j = 0 + + for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( + zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)): + stage_subblocks = [] + for _ in range(dpth): + layer_index += 1 + + m1 = Residual(Attention( + ed, kd, nh, + attn_ratio=ar, + activation=attention_activation, + resolution=resolution, + posembed=self.pos_embed + ), drop_path, out_raw=True) + if self.prune_ratio[layer_index] == 1: + self.stage_blocks.append(m1) + else: + stage_subblocks.append(m1) + + if mr > 0: + h = int(ed * mr) + m2 = Residual(torch.nn.Sequential( + Linear_BN(ed, h, resolution=resolution), + mlp_activation(), + Linear_BN(h, ed, bn_weight_init=0, + resolution=resolution), + ), drop_path, out_raw=True) + else: + m2 = torch.nn.Identity() + + if self.prune_ratio[layer_index] == 1: + self.stage_blocks.append(m2) + else: + stage_subblocks.append(m2) + + self.blocks.append(CatModule(m1, m2, prune_ratio=self.prune_ratio[layer_index], N=n ** 2)) + if self.prune_ratio[layer_index] < 1: + j = j + 1 + + if len(stage_subblocks) != 0: + stage_subblocks = torch.nn.ModuleList(stage_subblocks) + self.stage_blocks.append(StageModule(stage_subblocks, prune_ratio=self.stage_prune_ratio[i])) + + if do[0] == 'Subsample': + n = int((n + 1) / 2) + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_ = (resolution - 1) // do[5] + 1 + subsample = AttentionSubsample( + *embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2], + attn_ratio=do[3], + activation=attention_activation, + stride=do[5], + resolution=resolution, + resolution_=resolution_, + posembed=self.pos_embed) + self.blocks.append(subsample) + self.stage_blocks.append(subsample) + + resolution = resolution_ + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * do[4]) + ffn = Residual(torch.nn.Sequential( + Linear_BN(embed_dim[i + 1], h, + resolution=resolution), + mlp_activation(), + Linear_BN( + h, embed_dim[i + 1], bn_weight_init=0, resolution=resolution), + ), drop_path) + self.blocks.append(ffn) + self.stage_blocks.append(ffn) + + self.blocks = torch.nn.Sequential(*self.blocks) + self.stage_blocks = torch.nn.Sequential(*self.stage_blocks) + + # Classifier head + self.head = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + if distillation: + self.head_dist = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + self.clsc = True + if self.clsc: + self.head_cls = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + if distillation: + self.head_cls_dist = BN_Linear( + embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + + @torch.jit.ignore + def no_weight_decay(self): + return {x for x in self.state_dict().keys() if 'poss' in x} + + def set_learn_tradeoff(self, mode): + global learn_tradeoff_mode + learn_tradeoff_mode = mode + + def set_prune_ratio(self, mode): + pass + + def remove_cls(self): + if hasattr(self, 'head_cls'): + del self.head_cls + if hasattr(self, 'head_cls_dist'): + del self.head_cls_dist + + def forward(self, x): + global global_attn + global ori_indices + global 
learn_tradeoff_mode + + global_attn = 0 + + x = self.patch_embed(x) + x = x.flatten(2).transpose(1, 2) + + ori_indices = torch.arange(x.shape[1], dtype=torch.long, device=x.device).unsqueeze(0) + ori_indices = ori_indices.expand(x.shape[0], -1) + + cls_token = self.cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), 1) + if self.pos_embed: + x = x + self.poss + + if self.stage_wise_prune: + x = self.stage_blocks(x) + else: + x = self.blocks(x) + + cls = x[:, 0, :] + x = x[:, 1:, :] + x = x.mean(1) + if self.distillation: + x = self.head(x), self.head_dist(x) + if self.clsc: + if self.training: + xcls = self.head_cls(cls) + xcls_dist = self.head_cls_dist(cls) + return x[0], x[1], xcls, xcls_dist + else: + return (x[0] + x[1]) / 2 + if not self.training: + x = (x[0] + x[1]) / 2 + + else: + x = self.head(x) + return x + + +def model_factory(C, D, X, N, drop_path, weights, + num_classes, distillation, pretrained, fuse, prune_ratio): + embed_dim = [int(x) for x in C.split('_')] + num_heads = [int(x) for x in N.split('_')] + depth = [int(x) for x in X.split('_')] + act = torch.nn.Hardswish + model = LeViT( + patch_size=16, + embed_dim=embed_dim, + num_heads=num_heads, + key_dim=[D] * 3, + depth=depth, + attn_ratio=[2, 2, 2], + mlp_ratio=[2, 2, 2], + down_ops=[ + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ['Subsample', D, embed_dim[0] // D, 4, 2, 2], + ['Subsample', D, embed_dim[1] // D, 4, 2, 2], + ], + attention_activation=act, + mlp_activation=act, + hybrid_backbone=b16(embed_dim[0], activation=act), + num_classes=num_classes, + drop_path=drop_path, + distillation=distillation, + prune_ratio=prune_ratio + ) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + weights, map_location='cpu') + model.load_state_dict(checkpoint['model']) + if fuse: + utils.replace_batchnorm(model) + + return model + + +if __name__ == '__main__': + if __name__ == '__main__': + for name in specification: + net = globals()[name](fuse=False, pretrained=False) + net.eval() + net.remove_cls() + net(torch.randn(2, 3, 384, 384)) + print(name, 'Parameters:', sum(p.numel() for p in net.parameters() if p.requires_grad)) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/losses_levit.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/losses_levit.py new file mode 100644 index 0000000000..e1d5a224fe --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/levit/losses_levit.py @@ -0,0 +1,127 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch.nn import functional as F + + +class DistillationLoss(torch.nn.Module): + """ + This module wraps a standard criterion and adds an extra knowledge distillation loss by + taking a teacher model prediction and using it as additional supervision. 
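+
+    Supported distillation_type values are 'none', 'soft', 'hard' and 'cls', plus the 'mode1',
+    'mode2', 'mode4' and 'mutual' variants. 'none' skips distillation, 'soft' uses a KL divergence
+    at temperature `tau`, and the remaining types use hard (argmax-based) cross-entropy targets.
+    The returned loss is base_loss * (1 - alpha) + distillation_loss * alpha.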
+ """ + + def __init__(self, base_criterion: torch.nn.Module, teacher_model: torch.nn.Module, + distillation_type: str, alpha: float, tau: float): + super().__init__() + self.base_criterion = base_criterion + self.teacher_model = teacher_model + assert distillation_type in ['none', 'soft', 'hard', 'mode1', 'mode2', 'mutual', 'cls', 'mode4'] + self.distillation_type = distillation_type + self.alpha = alpha + self.tau = tau + + def forward(self, inputs, outputs, labels): + """ + Args: + inputs: The original inputs that are feed to the teacher model + outputs: the outputs of the model to be trained. It is expected to be + either a Tensor, or a Tuple[Tensor, Tensor], with the original output + in the first position and the distillation predictions as the second output + labels: the labels for the base criterion + """ + outputs_kd = None + if not isinstance(outputs, torch.Tensor): + # assume that the model outputs a tuple of [outputs, outputs_kd] + if self.distillation_type == 'mode1': + outputs, outputs_kd, outn2, outn3, outn4 = outputs + elif self.distillation_type == 'mode2': + outputs, outputs_kd, outn2, outn3, outn4, outmg = outputs + elif self.distillation_type == 'mode4': + outputs, outputs_kd, outn2, outn3 = outputs + elif self.distillation_type == 'mutual': + outputs, outputs_kd, outm = outputs + elif self.distillation_type == 'cls': + outputs, outputs_kd, outc, outc_kd = outputs + else: + outputs, outputs_kd = outputs + base_loss = self.base_criterion(outputs, labels) + if self.distillation_type == 'none': + return base_loss + elif self.distillation_type == 'mode1': + base_loss += self.base_criterion(outn2, labels) + base_loss += self.base_criterion(outn3, labels) + base_loss += self.base_criterion(outn4, labels) + base_loss = base_loss / 4.0 + elif self.distillation_type == 'mode2': + base_loss += self.base_criterion(outn2, labels) + base_loss += self.base_criterion(outn4, labels) + base_loss = base_loss / 3.0 + elif self.distillation_type == 'mode4': + base_loss += self.base_criterion(outn2, labels) + base_loss += self.base_criterion(outn3, labels) + base_loss = base_loss / 3.0 + elif self.distillation_type == 'mutual': + base_loss += self.base_criterion(outm, labels) + base_loss = base_loss / 2.0 + elif self.distillation_type == 'cls': + base_loss = base_loss + self.base_criterion(outc, labels) + base_loss = base_loss / 2.0 + + if outputs_kd is None: + raise ValueError("When knowledge distillation is enabled, the model is " + "expected to return a Tuple[Tensor, Tensor] with the output of the " + "class_token and the dist_token") + # don't backprop throught the teacher + with torch.no_grad(): + teacher_outputs = self.teacher_model(inputs) + + if self.distillation_type == 'soft': + T = self.tau + # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # with slight modifications + distillation_loss = F.kl_div( + F.log_softmax(outputs_kd / T, dim=1), + F.log_softmax(teacher_outputs / T, dim=1), + reduction='sum', + log_target=True + ) * (T * T) / outputs_kd.numel() + elif self.distillation_type == 'hard': + distillation_loss = F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + elif self.distillation_type == 'mode2': + distillation_loss = F.cross_entropy( + outn3, outmg.argmax(dim=1)) + F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + distillation_loss = distillation_loss / 2.0 + elif self.distillation_type == 'mode1': + distillation_loss = F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + elif 
self.distillation_type == 'mutual': + distillation_loss = F.cross_entropy( + outm, teacher_outputs.argmax(dim=1)) + F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + distillation_loss = distillation_loss / 2.0 + elif self.distillation_type == 'cls': + distillation_loss = F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + F.cross_entropy( + outc_kd, teacher_outputs.argmax(dim=1)) + distillation_loss = distillation_loss / 2.0 + elif self.distillation_type == 'mode4': + distillation_loss = F.cross_entropy( + outputs_kd, teacher_outputs.argmax(dim=1)) + + loss = base_loss * (1 - self.alpha) + distillation_loss * self.alpha + return loss diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/main_levit.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/main_levit.py new file mode 100644 index 0000000000..9d3c2a145b --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/main_levit.py @@ -0,0 +1,506 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import datetime +import numpy as np +import time +import torch +import torch.backends.cudnn as cudnn +import json +import os + +from pathlib import Path + +from timm.data import Mixup +from timm.models import create_model +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from timm.scheduler import create_scheduler +from timm.optim import create_optimizer +from timm.utils import ApexScaler, get_state_dict, ModelEma + +from datasets import build_dataset +from engine_levit import train_one_epoch, evaluate +from levit.losses_levit import DistillationLoss +from samplers import RASampler +import utils + +from levit import evo_levit +from levit import evo_levit_384 +try: + # noinspection PyUnresolvedReferences + from apex import amp +except ImportError: + amp = None + +import torch_npu + +def get_args_parser(): + parser = argparse.ArgumentParser( + 'LeViT training and evaluation script', add_help=False) + parser.add_argument('--batch-size', default=256, type=int) + parser.add_argument('--epochs', default=300, type=int) + + # Model parameters + parser.add_argument('--model', default='LeViT_256', type=str, metavar='MODEL', + help='Name of model to train') + parser.add_argument('--input-size', default=224, + type=int, help='images input size') + + parser.add_argument('--model-ema', action='store_true') + parser.add_argument( + '--no-model-ema', action='store_false', dest='model_ema') + parser.set_defaults(model_ema=True) + parser.add_argument('--model-ema-decay', type=float, + default=0.99996, help='') + parser.add_argument('--model-ema-force-cpu', + action='store_true', default=False, help='') + + # Optimizer parameters + parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') + parser.add_argument('--opt-eps', default=1e-8, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: 1e-8)') + parser.add_argument('--opt-betas', 
default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--clip-grad', type=float, default=0.01, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--clip-mode', type=str, default='agc', + help='Gradient clipping mode. One of ("norm", "value", "agc")') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='SGD momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.025, + help='weight decay (default: 0.025)') + # Learning rate schedule parameters + parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') + parser.add_argument('--lr', type=float, default=5e-4, metavar='LR', + help='learning rate (default: 5e-4)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 1e-6)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation parameters + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". 
" + \ + "(default: rand-m9-mstd0.5-inc1)'), + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='bicubic', + help='Training interpolation (random, bilinear, bicubic default: "bicubic")') + + parser.add_argument('--repeated-aug', action='store_true') + parser.add_argument('--no-repeated-aug', + action='store_false', dest='repeated_aug') + parser.set_defaults(repeated_aug=True) + + # * Random Erase params + parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') + parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + + # * Mixup params + parser.add_argument('--mixup', type=float, default=0.8, + help='mixup alpha, mixup enabled if > 0. (default: 0.8)') + parser.add_argument('--cutmix', type=float, default=1.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 1.0)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + + # Distillation parameters + parser.add_argument('--teacher-model', default='regnety_160', type=str, metavar='MODEL', + help='Name of teacher model to train (default: "regnety_160"') + parser.add_argument('--teacher-path', type=str, + default='./regnety_160-a5fe301d.pth') + parser.add_argument('--distillation-type', default='cls', + choices=['none', 'soft', 'hard', 'cls'], type=str, help="") + parser.add_argument('--distillation-alpha', + default=0.5, type=float, help="") + parser.add_argument('--distillation-tau', default=1.0, type=float, help="") + + # * Finetuning params + parser.add_argument('--finetune', default='', + help='finetune from checkpoint') + + # Dataset parameters + parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--data-set', default='IMNET', choices=['CIFAR', 'IMNET', 'INAT', 'INAT19'], + type=str, help='Image Net dataset path') + parser.add_argument('--inat-category', default='name', + choices=['kingdom', 'phylum', 'class', 'order', + 'supercategory', 'family', 'genus', 'name'], + type=str, help='semantic granularity') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint')#/home/zym/save/checkpoint.pth + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true', + help='Perform evaluation only') + parser.add_argument('--dist-eval', action='store_true', + default=False, help='Enabling distributed evaluation') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin-mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no-pin-mem', action='store_false', dest='pin_mem', + help='') + parser.set_defaults(pin_mem=True)#默认为true + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', + help='url used to set up distributed training') + parser.add_argument('--local_rank', default=-1) # 新增 + # training strategy + parser.add_argument('--prune-warmup-epoch', type=int, default=0, + help='epoch to start pruning') + + #add apex + parser.add_argument('--apex', default=True,action='store_true', + help='Use apex for mixed precision training') + parser.add_argument('--apex_opt_level', default='O2', type=str, + help='For apex mixed precision training' + 'O0 for FP32 training, O1 for mixed precision training.' 
+ 'For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet') + parser.add_argument('--loss_scale_value', default=1024., type=float, + help='loss scale using in amp, default -1 means dynamic') + + #add fps or acc + parser.add_argument('--train_type', default='acc', type=str, + help='support 1p performance or accuracy') + return parser + + +def main(args): + utils.init_distributed_mode(args) + + print(args) + + if args.distillation_type != 'none' and args.finetune and not args.eval: + raise NotImplementedError( + "Finetuning with distillation not yet supported") + + + local_rank = utils.get_rank() + device = torch.device(f'npu:{local_rank}') + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + # random.seed(seed) + + cudnn.benchmark = True + + dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) + dataset_val, _ = build_dataset(is_train=False, args=args) + + if True: # args.distributed: + num_tasks = utils.get_world_size() + global_rank = utils.get_rank() + if args.repeated_aug: + sampler_train = RASampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + else: + sampler_train = torch.utils.data.DistributedSampler( + dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. ' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + data_loader_train = torch.utils.data.DataLoader( + dataset_train, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=True, + ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=int(1.5 * args.batch_size), + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False + ) + + mixup_fn = None + mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None + if mixup_active: + mixup_fn = Mixup( + mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, + prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, + label_smoothing=args.smoothing, num_classes=args.nb_classes) + + print(f"Creating model: {args.model}") + + model = evo_levit_384.EvoLeViT_256_384( + num_classes=args.nb_classes, + distillation=(args.distillation_type != 'none'), + #pretrained=args.eval, + fuse=args.eval, + ) + print("device:", device) + model.to(device) + # learning rate 的步长 + linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0 + args.lr = linear_scaled_lr + optimizer = create_optimizer(args, model) + model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale=128.0 , combine_grad=True) + + if args.finetune: + if args.finetune.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.finetune, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.finetune, map_location='cpu') + + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias', + 'head_dist.weight', 'head_dist.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + model.load_state_dict(checkpoint_model, strict=False) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but + # before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume='') + + model_without_ddp = model + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu], find_unused_parameters=True,broadcast_buffers=False)#相较于GPU添加了broadcast_buffers=False + model_without_ddp = model.module + + + + n_parameters = sum(p.numel() + for p in model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + + loss_scaler = ApexScaler() + + lr_scheduler, _ = create_scheduler(args, optimizer) + + criterion = LabelSmoothingCrossEntropy() + + if args.mixup > 0.: + # smoothing is handled with mixup label transform + criterion = SoftTargetCrossEntropy() + elif args.smoothing: + criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing) + else: + criterion = torch.nn.CrossEntropyLoss() + + teacher_model = None + if args.distillation_type != 'none': + assert args.teacher_path, 'need to specify teacher-path when using distillation' + print(f"Creating teacher model: {args.teacher_model}") + teacher_model = create_model( + args.teacher_model, + pretrained=False, + num_classes=args.nb_classes, + global_pool='avg', + ) + if args.teacher_path.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.teacher_path, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.teacher_path, map_location='cpu') + teacher_model.load_state_dict(checkpoint['model']) + teacher_model.to(device) + teacher_model.eval() + + # wrap the criterion in our custom DistillationLoss, which + # just dispatches to the original criterion if args.distillation_type is + # 'none' + criterion = DistillationLoss( + criterion, teacher_model, args.distillation_type, args.distillation_alpha, args.distillation_tau + ) + + output_dir = Path(args.output_dir) + flag = os.path.exists(args.resume) + if 
args.resume and flag: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + + model_without_ddp.load_state_dict(checkpoint['model'], strict=False) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + if args.model_ema: + utils._load_checkpoint_for_ema( + model_ema, checkpoint['model_ema']) + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + if args.eval: + print("now eval...") + test_stats = evaluate(data_loader_val, model, device) + print( + f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + return + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + max_accuracy = 0.0 + FPS = 0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + + if args.prune_warmup_epoch != 0: + if epoch < args.prune_warmup_epoch: + model.module.set_prune_ratio('no_prune') + if epoch == args.prune_warmup_epoch: + model.module.set_prune_ratio('prune') + + train_stats, fps_epoch = train_one_epoch( + args, + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + args.clip_grad, args.clip_mode, model_ema, mixup_fn, + set_training_mode=args.finetune == '' # keep in eval mode during finetuning + ) + FPS = FPS + fps_epoch + lr_scheduler.step(epoch) + if args.output_dir: + checkpoint_paths = [output_dir / 'checkpoint.pth'] + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'model_ema': get_state_dict(model_ema), + 'scaler': loss_scaler.state_dict(), + 'args': args, + }, checkpoint_path) + if epoch % 10 == 9 or epoch > 290: + test_stats = evaluate(data_loader_val, model, device) + print( + f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + max_accuracy = max(max_accuracy, test_stats["acc1"]) + print(f'Max accuracy: {max_accuracy:.2f}%') + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters, + 'fps':fps_epoch}#新增日志文件打印fps + + # save best + if test_stats["acc1"] == max_accuracy and args.output_dir: + checkpoint_paths = [output_dir / 'best_checkpoint.pth'] + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'model_ema': get_state_dict(model_ema), + 'scaler': loss_scaler.state_dict(), + 'args': args, + }, checkpoint_path) + + else: + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters, + 'fps':fps_epoch}#新增日志文件打印fps + + if args.output_dir and utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + print(f"Average 
FPS {FPS / args.epochs}") + + + +if __name__ == '__main__': + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"] = "enable" # cache + option["ACL_OP_COMPILER_CACHE_DIR"] = "./cache" + # option={"autotune": "enable", "autotunegraphdumppath": "./graphs"} + torch.npu.set_option(option) + parser = argparse.ArgumentParser( + 'LeViT training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/samplers.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/samplers.py new file mode 100644 index 0000000000..640305632c --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/samplers.py @@ -0,0 +1,74 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.distributed as dist +import math + + +class RASampler(torch.utils.data.Sampler): + """Sampler that restricts data loading to a subset of the dataset for distributed, + with repeated augmentation. + It ensures that different each augmented version of a sample will be visible to a + different process (GPU) + Heavily based on torch.utils.data.DistributedSampler + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError( + "Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) + self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices = [ele for ele in indices for i in range(3)] + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices[:self.num_selected_samples]) + + def __len__(self): + return self.num_selected_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/env_npu.sh 
b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/env_npu.sh new file mode 100644 index 0000000000..084782cac1 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/env_npu.sh @@ -0,0 +1,68 @@ +#!/bin/bash +CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' + +if [ -f $CANN_INSTALL_PATH_CONF ]; then + CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2) +else + CANN_INSTALL_PATH="/usr/local/Ascend" +fi + +if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then + source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh +else + source ${CANN_INSTALL_PATH}/nnae/set_env.sh +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=0 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |awk '{print $1}') + +#设置device侧日志登记为error +msnpureport -g error -d 0 +msnpureport -g error -d 1 +msnpureport -g error -d 2 +msnpureport -g error -d 3 +msnpureport -g error -d 4 +msnpureport -g error -d 5 +msnpureport -g error -d 6 +msnpureport -g error -d 7 +#关闭Device侧Event日志 +msnpureport -e disable + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_1P.sh b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_1P.sh new file mode 100644 index 0000000000..6fe8be9240 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_1P.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Evo-Levit_256_384" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=1 +export WORLD_SIZE=1 +# 路径参数初始化 +#配置dataset的路径,请按照实际情况填写测试路径 +data_path="" +#配置输出checkpoint文件的路径,请按照实际情况填写测试路径 +cur_path=`pwd` +output_dir=${cur_path}/save +#参数校验,data_path和output_dir为必传参数 +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
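+    # re-evaluate cur_path so that it now points at the model root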
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main_levit.py \ + --model EvoLeViT_256_384 \ + --input-size 384 \ + --batch-size 128 \ + --data_path ${data_path} \ + --output_dir ${output_dir}> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep 'fps:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "fps:" '{print $NF}'` +FPS=${FPS#* } +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长` +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'Averaged' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_8P.sh b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_8P.sh new file mode 100644 index 0000000000..072837f5e9 --- /dev/null +++ 
b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_full_8P.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Evo-Levit_256_384" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=8 +export WORLD_SIZE=8 +# 路径参数初始化 +#配置dataset的路径,请按照实际情况填写测试路径 +data_path="" +#配置输出checkpoint文件的路径,请按照实际情况填写测试路径 +cur_path=`pwd` +output_dir=${cur_path}/save +#参数校验,data_path和output_dir为必传参数 +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3 -m torch.distributed.launch --nproc_per_node=8 main_levit.py \ + --model EvoLeViT_256_384 \ + --input-size 384 \ + --batch-size 128 \ + --data_path ${data_path} \ + --output_dir ${output_dir}> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep 'fps:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "fps:" '{print $NF}'` +FPS=${FPS#* } +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'Averaged' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_1P.sh b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_1P.sh new file mode 100644 index 0000000000..337c292d63 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_1P.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Evo-Levit_256_384" +# 训练batch_size +batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=1 +export WORLD_SIZE=1 +# 路径参数初始化 +#配置dataset的路径,请按照实际情况填写测试路径 +data_path="" +#配置输出checkpoint文件的路径,请按照实际情况填写测试路径 +cur_path=`pwd` +output_dir=${cur_path}/save +#参数校验,data_path和output_dir为必传参数 +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 -m torch.distributed.launch --nproc_per_node=1 main_levit.py \ + --model EvoLeViT_256_384 \ + --input-size 384 \ + --batch-size 128 \ + --epochs 1 \ + --train_type fps\ + --data_path ${data_path} \ + --output_dir ${output_dir}> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep 'fps:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "fps:" '{print $NF}'` +FPS=${FPS#* } +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#打印,不需要修改 +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'Averaged' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_8P.sh b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_8P.sh new file mode 100644 index 0000000000..654e170197 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/test/train_performance_8P.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Evo-Levit_256_384" +# 训练batch_size 
+batch_size=128 +# 训练使用的npu卡数 +export RANK_SIZE=8 +export WORLD_SIZE=8 +# 路径参数初始化 +#配置dataset的路径,请按照实际情况填写测试路径 +data_path="" +#配置输出checkpoint文件的路径,请按照实际情况填写测试路径 +cur_path=`pwd` +output_dir=${cur_path}/save +#参数校验,data_path和output_dir为必传参数 +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +###############checkpoint的保存路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +output_path=${cur_path}/save +if [ -d ${cur_path}/save ];then + rm -rf ${cur_path}/save + mkdir -p ${cur_path}/save +else + mkdir -p ${cur_path}/save +fi + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3 -m torch.distributed.launch --nproc_per_node=8 main_levit.py \ + --model EvoLeViT_256_384 \ + --input-size 384 \ + --batch-size 128 \ + --epochs 1 \ + --train_type fps\ + --data_path ${data_path} \ + --output_dir ${output_dir}> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +##################获取训练数据################ + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep 'fps:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk -F "fps:" '{print $NF}'` +FPS=${FPS#* } +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $3}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep 'Averaged' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss: " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/utils.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/utils.py new file mode 100644 index 0000000000..648d491803 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/utils.py @@ -0,0 +1,277 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +import time +from collections import defaultdict, deque +import datetime + +import torch +import torch.distributed as dist +import torch_npu + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
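+        All-reduces only count and total across ranks, so global_avg becomes a
+        cross-process statistic while the windowed values (median, avg, max) stay local.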
+ """ + + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], + dtype=torch.float32, device='npu') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + # print('count and total is: {}, {}'.format(int(t[0]),int(t[1]))) + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch_npu.npu.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch_npu.npu.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=0)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def _load_checkpoint_for_ema(model_ema, checkpoint): + """ + Workaround for ModelEma._load_checkpoint to accept an already-loaded object + """ + mem_file = io.BytesIO() + torch.save(checkpoint, mem_file) + mem_file.seek(0) + model_ema._load_checkpoint(mem_file) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process 
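+    Non-master ranks can still emit output by passing force=True to print().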
+ """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.local_rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + print(f"RANK and WORLD_SIZE in environ: {args.local_rank}/{args.world_size}") + elif 'SLURM_PROCID' in os.environ: + args.local_rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.local_rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + + args.distributed = True + + #NPU + torch_npu.npu.set_device(args.gpu) + args.dist_backend = 'hccl' + + print('| distributed init (rank {}): {}'.format( + args.local_rank, args.dist_url), flush=True) + #NPU + torch.distributed.init_process_group(backend=args.dist_backend, world_size=args.world_size, rank=args.local_rank) + torch.distributed.barrier() + setup_for_distributed(args.local_rank == 0) + + +def replace_batchnorm(net): + for child_name, child in net.named_children(): + if hasattr(child, 'fuse'): + setattr(net, child_name, child.fuse()) + elif isinstance(child, torch.nn.Conv2d): + child.bias = torch.nn.Parameter(torch.zeros(child.weight.size(0))) + elif isinstance(child, torch.nn.BatchNorm2d): + setattr(net, child_name, torch.nn.Identity()) + else: + replace_batchnorm(child) + + +def replace_layernorm(net): + import apex + for child_name, child in net.named_children(): + if isinstance(child, torch.nn.LayerNorm): + setattr(net, child_name, apex.normalization.FusedLayerNorm( + child.weight.size(0))) + else: + replace_layernorm(child) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/visualize.py b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/visualize.py new file mode 100644 index 0000000000..c91d13b433 --- /dev/null +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/visualize.py @@ -0,0 +1,599 @@ +# encoding=utf-8 +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
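+# Token-selection visualization: a 224x224 input is treated as a 14x14 grid of
+# 16x16 patches; patches whose indices are not in the model's keep list are faded
+# towards white and the overlay is saved next to the original image. vis_single()
+# handles one image given via --img-path, vis_batch() samples images from the
+# validation set.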
+ +import argparse +import numpy as np +import torch.backends.cudnn as cudnn + +from pathlib import Path + +from timm.data import Mixup +from timm.models import create_model +from timm.scheduler import create_scheduler +from timm.optim import create_optimizer +from timm.utils import NativeScaler, get_state_dict, ModelEma + +from datasets import build_dataset2, get_post_process + +import utils + +from timm.utils import accuracy, ModelEma +from torchvision import utils as vutils + +import torch +from torchvision import transforms + +from PIL import Image +import os + +from deit import evo_deit_vis + + +def get_transform(input_size): + t = [] + resize_im = (input_size != 224) + if resize_im: + size = int((256 / 224) * args.input_size) + t.append( + transforms.Resize(size, interpolation=3), # to maintain same ratio w.r.t. 224 images + ) + t.append(transforms.CenterCrop(args.input_size)) + t.append(transforms.ToTensor()) + else: + t.append(transforms.ToTensor()) + + return transforms.Compose(t) + + +def get_keep_indices(decisions): + keep_indices = [] + for i in range(3): + if i == 0: + keep_indices.append(decisions[i]) + else: + keep_indices.append(keep_indices[-1][decisions[i]]) + return keep_indices + + +def gen_masked_tokens(tokens, indices, alpha=0.3): + indices = [i for i in range(196) if i not in indices] + tokens = tokens.copy() + tokens[indices] = alpha * tokens[indices] + (1 - alpha) * 255 + return tokens + + +def recover_image(tokens): + # image: (C, 196, 16, 16) + image = tokens.reshape(14, 14, 16, 16, 3).swapaxes(1, 2).reshape(224, 224, 3) + return image + + +def gen_visualization(image, keep_indices): + # keep_indices = get_keep_indices(decisions) + image_tokens = image.reshape(14, 16, 14, 16, 3).swapaxes(1, 2).reshape(196, 16, 16, 3) + + viz = recover_image(gen_masked_tokens(image_tokens, keep_indices)) + return viz + + +def get_args_parser(): + parser = argparse.ArgumentParser('DeiT training and evaluation script', add_help=False) + parser.add_argument('--batch-size', default=64, type=int) + parser.add_argument('--epochs', default=300, type=int) + + # Model parameters + parser.add_argument('--model', default='deit_base_patch16_224', type=str, metavar='MODEL', + help='Name of model to train') + parser.add_argument('--input-size', default=224, type=int, help='images input size') + + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT', + help='Drop path rate (default: 0.1)') + + parser.add_argument('--model-ema', action='store_true') + parser.add_argument('--no-model-ema', action='store_false', dest='model_ema') + parser.set_defaults(model_ema=True) + parser.add_argument('--model-ema-decay', type=float, default=0.99996, help='') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, help='') + + # Optimizer parameters + parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "adamw"') + parser.add_argument('--opt-eps', default=1e-8, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: 1e-8)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='SGD 
momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.05, + help='weight decay (default: 0.05)') + # Learning rate schedule parameters + parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "cosine"') + parser.add_argument('--lr', type=float, default=5e-4, metavar='LR', + help='learning rate (default: 5e-4)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR', + help='warmup learning rate (default: 1e-6)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=5, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation parameters + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". " + \ + "(default: rand-m9-mstd0.5-inc1)'), + parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='bicubic', + help='Training interpolation (random, bilinear, bicubic default: "bicubic")') + + parser.add_argument('--repeated-aug', action='store_true') + parser.add_argument('--no-repeated-aug', action='store_false', dest='repeated_aug') + parser.set_defaults(repeated_aug=True) + + # * Random Erase params + parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', + help='Random erase prob (default: 0.25)') + parser.add_argument('--remode', type=str, default='pixel', + help='Random erase mode (default: "pixel")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + + # * Mixup params + parser.add_argument('--mixup', type=float, default=0.8, + help='mixup alpha, mixup enabled if > 0. (default: 0.8)') + parser.add_argument('--cutmix', type=float, default=1.0, + help='cutmix alpha, cutmix enabled if > 0. 
(default: 1.0)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + + # Distillation parameters + parser.add_argument('--teacher-model', default='regnety_160', type=str, metavar='MODEL', + help='Name of teacher model to train (default: "regnety_160"') + parser.add_argument('--teacher-path', type=str, default='') + parser.add_argument('--distillation-type', default='none', choices=['none', 'soft', 'hard'], type=str, help="") + parser.add_argument('--distillation-alpha', default=0.5, type=float, help="") + parser.add_argument('--distillation-tau', default=1.0, type=float, help="") + + # * Finetuning params + parser.add_argument('--finetune', default='', help='finetune from checkpoint') + + # Dataset parameters + parser.add_argument('--data-path', default='/datasets01/imagenet_full_size/061417/', type=str, + help='dataset path') + parser.add_argument('--data-set', default='IMNET', choices=['CIFAR', 'IMNET', 'INAT', 'INAT19'], + type=str, help='Image Net dataset path') + parser.add_argument('--inat-category', default='name', + choices=['kingdom', 'phylum', 'class', 'order', 'supercategory', 'family', 'genus', 'name'], + type=str, help='semantic granularity') + + parser.add_argument('--output_dir', default='./test_img/', help='path where to save') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=0, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_false', help='Perform evaluation only') + parser.add_argument('--dist-eval', action='store_true', default=False, help='Enabling distributed evaluation') + parser.add_argument('--num_workers', default=10, type=int) + parser.add_argument('--pin-mem', action='store_true', + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') + parser.add_argument('--no-pin-mem', action='store_false', dest='pin_mem', + help='') + parser.set_defaults(pin_mem=True) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + parser.add_argument('--excel_filename', type=str, default='attention_matrix_cls', help='filename of saving excel') + + # visualization + parser.add_argument('--img-path', default='', type=str, + help='path to images to be visualized. Set '' to visualize batch images in imagenet val.') + parser.add_argument('--save-name', default='', type=str, + help='name to save when visualizing a single image. 
Set '' to save name as the original image.') + parser.add_argument('--layer-wise-prune', action='store_true', + help='set true when visualize a model trained without layer to stage training strategy') + return parser + + +def save_image_tensor(input_tensor: torch.Tensor, filename): + """ + 将tensor保存为图片 + :param input_tensor: 要保存的tensor + :param filename: 保存的文件名 + """ + assert ((len(input_tensor.shape) == 4 and input_tensor.shape[0] == 1) or len(input_tensor.shape) == 3) + # 复制一份 + input_tensor = input_tensor.clone().detach() + # 到cpu + input_tensor = input_tensor.to(torch.device('cpu')) + # 反归一化 + # input_tensor = unnormalize(input_tensor) + vutils.save_image(input_tensor, filename) + + +@torch.no_grad() +def visualize_single_img(img_input, model, device, transform, post_process, save_name): + model.eval() + # set stage_wise_prune = True if the trained model is under layer-to-stage training strategy + model.stage_wise_prune = not args.layer_wise_prune + + # img: 1, 3, H, W + image_raw = transform(img_input) + save_image_tensor(image_raw, Path(args.output_dir, '{}.jpg'.format(save_name))) + images = post_process(image_raw) + images = images.unsqueeze(0) + images = images.to(device, non_blocking=True) + print(images.shape) + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + vis_dict = model.get_vis_dict() + image_raw = image_raw * 255 + image_raw = image_raw.squeeze(0).permute(1, 2, 0).cpu().numpy() + for k in vis_dict: + keep_indices = vis_dict[k] + viz = gen_visualization(image_raw, keep_indices) + viz = torch.from_numpy(viz).permute(2, 0, 1) + + viz = viz / 255 + + save_image_tensor(viz, + Path(args.output_dir, '{}_{}.jpg'.format(save_name, k))) + print("Visualization finished") + + +@torch.no_grad() +def visualize(data_loader, model, device, post_process): + criterion = torch.nn.CrossEntropyLoss() + + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + + # switch to evaluation mode + model.eval() + + # set stage_wise_prune = True if the trained model is under layer-to-stage training strategy + model.stage_wise_prune = not args.layer_wise_prune + + for images_raw_full, target_full in metric_logger.log_every(data_loader, 10, header): + B = images_raw_full.shape[0] + for index in range(B): + images_raw = images_raw_full[index:index + 1] + target = target_full[index:index + 1] + assert images_raw.shape[0] == 1 + images = post_process(images_raw) + + name = 'label{}_seed{}_index{}.jpg'.format(str(target.item()), int(args.seed), index) + save_image_tensor(images_raw, Path(args.output_dir, name)) + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + with torch.cuda.amp.autocast(): + output = model(images) + vis_dict = model.get_vis_dict() + loss = criterion(output, target) + + images_raw = images_raw * 255 + images_raw = images_raw.squeeze(0).permute(1, 2, 0).cpu().numpy() + # if np.max(images_raw) > 3: + # images_raw = images_raw / 255 + + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + if acc1 == 0: + judger = 'wrong' + elif acc1 == 100: + judger = 'right' + else: + raise ValueError('xxxx') + + for k in vis_dict: + keep_indices = vis_dict[k] + viz = gen_visualization(images_raw, keep_indices) + viz = torch.from_numpy(viz).permute(2, 0, 1) + viz = viz / 255 + + name = 'label{}_seed{}_{}_index{}_{}.jpg'.format( + str(target.item()), + int(args.seed), k, index, judger) + save_image_tensor(viz, Path(args.output_dir, name)) + + batch_size = images.shape[0] + 
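+            # each inner iteration evaluates exactly one image, so batch_size is 1 here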
metric_logger.update(loss=loss.item()) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + print("Visualization finished") + break + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}' + .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss)) + + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +def vis_single(args): + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + torch.manual_seed(seed) + np.random.seed(seed) + # random.seed(seed) + + cudnn.benchmark = True + + transform = get_transform(input_size=224) # set input_size to other value if the test image is not 224*224 + post_process = get_post_process() + + print("Creating model: {args.model}") + model = create_model( + args.model, + pretrained=False, + num_classes=1000, + drop_rate=args.drop, + drop_path_rate=args.drop_path, + drop_block_rate=None, + ) + + model.to(device) + + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + + model.load_state_dict(checkpoint['model']) + + img_input = Image.open(args.img_path) + if args.save_name == '': + save_name = os.path.basename(args.img_path).split('.')[0] + else: + save_name = args.save_name + if args.eval: + test_stats = visualize_single_img(img_input, model, device, transform, post_process, save_name=save_name) + return + + +def vis_batch(args): + utils.init_distributed_mode(args) + print(args) + + if args.distillation_type != 'none' and args.finetune and not args.eval: + raise NotImplementedError("Finetuning with distillation not yet supported") + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + torch.manual_seed(seed) + np.random.seed(seed) + # random.seed(seed) + + cudnn.benchmark = True + + # dataset_train, args.nb_classes = build_dataset(is_train=True, args=args) + dataset_val, args.nb_classes = build_dataset2(is_train=False, args=args) + post_process = get_post_process() + + if True: # args.distributed: + num_tasks = utils.get_world_size() + global_rank = utils.get_rank() + # if args.repeated_aug: + # sampler_train = RASampler( + # dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + # ) + # else: + # sampler_train = torch.utils.data.DistributedSampler( + # dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True + # ) + if args.dist_eval: + if len(dataset_val) % num_tasks != 0: + print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. 
' + 'This will slightly alter validation results as extra duplicate entries are added to achieve ' + 'equal num of samples per-process.') + # sampler_val = torch.utils.data.DistributedSampler( + # dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False) + sampler_val = torch.utils.data.DistributedSampler( + dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=True) + else: + # sampler_val = torch.utils.data.SequentialSampler(dataset_val) + sampler_val = torch.utils.data.RandomSampler(dataset_val) + else: + # sampler_train = torch.utils.data.RandomSampler(dataset_train) + # sampler_val = torch.utils.data.SequentialSampler(dataset_val) + sampler_val = torch.utils.data.RandomSampler(dataset_val) + # data_loader_train = torch.utils.data.DataLoader( + # dataset_train, sampler=sampler_train, + # batch_size=args.batch_size, + # num_workers=args.num_workers, + # pin_memory=args.pin_mem, + # drop_last=True, + # ) + + data_loader_val = torch.utils.data.DataLoader( + dataset_val, sampler=sampler_val, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=args.pin_mem, + drop_last=False, + ) + + print("Creating model: {args.model}") + model = create_model( + args.model, + pretrained=False, + num_classes=args.nb_classes, + drop_rate=args.drop, + drop_path_rate=args.drop_path, + drop_block_rate=None, + ) + + if args.finetune: + if args.finetune.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.finetune, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.finetune, map_location='cpu') + + checkpoint_model = checkpoint['model'] + state_dict = model.state_dict() + for k in ['head.weight', 'head.bias', 'head_dist.weight', 'head_dist.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print("Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + # interpolate position embedding + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed + + model.load_state_dict(checkpoint_model, strict=False) + + model.to(device) + + model_ema = None + if args.model_ema: + # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper + model_ema = ModelEma( + model, + decay=args.model_ema_decay, + device='cpu' if args.model_ema_force_cpu else '', + resume='') + + model_without_ddp = model + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + n_parameters = sum(p.numel() for p in 
model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + linear_scaled_lr = args.lr * args.batch_size * utils.get_world_size() / 512.0 + args.lr = linear_scaled_lr + optimizer = create_optimizer(args, model_without_ddp) + loss_scaler = NativeScaler() + + lr_scheduler, _ = create_scheduler(args, optimizer) + + if args.distillation_type != 'none': + assert args.teacher_path, 'need to specify teacher-path when using distillation' + print("Creating teacher model: {args.teacher_model}") + teacher_model = create_model( + args.teacher_model, + pretrained=False, + num_classes=args.nb_classes, + global_pool='avg', + ) + if args.teacher_path.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.teacher_path, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.teacher_path, map_location='cpu') + teacher_model.load_state_dict(checkpoint['model']) + teacher_model.to(device) + teacher_model.eval() + + # wrap the criterion in our custom DistillationLoss, which + # just dispatches to the original criterion if args.distillation_type is 'none' + + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + if args.model_ema: + utils._load_checkpoint_for_ema(model_ema, checkpoint['model_ema']) + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + + if args.eval: + test_stats = visualize(data_loader_val, model, device, post_process=post_process) + print("Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%") + return + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DeiT training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + args.eval = True + + if args.img_path == '': + # To visualize batch images of imagenet val, please run this: + vis_batch(args) + else: + # To visualize a single image, please run this: + vis_single(args) -- Gitee From 3721892ba8b8973b399aeac036be90cb7dd8e0a6 Mon Sep 17 00:00:00 2001 From: zhangyanmin <2716635239@qq.com> Date: Wed, 9 Nov 2022 14:00:37 +0000 Subject: [PATCH 2/3] updata README --- .../Evo-Levit_256_384/README.md | 248 ++++++++++-------- 1 file changed, 139 insertions(+), 109 deletions(-) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md index 51c8512781..4b449cde2e 100644 --- a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md @@ -1,154 +1,184 @@ # Evo-Levit for PyTorch - -[TOC] +- [概述](#概述) +- [准备训练环境](#准备训练环境) +- [开始训练](#开始训练) +- [训练结果展示](#训练结果展示) +- [版本说明](#版本说明) # 概述 ## 简述 -Evo-ViT的具体框架设计,包括基于全局class attention的token选择以及慢速-快速双流token更新两个模块。其根据全局class attention的排序判断高信息token和低信息token,将低信息token整合为一个归纳token,和高信息token一起输入到原始多头注意力(Multi-head Self-Attention, MSA)模块以及前向传播(Fast Fed-forward Network, FFN)模块中进行精细更新。更新后的归纳token用来快速更新低信息token。全局class 
attention也在精细更新过程中进行同步更新变化。 +Evo-ViT的具体框架设计,包括基于全局class attention的token选择以及慢速、快速双流token更新两个模块。其根据全局class attention的排序判断高信息token和低信息token,将低信息token整合为一个归纳token,和高信息token一起输入到原始多头注意力(Multi-head Self-Attention, MSA)模块以及前向传播(Fast Fed-forward Network, FFN)模块中进行精细更新。更新后的归纳token用来快速更新低信息token。全局class attention也在精细更新过程中进行同步更新变化。 + +- 参考实现: + + ``` + url=https://github.com/YifanXu74/Evo-ViT + commit_id=4c5d9b30b0a3c9b1e7b8687a9490555bd9d714ca + ``` -- 参考实现 -``` -url = https://github.com/YifanXu74/Evo-ViT.git -``` +- 适配昇腾 AI 处理器的实现: -- 适配昇腾AI处理器的实现 -- 通过Git获取代码方法如下 + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/contrib/cv/classification + ``` + +- 通过Git获取代码方法如下: -``` -git clone {url} # 克隆仓库的代码 -cd {code_path} # 切换到模型代码所在路径 -``` + ``` + git clone {url} # 克隆仓库的代码 + cd {code_path} # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 + ``` + +- 通过单击“立即下载”,下载源码包。 # 准备训练环境 ## 准备环境 -- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示 - - **表1** 版本配套表 - -| 配套 | 版本 | -| ---------- | ------------------------------------------------------------ | -| 固件与驱动 | [1.0.12](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | -| CANN | [5.0.3](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | -| PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) | - -- 安装依赖 - -``` -pip install timm==0.4.12 -pip install torchvision==0.9.1 -pip install torch_npu-1.8.1rc2.20220607-cp37-cp37m-linux_aarch64.whl -pip install torch-1.8.1+ascend.rc2.20220607-cp37-cp37m-linux_aarch64.whl -pip install apex-0.1+ascend.20220607-cp37-cp37m-linux_aarch64.whl -``` - -- 关于timm包的NPU优化补丁 - -``` -# 需要先cd到当前文件目录,一般timm包的安装位置在/usr/local/lib/python3.7/dist-packages/timm/ -#先后生成补丁并升级包 -diff -uN {timm_path}/data/mixup.py {code_path}/fix_timm/mixup.py >mixup.patch -diff -uN {timm_path}/optim/optim_factory.py {code_path}/fix_timm/optim_factory.py >optim.patch -patch -p0 {timm_path}/data/mixup.py mixup.patch -patch -p0 {timm_path}/optim/optim_factory.py optim.patch -``` - -## 数据集 - -1. 获取数据集 - -​ 选用的数据集是ImageNet,用户自行获取将数据集上传到服务器任意路径下并解压。 - -​ ImageNet数据集的目录结构参考如下所示 - -``` -├── ImageNet2012 - ├──train - ├──类别1 - │──图片1 - │──图片2 - │ ... - ├──类别2 - │──图片1 - │──图片2 - │ ... - ├──... - ├──val - ├──类别1 - │──图片1 - │──图片2 - │ ... - ├──类别2 - │──图片1 - │──图片2 - │ ... 
-``` - -## 获取Teacher checkpoint +- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示。 + + **表 1** 版本配套表 + + | 配套 | 版本 | + | ---------- | ------------------------------------------------------------ | + | 固件与驱动 | [1.0.17](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial ) | + | CANN | [6.0.RC1](https://www.hiascend.com/software/cann/commercial?version=6.0.RC1 ) | + | PyTorch | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) | + +- 环境准备指导。 + + 请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + +- 安装依赖。 + + ``` + pip install timm==0.4.12 + pip install torchvision==0.9.1 + pip install torch_npu-1.8.1rc2.20220607-cp37-cp37m-linux_aarch64.whl + pip install torch-1.8.1+ascend.rc2.20220607-cp37-cp37m-linux_aarch64.whl + pip install apex-0.1+ascend.20220607-cp37-cp37m-linux_aarch64.whl + ``` + +- 关于timm包的NPU优化补丁。 + + ``` + # 需要先cd到当前文件目录,一般timm包的安装位置在/usr/local/lib/python3.7/dist-packages/timm/ + #先后生成补丁并升级包 + diff -uN {timm_path}/data/mixup.py {code_path}/fix_timm/mixup.py >mixup.patch + diff -uN {timm_path}/optim/optim_factory.py {code_path}/fix_timm/optim_factory.py >optim.patch + patch -p0 {timm_path}/data/mixup.py mixup.patch + patch -p0 {timm_path}/optim/optim_factory.py optim.patch + ``` + + + + +## 准备数据集 + +1. 获取数据集。 + + 用户自行获取原始数据集ImageNet2012,将数据集上传到服务器任意路径下并解压。 + + 以ImageNet2012数据集为例,数据集目录结构参考如下所示。 + + ``` + ├── ImageNet2012 + ├──train + ├──类别1 + │──图片1 + │──图片2 + │ ... + ├──类别2 + │──图片1 + │──图片2 + │ ... + ├──... + ├──val + ├──类别1 + │──图片1 + │──图片2 + │ ... + ├──类别2 + │──图片1 + │──图片2 + │ ... + ``` + + > **说明:** + > 数据集路径以用户自行定义的路径为准 + +## 获取预训练模型 Evo-Vit模型训练需要配置teacher—model,获取方式为在GitHub的[Evo-Vit]([GitHub - YifanXu74/Evo-ViT: Official implement of Evo-ViT: Slow-Fast Token Evolution for Dynamic Vision Transformer](https://github.com/YifanXu74/Evo-ViT)),checkpoint文件可以在该仓库自行下载,也可以直接使用网址进行下载,网址如下 https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth +预训练模型需要放置在模型文件夹下,与main_levit.py或者README处于同级目录下。与源码中的与配置参数的默认值 ”./regnety_160-a5fe301d.pth“保持一致。 + # 开始训练 ## 训练模型 +1. 进入解压后的源码包根目录。 -1. 进入解压后的源码包根目录 + ``` + cd /${模型文件夹名称} + ``` -``` -cd /Evo-Levit_256_384 -``` +2. 运行训练脚本。 -2. 
运行训练脚本 + 该模型支持单机单卡训练和单机8卡训练,开始训练前,请用户根据实际路径配置data_path参数。 -该模型支持单机单卡训练和单机8卡训练,开始训练前,请用户根据实际路径配置data_path参数。 + - 单机单卡训练 -- 单机单卡训练 + 启动单卡训练。 -​ 启动单卡训练 + ``` + bash ./test/train_full_1P.sh --data_path=/data/xxx/ + ``` -``` -bash ./test/train_full_1p.sh --data_path=/home/zym/imagenet/ -``` + - 单机8卡训练 -- 单机 8卡训练 + 启动8卡训练。 -​ 启动8卡训练 + ``` + bash ./test/train_full_8P.sh --data_path=/data/xxx/ + ``` -``` -bash ./test/train_full_8p.sh --data_path=/home/zym/imagenet/ -``` + --data\_path参数填写数据集路径。 -训练完成后,权重文件保存在 参数设置 的路径下,并输出模型训练精度和性能信息 + 模型训练脚本参数说明如下。 -# 训练结果展示 + ``` + 公共参数: + --data_path //数据集路径 + --epochs //重复训练次数 + --batch-size //训练批次大小 + --nproc_per_node //数字表示启用单卡还是多卡 + ``` + + 训练完成后,权重文件保存在当前路径的save中,并输出模型训练精度和性能信息。 -**表2** 训练结果展示表 +# 训练结果展示 -| NAME | PT版本 | 精度 | FPS | Epochs | AMP_Type | -| ------ | ------ | ----- | ---- | ------ | -------- | -| 1P-GPU | 1.8.1 | - | 51 | 1 | O1 | -| 1P-NPU | 1.8.1 | - | 59 | 1 | O1 | -| 8P-GPU | 1.8.1 | 73.54 | 487 | 100 | O1 | -| 8P-NPU | 1.8.1 | 74.11 | 496 | 100 | O1 | +**表 2** 训练结果展示表 +| NAME | PT版本 | 精度 | FPS | Epochs | AMP_Type | +| ------ | ------ | ----: | ------ | -----: | -------- | +| 1P-GPU | 1.8.1 | - | 51 | 1 | O1 | +| 1P-NPU | 1.8.1 | - | 66.93 | 1 | O1 | +| 8P-GPU | 1.8.1 | 73.54 | 487 | 100 | O1 | +| 8P-NPU | 1.8.1 | 74.32 | 510.72 | 100 | O1 | # 版本说明 ## 变更 -2022.09.17:首次发布 - -2022.10.21: 新增teacher checkpoint网址,更新bash命令 - -2022.11.09:将NPU1P的fps更新为符合设备的59 - -# 已知问题 - +2022.11.09:首次发布。 +## 已知问题 +无。 \ No newline at end of file -- Gitee From 9d925f2ebd43c41c4ecd3e14a8b6034276ba75ab Mon Sep 17 00:00:00 2001 From: zhangyanmin <2716635239@qq.com> Date: Thu, 10 Nov 2022 05:34:16 +0000 Subject: [PATCH 3/3] update README --- PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md index 4b449cde2e..33492b0c86 100644 --- a/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md +++ b/PyTorch/contrib/cv/classification/Evo-Levit_256_384/README.md @@ -116,7 +116,7 @@ Evo-ViT的具体框架设计,包括基于全局class attention的token选择 Evo-Vit模型训练需要配置teacher—model,获取方式为在GitHub的[Evo-Vit]([GitHub - YifanXu74/Evo-ViT: Official implement of Evo-ViT: Slow-Fast Token Evolution for Dynamic Vision Transformer](https://github.com/YifanXu74/Evo-ViT)),checkpoint文件可以在该仓库自行下载,也可以直接使用网址进行下载,网址如下 https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth -预训练模型需要放置在模型文件夹下,与main_levit.py或者README处于同级目录下。与源码中的与配置参数的默认值 ”./regnety_160-a5fe301d.pth“保持一致。 +预训练模型需要放置在模型文件夹下,与main_levit.py或者README处于同级目录下。与源码中的配置参数的默认值 ”./regnety_160-a5fe301d.pth“保持一致。 # 开始训练 -- Gitee
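
---

`vis_batch` in visualize.py rescales the base learning rate linearly with the global batch size (`args.lr * args.batch_size * world_size / 512.0`); the training entry point presumably applies the same rule. A small worked example of that scaling, where the base LR, per-card batch size, and card count are chosen purely for illustration since their defaults are not shown in this patch:

```
def linear_scaled_lr(base_lr, batch_size, world_size):
    # Linear scaling rule: the effective LR grows with the global batch size,
    # normalized to a reference batch of 512 (as in vis_batch above).
    return base_lr * batch_size * world_size / 512.0

# Hypothetical 8-card run with a per-card batch of 256 and base LR 5e-4:
print(linear_scaled_lr(5e-4, 256, 8))  # -> 0.002
```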
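The finetune branch of `vis_batch` interpolates the checkpoint's position embeddings when the patch grid changes (for example when moving between the 256 and 384 input resolutions). Below is a minimal, self-contained sketch of that step for reference; the function name, the toy tensor sizes, and the assumption of 2 extra (cls/dist) tokens are illustrative only and are not taken from the model configs in this patch.

```
import torch

def interpolate_pos_embed(pos_embed, num_extra_tokens, new_size):
    # Bicubic resize of the patch position embeddings, keeping the extra
    # (cls/dist) tokens unchanged, mirroring the finetune branch above.
    embedding_dim = pos_embed.shape[-1]
    orig_size = int((pos_embed.shape[-2] - num_extra_tokens) ** 0.5)
    extra = pos_embed[:, :num_extra_tokens]
    patches = pos_embed[:, num_extra_tokens:]
    patches = patches.reshape(-1, orig_size, orig_size, embedding_dim).permute(0, 3, 1, 2)
    patches = torch.nn.functional.interpolate(
        patches, size=(new_size, new_size), mode='bicubic', align_corners=False)
    patches = patches.permute(0, 2, 3, 1).flatten(1, 2)
    return torch.cat((extra, patches), dim=1)

# Toy check: a 14x14 grid (196 patches) plus 2 extra tokens resized to a 24x24 grid.
toy = torch.randn(1, 2 + 14 * 14, 192)
print(interpolate_pos_embed(toy, num_extra_tokens=2, new_size=24).shape)  # torch.Size([1, 578, 192])
```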
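The results table in the updated README reports throughput as FPS. A rough sketch of how such a figure can be derived from step timings is shown below; it assumes FPS means images processed per second across all devices, and the step callable, batch size, and step count are placeholders — the authoritative numbers come from the test/train_performance_*P.sh scripts, which are not reproduced here.

```
import time

def measure_fps(step_fn, batch_size, world_size, steps=100):
    # Images processed per second across all devices over `steps` timed iterations.
    start = time.time()
    for _ in range(steps):
        step_fn()  # stand-in for one training step
    elapsed = time.time() - start
    return batch_size * world_size * steps / elapsed

# Toy illustration with a dummy 10 ms "step":
print('approx. FPS: {:.1f}'.format(measure_fps(lambda: time.sleep(0.01), batch_size=256, world_size=1)))
```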