From 7bf147d759a6ee7964b58bc0edd5d9390dcf0e15 Mon Sep 17 00:00:00 2001 From: zhang-zhibin-123 Date: Tue, 25 Apr 2023 09:44:38 +0800 Subject: [PATCH 1/8] my first commit --- .../cv/detection/YoloV2-640/.gitignore | 6 + .../contrib/cv/detection/YoloV2-640/README.md | 331 +++++++++++ .../cv/detection/YoloV2-640/benchmark.py | 139 +++++ .../contrib/cv/detection/YoloV2-640/coco.py | 209 +++++++ .../YoloV2-640/config/yolo_config.py | 227 ++++++++ .../cv/detection/YoloV2-640/data/__init__.py | 0 .../cv/detection/YoloV2-640/data/coco.py | 336 +++++++++++ .../YoloV2-640/data/scripts/COCO2017.sh | 20 + .../YoloV2-640/data/scripts/VOC2007.sh | 42 ++ .../YoloV2-640/data/scripts/VOC2012.sh | 38 ++ .../detection/YoloV2-640/data/transforms.py | 423 ++++++++++++++ .../cv/detection/YoloV2-640/data/voc.py | 342 +++++++++++ .../contrib/cv/detection/YoloV2-640/demo.py | 249 ++++++++ .../cv/detection/YoloV2-640/env_npu.sh | 79 +++ .../contrib/cv/detection/YoloV2-640/eval.py | 134 +++++ .../YoloV2-640/evaluator/cocoapi_evaluator.py | 135 +++++ .../YoloV2-640/evaluator/vocapi_evaluator.py | 347 +++++++++++ .../detection/YoloV2-640/models/__init__.py | 0 .../YoloV2-640/models/backbone/__init__.py | 84 +++ .../models/backbone/cspdarknet53.py | 296 ++++++++++ .../models/backbone/cspdarknet_tiny.py | 128 ++++ .../YoloV2-640/models/backbone/darknet.py | 102 ++++ .../YoloV2-640/models/backbone/resnet.py | 227 ++++++++ .../models/backbone/shufflenetv2.py | 194 +++++++ .../YoloV2-640/models/backbone/vit.py | 378 ++++++++++++ .../models/backbone/weights/README.md | 15 + .../models/backbone/yolox_backbone.py | 409 +++++++++++++ .../YoloV2-640/models/basic/__init__.py | 0 .../YoloV2-640/models/basic/bottleneck_csp.py | 30 + .../detection/YoloV2-640/models/basic/conv.py | 59 ++ .../YoloV2-640/models/basic/upsample.py | 20 + .../YoloV2-640/models/head/__init__.py | 0 .../YoloV2-640/models/head/coupled_head.py | 100 ++++ .../YoloV2-640/models/head/decoupled_head.py | 120 ++++ .../YoloV2-640/models/neck/__init__.py | 23 + .../YoloV2-640/models/neck/dilated_encoder.py | 39 ++ .../detection/YoloV2-640/models/neck/fpn.py | 120 ++++ .../detection/YoloV2-640/models/neck/spp.py | 95 +++ .../YoloV2-640/models/yolo/__init__.py | 92 +++ .../YoloV2-640/models/yolo/yolo_nano.py | 340 +++++++++++ .../YoloV2-640/models/yolo/yolo_tiny.py | 335 +++++++++++ .../YoloV2-640/models/yolo/yolov1.py | 260 +++++++++ .../YoloV2-640/models/yolo/yolov2.py | 271 +++++++++ .../YoloV2-640/models/yolo/yolov3.py | 327 +++++++++++ .../YoloV2-640/models/yolo/yolov4.py | 345 +++++++++++ .../cv/detection/YoloV2-640/requirements.txt | 19 + .../contrib/cv/detection/YoloV2-640/test.py | 233 ++++++++ .../cv/detection/YoloV2-640/train-1p.sh | 13 + .../cv/detection/YoloV2-640/train-8p.sh | 96 +++ .../cv/detection/YoloV2-640/train1p.py | 545 ++++++++++++++++++ .../cv/detection/YoloV2-640/train8p.py | 545 ++++++++++++++++++ .../cv/detection/YoloV2-640/train_yolonano.sh | 15 + .../cv/detection/YoloV2-640/train_yolov1.sh | 16 + .../cv/detection/YoloV2-640/train_yolov3.sh | 15 + .../detection/YoloV2-640/train_yolov3_de.sh | 15 + .../detection/YoloV2-640/train_yolov3_spp.sh | 15 + .../cv/detection/YoloV2-640/train_yolov4.sh | 19 + .../cv/detection/YoloV2-640/utils/__init__.py | 0 .../cv/detection/YoloV2-640/utils/box_ops.py | 101 ++++ .../YoloV2-640/utils/com_flops_params.py | 17 + .../YoloV2-640/utils/create_labels.py | 240 ++++++++ .../detection/YoloV2-640/utils/criterion.py | 192 ++++++ .../YoloV2-640/utils/distributed_utils.py | 77 +++ 
.../YoloV2-640/utils/fuse_conv_bn.py | 55 ++ .../YoloV2-640/utils/kmeans_anchor.py | 230 ++++++++ .../cv/detection/YoloV2-640/utils/misc.py | 149 +++++ .../cv/detection/YoloV2-640/utils/vis.py | 106 ++++ .../cv/detection/YoloV2-640/weights/README.md | 15 + 68 files changed, 10194 insertions(+) create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/.gitignore create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/README.md create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/benchmark.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/coco.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/config/yolo_config.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/coco.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/COCO2017.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2007.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2012.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/transforms.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/data/voc.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/demo.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/env_npu.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/eval.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/evaluator/cocoapi_evaluator.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/evaluator/vocapi_evaluator.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet53.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet_tiny.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/darknet.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/resnet.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/shufflenetv2.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/vit.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/weights/README.md create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/yolox_backbone.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/basic/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/basic/bottleneck_csp.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/basic/conv.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/basic/upsample.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/head/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/head/coupled_head.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/head/decoupled_head.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/neck/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/neck/dilated_encoder.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/neck/fpn.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/neck/spp.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/__init__.py create mode 100644 
PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_nano.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_tiny.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov1.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov2.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov3.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov4.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/requirements.txt create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/test.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train1p.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train8p.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolonano.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolov1.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_de.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_spp.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train_yolov4.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/__init__.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/box_ops.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/com_flops_params.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/create_labels.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/criterion.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/distributed_utils.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/fuse_conv_bn.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/kmeans_anchor.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/misc.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/utils/vis.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/weights/README.md diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/.gitignore b/PyTorch/contrib/cv/detection/YoloV2-640/.gitignore new file mode 100644 index 0000000000..00eaf9d2bd --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/.gitignore @@ -0,0 +1,6 @@ +*.pt +*.pth +*.pkl +__pycache__ +.vscode +det_results \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/README.md b/PyTorch/contrib/cv/detection/YoloV2-640/README.md new file mode 100644 index 0000000000..5cf9a9f4ad --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/README.md @@ -0,0 +1,331 @@ +# Update: 2022-05-31 +Recently, I have released an anchor-free YOLO: + +https://github.com/yjh0410/FreeYOLO + +# A new and strong YOLO family +Recently, I rebuild my YOLO-Family project !! 
+
+# Requirements
+- We recommend using Anaconda to create a conda environment:
+```Shell
+conda create -n yolo python=3.6
+```
+
+- Then, activate the environment:
+```Shell
+conda activate yolo
+```
+
+- Install the requirements:
+```Shell
+pip install -r requirements.txt
+```
+PyTorch >= 1.1.0 and Torchvision >= 0.3.0
+
+# Visualize positive samples
+You can run the following command to visualize positive samples:
+```Shell
+python train.py \
+        -d voc \
+        --root path/to/your/dataset \
+        -m yolov2 \
+        --batch_size 2 \
+        --vis_targets
+```
+
+# Coming soon
+A better YOLO family.
+
+
+# This project
+In this project, you can enjoy:
+- a new and stronger YOLOv1
+- a new and stronger YOLOv2
+- a stronger YOLOv3
+- a stronger YOLOv3 with SPP
+- a stronger YOLOv3 with DilatedEncoder
+- YOLOv4 (I'm trying to make it better)
+- YOLO-Tiny
+- YOLO-Nano
+
+
+# Future work
+- Try to make my YOLOv4 better.
+- Train my YOLOv1/YOLOv2 with ViT-Base (pretrained with Masked Autoencoder, MAE)
+
+# Weights
+You can download all weights, including my DarkNet-53, CSPDarkNet-53, MAE-ViT and YOLO weights, from the following links.
+
+## Backbone
+My Backbone:
+- DarkNet53: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/darknet53.pth
+- CSPDarkNet-53: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/cspdarknet53.pth
+- CSPDarkNet-Tiny: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/cspdarknet_tiny.pth
+
+YOLOX-Backbone:
+- CSPDarkNet-S: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_s.pth
+- CSPDarkNet-M: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_m.pth
+- CSPDarkNet-L: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_l.pth
+- CSPDarkNet-X: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_x.pth
+- CSPDarkNet-Tiny: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_tiny.pth
+- CSPDarkNet-Nano: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_nano.pth
+
+## YOLO
+- YOLOv1: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov1_35.22_54.7.pth
+- YOLOv2: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov2_36.4_56.6.pth
+- YOLOv3: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_36.9_59.0.pth
+- YOLOv3-SPP: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_spp_38.2_60.1.pth
+- YOLOv3-DE: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_de_38.7_60.2.pth
+- YOLOv4: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov4_exp_43.0_63.4.pth
+- YOLO-Tiny: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolo_tiny_28.8_48.6.pth
+- YOLO-Nano: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolo_nano_22.4_40.7.pth
+
+
+# Experiments
+## Tricks
+Tricks in this project:
+- [x] Augmentations: Flip + Color jitter + RandomCrop
+- [x] Model EMA
+- [x] Mosaic Augmentation
+- [x] Multi-scale training
+- [ ] Gradient accumulation
+- [ ] MixUp Augmentation
+- [ ] Cosine annealing learning rate schedule
+- [ ] AdamW
+- [ ] Scale loss by number of positive samples
+
+
+# Experiments
+All experiment results are evaluated on COCO val.
+All FPS results except YOLO-Nano's are measured on a 2080ti GPU.
+We will measure the speed of YOLO-Nano on a CPU.
+
+## YOLOv1
+
+| Model      | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv1-320 | 151 | 25.4 | 41.5 | 26.0 | 4.2  | 25.0 | 49.8 | 10.49  | 44.54M |
+| YOLOv1-416 | 128 | 30.1 | 47.8 | 30.9 | 7.8  | 31.9 | 53.3 | 17.73  | 44.54M |
+| YOLOv1-512 | 114 | 33.1 | 52.2 | 34.0 | 10.8 | 35.9 | 54.9 | 26.85  | 44.54M |
+| YOLOv1-640 | 75  | 35.2 | 54.7 | 37.1 | 14.3 | 39.5 | 53.4 | 41.96  | 44.54M |
+| YOLOv1-800 | -   | -    | -    | -    | -    | -    | -    | 65.56  | 44.54M |
+
+## YOLOv2
+
+| Model      | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv2-320 | 147 | 26.8 | 44.1 | 27.1 | 4.7  | 27.6 | 50.8 | 10.53  | 44.89M |
+| YOLOv2-416 | 123 | 31.6 | 50.3 | 32.4 | 9.1  | 33.8 | 54.0 | 17.79  | 44.89M |
+| YOLOv2-512 | 108 | 34.3 | 54.0 | 35.4 | 12.3 | 37.8 | 55.2 | 26.94  | 44.89M |
+| YOLOv2-640 | 73  | 36.3 | 56.6 | 37.7 | 15.1 | 41.1 | 54.0 | 42.10  | 44.89M |
+| YOLOv2-800 | -   | -    | -    | -    | -    | -    | -    | 65.78  | 44.89M |
+
+## YOLOv3
+
+| Model      | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv3-320 | 111 | 30.8 | 50.3 | 31.8 | 10.0 | 33.1 | 50.0 | 19.57  | 61.97M |
+| YOLOv3-416 | 89  | 34.8 | 55.8 | 36.1 | 14.6 | 37.5 | 52.9 | 33.08  | 61.97M |
+| YOLOv3-512 | 77  | 36.9 | 58.1 | 39.3 | 18.0 | 40.3 | 52.2 | 50.11  | 61.97M |
+| YOLOv3-608 | 51  | 37.0 | 58.9 | 39.3 | 20.5 | 41.2 | 49.0 | 70.66  | 61.97M |
+| YOLOv3-640 | 49  | 36.9 | 59.0 | 39.7 | 21.6 | 41.6 | 47.7 | 78.30  | 61.97M |
+
+## YOLOv3 with SPP
+
+| Model          | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|----------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv3-SPP-320 | 110 | 31.0 | 50.8 | 32.0 | 10.5 | 33.0 | 50.4 | 19.68  | 63.02M |
+| YOLOv3-SPP-416 | 88  | 35.0 | 56.1 | 36.4 | 14.9 | 37.7 | 52.8 | 33.26  | 63.02M |
+| YOLOv3-SPP-512 | 75  | 37.2 | 58.7 | 39.1 | 19.1 | 40.0 | 53.0 | 50.38  | 63.02M |
+| YOLOv3-SPP-608 | 50  | 38.3 | 60.1 | 40.7 | 20.9 | 41.1 | 51.2 | 71.04  | 63.02M |
+| YOLOv3-SPP-640 | 48  | 38.2 | 60.1 | 40.4 | 21.6 | 41.1 | 50.5 | 78.72  | 63.02M |
+
+## YOLOv3 with Dilated Encoder
+The DilatedEncoder was proposed in YOLOF.
+
+| Model         | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|---------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv3-DE-320 | 109 | 31.1 | 51.1 | 31.7 | 10.2 | 32.6 | 51.2 | 19.10  | 57.25M |
+| YOLOv3-DE-416 | 88  | 35.0 | 56.1 | 36.3 | 14.6 | 37.4 | 53.7 | 32.28  | 57.25M |
+| YOLOv3-DE-512 | 74  | 37.7 | 59.3 | 39.6 | 17.9 | 40.4 | 54.4 | 48.90  | 57.25M |
+| YOLOv3-DE-608 | 50  | 38.7 | 60.5 | 40.8 | 20.6 | 41.7 | 53.1 | 68.96  | 57.25M |
+| YOLOv3-DE-640 | 48  | 38.7 | 60.2 | 40.7 | 21.3 | 41.7 | 51.7 | 76.41  | 57.25M |
+
+## YOLOv4
+I'm still trying to make it better.
+
+| Model      | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLOv4-320 | 89  | 39.2 | 58.6 | 40.9 | 16.9 | 44.1 | 59.2 | 16.38  | 58.14M |
+| YOLOv4-416 | 84  | 41.7 | 61.6 | 44.2 | 22.0 | 46.6 | 57.7 | 27.69  | 58.14M |
+| YOLOv4-512 | 70  | 42.9 | 63.1 | 46.1 | 24.5 | 48.3 | 56.5 | 41.94  | 58.14M |
+| YOLOv4-608 | 51  | 43.0 | 63.4 | 46.1 | 26.7 | 48.6 | 53.9 | 59.14  | 58.14M |
+
+## YOLO-Tiny
+
+| Model         | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|---------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLO-Tiny-320 | 143 | 26.4 | 44.5 | 26.8 | 8.8  | 28.2 | 42.4 | 2.17   | 7.66M  |
+| YOLO-Tiny-416 | 130 | 28.2 | 47.6 | 28.8 | 11.6 | 31.5 | 41.4 | 3.67   | 7.82M  |
+| YOLO-Tiny-512 | 118 | 28.8 | 48.6 | 29.4 | 13.3 | 33.4 | 38.3 | 5.57   | 7.82M  |
+
+## YOLO-Nano
+The FPS is measured on an i5-1135G7 CPU, without any deployment-side acceleration to speed up detection.
+
+| Model         | FPS | AP   | AP50 | AP75 | APs  | APm  | APl  | GFLOPs | Params |
+|---------------|-----|------|------|------|------|------|------|--------|--------|
+| YOLO-Nano-320 | 25  | 18.4 | 33.7 | 17.8 | 3.9  | 17.5 | 33.1 | 0.64   | 1.86M  |
+| YOLO-Nano-416 | 15  | 21.4 | 38.5 | 20.9 | 6.5  | 21.4 | 34.8 | 0.99   | 1.86M  |
+| YOLO-Nano-512 | 10  | 22.4 | 40.7 | 22.1 | 8.0  | 24.0 | 33.2 | 1.65   | 1.86M  |
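+
+For reference, FPS, GFLOPs and Params can also be measured with the bundled `benchmark.py`. This is a minimal sketch: the flags come from the script's own argparse options, and the weight/dataset paths are placeholders.
+
+```Shell
+python benchmark.py --cuda \
+        -m yolov2 \
+        --weight path/to/weight \
+        --img_size 640 \
+        --root path/to/dataset/
+```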
+
+
+# Dataset
+
+## VOC Dataset
+### My BaiduYunDisk
+- BaiduYunDisk: https://pan.baidu.com/s/1tYPGCYGyC0wjpC97H-zzMQ Password: 4la9
+
+### Download VOC2007 trainval & test
+
+```Shell
+# specify a directory for the dataset to be downloaded into, else the default is ~/data/
+sh data/scripts/VOC2007.sh # <directory>
+```
+
+### Download VOC2012 trainval
+```Shell
+# specify a directory for the dataset to be downloaded into, else the default is ~/data/
+sh data/scripts/VOC2012.sh # <directory>
+```
+
+## MSCOCO Dataset
+### My BaiduYunDisk
+- BaiduYunDisk: https://pan.baidu.com/s/1xAPk8fnaWMMov1VEjr8-zA Password: 6vhp
+
+On Ubuntu, you may need the command `jar xvf xxx.zip` to extract `train2017.zip` and `test2017.zip`,
+since they are larger than 2 GB and (as far as I know) `unzip` cannot handle zip files of that size.
+
+### Download MSCOCO 2017 dataset
+Just run ```sh data/scripts/COCO2017.sh```. You will get COCO train2017, val2017 and test2017.
+
+
+# Train
+For example:
+
+```Shell
+python train.py --cuda \
+        -d coco \
+        -m yolov2 \
+        -ms \
+        --ema \
+        --batch_size 16 \
+        --root path/to/dataset/
+```
+
+You can run ```python train.py -h``` to check all optional arguments, or just run one of the shell scripts, for example:
+```Shell
+sh train_yolov1.sh
+```
+
+If you have multiple GPUs, e.g. 8, and put 4 images on each GPU:
+```Shell
+python -m torch.distributed.launch --nproc_per_node=8 train.py -d coco \
+        --cuda \
+        -m yolov1 \
+        -ms \
+        --ema \
+        -dist \
+        --sybn \
+        --num_gpu 8 \
+        --batch_size 4 \
+        --root path/to/dataset/
+```
+Note that `--batch_size` is the batch size per GPU, not the total across all GPUs.
+
+I have uploaded all training log files. For example, `1-v1.txt` contains the full output of training YOLOv1.
+
+It is strongly recommended that you open the training shell files to check how each YOLO detector is trained.
+
+# Test
+For example:
+
+```Shell
+python test.py -d coco \
+        --cuda \
+        -m yolov2 \
+        --weight path/to/weight \
+        --img_size 640 \
+        --root path/to/dataset/ \
+        --show
+```
+
+# Evaluation
+For example:
+
+```Shell
+python eval.py -d coco-val \
+        --cuda \
+        -m yolov1 \
+        --weight path/to/weight \
+        --img_size 640 \
+        --root path/to/dataset/
+```
+
+# Evaluation on COCO-test-dev
+To run on COCO test-dev (make sure you have downloaded test2017):
+```Shell
+python eval.py -d coco-test \
+        --cuda \
+        -m yolov1 \
+        --weight path/to/weight \
+        --img_size 640 \
+        --root path/to/dataset/
+```
+You will get a `coco_test-dev.json` file.
+Then you should follow the official requirements to compress it into zip format
+and upload it to the official evaluation server.
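+
+A minimal packaging sketch, assuming the `coco_test-dev.json` produced above. The exact result-file naming required by the evaluation server should be checked against the official COCO upload guidelines, so the names below are only placeholders:
+
+```Shell
+# rename the result file to whatever pattern the evaluation server expects (placeholder name)
+cp coco_test-dev.json detections_test-dev2017_yolo_results.json
+zip yolo_test-dev2017.zip detections_test-dev2017_yolo_results.json
+```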
diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/benchmark.py b/PyTorch/contrib/cv/detection/YoloV2-640/benchmark.py new file mode 100644 index 0000000000..0d6c755e66 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/benchmark.py @@ -0,0 +1,139 @@ +import argparse +import numpy as np +import time +import os +import torch + +from config.yolo_config import yolo_config +from data.transforms import ValTransforms +from data.coco import COCODataset, coco_class_index, coco_class_labels +from utils.com_flops_params import FLOPs_and_Params +from utils import fuse_conv_bn + +from models.yolo import build_model +import torch_npu + + +parser = argparse.ArgumentParser(description='Benchmark') +# Model +parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') +parser.add_argument('--fuse_conv_bn', action='store_true', default=False, + help='fuse conv and bn') +parser.add_argument('--conf_thresh', default=0.1, type=float, + help='confidence threshold') +parser.add_argument('--nms_thresh', default=0.45, type=float, + help='NMS threshold') +parser.add_argument('--center_sample', action='store_true', default=False, + help='center sample trick.') +# data root +parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') +# basic +parser.add_argument('-size', '--img_size', default=640, type=int or list, + help='img_size') +parser.add_argument('--weight', default=None, + type=str, help='Trained state_dict file path to open') +# cuda +parser.add_argument('--cuda', action='store_true', default=False, + help='use cuda.') + +args = parser.parse_args() + + +def test(net, device, img_size, testset, transform): + # Step-1: Compute FLOPs and Params + FLOPs_and_Params(net, img_size) + + # Step-2: Compute FPS + num_images = 2002 + total_time = 0 + count = 0 + with torch.no_grad(): + for index in range(num_images): + if index % 500 == 0: + print('Testing image {:d}/{:d}....'.format(index+1, num_images)) + image, _ = testset.pull_image(index) + + h, w, _ = image.shape + size = np.array([[w, h, w, h]]) + + # prepare + x, _, _, scale, offset = transform(image) + x = x.unsqueeze(0).to(device) + + # star time + torch_npu.npu.synchronize() + start_time = time.perf_counter() + + # inference + bboxes, scores, cls_inds = net(x) + + # rescale + bboxes -= offset + bboxes /= scale + bboxes *= size + + # end time + torch_npu.npu.synchronize() + elapsed = time.perf_counter() - start_time + + # print("detection time used ", elapsed, "s") + if index > 1: + total_time += elapsed + count += 1 + + print('- FPS :', 1.0 / (total_time / count)) + + + +if __name__ == '__main__': + # get device + if args.cuda: + print('use cuda') + device = torch.device("npu") + else: + device = torch.device("cpu") + + # dataset + print('test on coco-val ...') + data_dir = os.path.join(args.root, 'COCO') + class_names = coco_class_labels + class_indexs = coco_class_index + num_classes = 80 + dataset = COCODataset( + data_dir=data_dir, + image_set='val2017', + img_size=args.img_size) + + # YOLO Config + cfg = yolo_config[args.model] + # build model + model = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=False) + + # load weight + if args.weight: + model.load_state_dict(torch.load(args.weight, map_location='cpu'), strict=False) + print('Finished loading model!') + else: + print('The path to weight file is None !') + exit(0) + model = model.to(device).eval() + + # fuse conv bn + if 
args.fuse_conv_bn: + print('fuse conv and bn ...') + model = fuse_conv_bn(model) + + # run + test(net=model, + img_size=args.img_size, + device=device, + testset=dataset, + transform=ValTransforms(args.img_size) + ) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/coco.py b/PyTorch/contrib/cv/detection/YoloV2-640/coco.py new file mode 100644 index 0000000000..3260ebe1e3 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/coco.py @@ -0,0 +1,209 @@ +# +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ +# +""" +@author: Wenbo Li +@contact: fenglinglwb@gmail.com +""" + +import cv2 +import json +import numpy as np +import os + +from dataset.JointsDataset import JointsDataset +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + + +class COCODataset(JointsDataset): + + def __init__(self, DATASET, stage, transform=None): + super().__init__(DATASET, stage, transform) + self.cur_dir = os.path.split(os.path.realpath(__file__))[0] + + self.train_gt_file = 'train_val_minus_minival_2014.json' + self.train_gt_path = os.path.join(self.cur_dir, 'gt_json', + self.train_gt_file) + + self.val_gt_file = 'minival_2014.json' + self.val_gt_path = os.path.join(self.cur_dir, 'gt_json', + self.val_gt_file) + self.val_det_file = 'minival_2014_det.json' + self.val_det_path = os.path.join(self.cur_dir, 'det_json', + self.val_det_file) + + self.test_det_file = '' + self.test_det_path = os.path.join(self.cur_dir, 'det_json', + self.test_det_file) + + self._exception_ids = ['366379'] + + self.data = self._get_data() + self.data_num = len(self.data) + + def _get_data(self): + data = list() + + if self.stage == 'train': + coco = COCO(self.train_gt_path) + elif self.stage == 'val': + coco = COCO(self.val_gt_path) + self.val_gt = coco + else: + pass + + if self.stage == 'train': + for aid, ann in coco.anns.items(): + img_id = ann['image_id'] + if img_id not in coco.imgs \ + or img_id in self._exception_ids: + continue + + if ann['iscrowd']: + continue + + img_name = coco.imgs[img_id]['file_name'] + prefix = 'val2014' if 'val' in img_name else 'train2014' + img_path = os.path.join(self.cur_dir, 'images', prefix, + img_name) + + bbox = np.array(ann['bbox']) + area = ann['area'] + joints = np.array(ann['keypoints']).reshape((-1, 3)) + headRect = np.array([0, 0, 1, 1], np.int32) + + center, scale = self._bbox_to_center_and_scale(bbox) + + if np.sum(joints[:, -1] > 0) < self.kp_load_min_num or \ + ann['num_keypoints'] == 0: + continue + + d = dict(aid=aid, + area=area, + bbox=bbox, + center=center, + headRect=headRect, + img_id=img_id, + img_name=img_name, + img_path=img_path, + joints=joints, + scale=scale) + + data.append(d) + + else: + if self.stage == 'val': + det_path = self.val_det_path + else: + det_path = self.test_det_path + dets = json.load(open(det_path)) + + for det in dets: + if det['image_id'] not in coco.imgs or det['category_id'] != 1: + continue + + img_id = det['image_id'] + img_name = 'COCO_val2014_000000%06d.jpg' % img_id + img_path = os.path.join(self.cur_dir, 'images', 'val2014', + img_name) + + bbox = np.array(det['bbox']) + center, scale = self._bbox_to_center_and_scale(bbox) + joints = np.zeros((self.keypoint_num, 3)) + score = det['score'] + headRect = np.array([0, 0, 1, 1], np.int32) + + d = dict(bbox=bbox, + center=center, + headRect=headRect, + img_id=img_id, + img_name=img_name, + img_path=img_path, + joints=joints, + scale=scale, + score=score) + + data.append(d) + + return data + + def _bbox_to_center_and_scale(self, bbox): + x, y, w, h = bbox + + center = np.zeros(2, dtype=np.float32) + center[0] = x + w / 2.0 + center[1] = y + h / 2.0 + + scale = np.array([w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], + dtype=np.float32) + + return center, scale + + def evaluate(self, pred_path): + pred = self.val_gt.loadRes(pred_path) + coco_eval = COCOeval(self.val_gt, pred, iouType='keypoints') + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + def visualize(self, img, 
joints, score=None): + pairs = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], + [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], + [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]] + color = np.random.randint(0, 256, (self.keypoint_num, 3)).tolist() + + for i in range(self.keypoint_num): + if joints[i, 0] > 0 and joints[i, 1] > 0: + cv2.circle(img, tuple(joints[i, :2]), 2, tuple(color[i]), 2) + if score: + cv2.putText(img, score, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.2, + (128, 255, 0), 2) + + def draw_line(img, p1, p2): + c = (0, 0, 255) + if p1[0] > 0 and p1[1] > 0 and p2[0] > 0 and p2[1] > 0: + cv2.line(img, tuple(p1), tuple(p2), c, 2) + + for pair in pairs: + draw_line(img, joints[pair[0] - 1], joints[pair[1] - 1]) + + return img + + +if __name__ == '__main__': + from dataset.attribute import load_dataset + + dataset = load_dataset('COCO') + coco = COCODataset(dataset, 'val') + print(coco.data_num) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/config/yolo_config.py b/PyTorch/contrib/cv/detection/YoloV2-640/config/yolo_config.py new file mode 100644 index 0000000000..5af1106830 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/config/yolo_config.py @@ -0,0 +1,227 @@ +# YOLO config + + +yolo_config = { + 'yolov1': { + # backbone + 'backbone': 'r50', + # neck + 'neck': 'dilated_encoder', + # anchor size + 'anchor_size': None + }, + 'yolov2': { + # backbone + 'backbone': 'r50', + # neck + 'neck': 'dilated_encoder', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + }, + 'yolov3': { + # backbone + 'backbone': 'd53', + # neck + 'neck': 'conv_blocks', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + }, + 'yolov3_spp': { + # backbone + 'backbone': 'd53', + # neck + 'neck': 'spp', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + }, + 'yolov3_de': { + # backbone + 'backbone': 'd53', + # neck + 'neck': 'dilated_encoder', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + }, + 'yolov4': { + # backbone + 'backbone': 'cspd53', + # neck + 'neck': 'spp', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, qfl + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_s': { + # backbone + 'backbone': 'csp_s', + 'width': 0.5, + 'depth': 0.33, + 'depthwise': False, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_m': { + # backbone + 'backbone': 'csp_m', + 'width': 0.75, + 'depth': 0.67, + 'depthwise': False, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_l': { + # backbone + 'backbone': 
'csp_l', + 'width': 1.0, + 'depth': 1.0, + 'depthwise': False, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_x': { + # backbone + 'backbone': 'csp_x', + 'width': 1.25, + 'depth': 1.33, + 'depthwise': False, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_t': { + # backbone + 'backbone': 'csp_t', + 'width': 0.375, + 'depth': 0.33, + 'depthwise': False, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolov5_n': { + # backbone + 'backbone': 'csp_n', + 'width': 0.25, + 'depth': 0.33, + 'depthwise': True, + 'freeze': False, + # neck + 'neck': 'yolopafpn', + # head + 'head_dim': 256, + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, bce + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolo_tiny': { + # backbone + 'backbone': 'cspd_tiny', + # neck + 'neck': 'spp-csp', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + }, + 'yolo_nano': { + # backbone + 'backbone': 'sfnet_v2', + # neck + 'neck': 'spp-dw', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, qfl + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolo_nano_plus': { + # backbone + 'backbone': 'csp_n', + 'depthwise': True, + # neck + 'neck': 'yolopafpn', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]], + # loss + 'loss_obj': 'mse', # optional: mse, qfl + 'loss_box': 'giou' # optional: iou, giou, ciou + }, + 'yolotr': { + # backbone + 'backbone': 'vit_b', + # neck + 'neck': 'dilated_encoder', + # anchor size: P5-640 + 'anchor_size': [[10, 13], [16, 30], [33, 23], + [30, 61], [62, 45], [59, 119], + [116, 90], [156, 198], [373, 326]] + } +} \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/data/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/coco.py b/PyTorch/contrib/cv/detection/YoloV2-640/data/coco.py new file mode 100644 index 0000000000..5648bc821e --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/coco.py @@ -0,0 +1,336 @@ +import os +import numpy as np +import random + +from torch.utils.data import Dataset +import cv2 +import torch_npu + +try: + from pycocotools.coco import COCO +except: + print("It seems that the COCOAPI is not installed.") + + 
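+# COCO category ids range from 1 to 90 with gaps; coco_class_labels lists every name
+# (plus 'background'), and coco_class_index maps the 80 classes predicted by the model
+# back to their original category ids.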
+ +coco_class_labels = ('background', + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', + 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', + 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', + 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', + 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', + 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', + 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', + 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', + 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + +coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, + 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] + + +class COCODataset(Dataset): + """ + COCO dataset class. + """ + def __init__(self, + data_dir=None, + image_set='train2017', + img_size=640, + transform=None, + color_augment=None, + mosaic=False, + mixup=False): + """ + COCO dataset initialization. Annotation data are read into memory by COCO API. + Args: + data_dir (str): dataset root directory + json_file (str): COCO json file name + name (str): COCO data name (e.g. 
'train2017' or 'val2017') + img_size (int): target image size after pre-processing + debug (bool): if True, only one data id is selected from the dataset + """ + if image_set == 'train2017': + self.json_file='instances_train2017.json' + elif image_set == 'val2017': + self.json_file='instances_val2017.json' + elif image_set == 'test2017': + self.json_file='image_info_test-dev2017.json' + self.image_set = image_set + self.data_dir = data_dir + + self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file)) + self.ids = self.coco.getImgIds() + self.img_size = img_size + self.class_ids = sorted(self.coco.getCatIds()) + # augmentation + self.transform = transform + self.mosaic = mosaic + self.mixup = mixup + self.color_augment = color_augment + if self.mosaic: + print('use Mosaic Augmentation ...') + if self.mixup: + print('use MixUp Augmentation ...') + + def __len__(self): + return len(self.ids) + + + def __getitem__(self, index): + im, gt, h, w, scale, offset = self.pull_item(index) + return im, gt + + + def load_img_targets(self, index): + anno_ids = self.coco.getAnnIds(imgIds=[int(index)], iscrowd=None) + annotations = self.coco.loadAnns(anno_ids) + + # load an image + img_file = os.path.join(self.data_dir, self.image_set, + '{:012}'.format(index) + '.jpg') + img = cv2.imread(img_file) + + if self.json_file == 'instances_val5k.json' and img is None: + img_file = os.path.join(self.data_dir, 'train2017', + '{:012}'.format(index) + '.jpg') + img = cv2.imread(img_file) + + assert img is not None + + height, width, channels = img.shape + + #load a target + target = [] + for anno in annotations: + if 'bbox' in anno and anno['area'] > 0: + xmin = np.max((0, anno['bbox'][0])) + ymin = np.max((0, anno['bbox'][1])) + xmax = np.min((width - 1, xmin + np.max((0, anno['bbox'][2] - 1)))) + ymax = np.min((height - 1, ymin + np.max((0, anno['bbox'][3] - 1)))) + if xmax > xmin and ymax > ymin: + label_ind = anno['category_id'] + cls_id = self.class_ids.index(label_ind) + xmin /= width + ymin /= height + xmax /= width + ymax /= height + + target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] + else: + print('No bbox !!!') + + return img, target, height, width + + + def load_mosaic(self, index): + ids_list_ = self.ids[:index] + self.ids[index+1:] + # random sample other indexs + id1 = self.ids[index] + id2, id3, id4 = random.sample(ids_list_, 3) + ids = [id1, id2, id3, id4] + + img_lists = [] + tg_lists = [] + # load image and target + for id_ in ids: + img_i, target_i, _, _ = self.load_img_targets(id_) + img_lists.append(img_i) + tg_lists.append(target_i) + + mean = np.array([v*255 for v in self.transform.mean]) + mosaic_img = np.ones([self.img_size*2, self.img_size*2, img_i.shape[2]], dtype=np.uint8) * mean + # mosaic center + yc, xc = [int(random.uniform(-x, 2*self.img_size + x)) for x in [-self.img_size // 2, -self.img_size // 2]] + # yc = xc = self.img_size + + mosaic_tg = [] + for i in range(4): + img_i, target_i = img_lists[i], tg_lists[i] + target_i = np.array(target_i) + h0, w0, _ = img_i.shape + + # resize + scale_range = np.arange(50, 210, 10) + s = np.random.choice(scale_range) / 100. 
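+            # randomly rescale each tile to 0.5x-2.0x before it is placed on the 2x-size mosaic canvas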
+ + if np.random.randint(2): + # keep aspect ratio + r = self.img_size / max(h0, w0) + if r != 1: + img_i = cv2.resize(img_i, (int(w0 * r * s), int(h0 * r * s))) + else: + img_i = cv2.resize(img_i, (int(self.img_size * s), int(self.img_size * s))) + h, w, _ = img_i.shape + + # place img in img4 + if i == 0: # top left + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, self.img_size * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(self.img_size * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, self.img_size * 2), min(self.img_size * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b] + padw = x1a - x1b + padh = y1a - y1b + + # labels + target_i_ = target_i.copy() + if len(target_i) > 0: + # a valid target, and modify it. + target_i_[:, 0] = (w * (target_i[:, 0]) + padw) + target_i_[:, 1] = (h * (target_i[:, 1]) + padh) + target_i_[:, 2] = (w * (target_i[:, 2]) + padw) + target_i_[:, 3] = (h * (target_i[:, 3]) + padh) + # check boxes + valid_tgt = [] + for tgt in target_i_: + x1, y1, x2, y2, label = tgt + bw, bh = x2 - x1, y2 - y1 + if bw > 5. and bh > 5.: + valid_tgt.append([x1, y1, x2, y2, label]) + if len(valid_tgt) == 0: + valid_tgt.append([0., 0., 0., 0., 0.]) + + mosaic_tg.append(target_i_) + # check target + if len(mosaic_tg) == 0: + mosaic_tg = np.zeros([1, 5]) + else: + mosaic_tg = np.concatenate(mosaic_tg, axis=0) + # Cutout/Clip targets + np.clip(mosaic_tg[:, :4], 0, 2 * self.img_size, out=mosaic_tg[:, :4]) + # normalize + mosaic_tg[:, :4] /= (self.img_size * 2) + + return mosaic_img, mosaic_tg, self.img_size, self.img_size + + + def pull_item(self, index): + # load a mosaic image + if self.mosaic and np.random.randint(2): + # mosaic + img, target, height, width = self.load_mosaic(index) + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if self.mixup and np.random.randint(2): + img2, target2, height, width = self.load_mosaic(np.random.randint(0, len(self.ids))) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + target = np.concatenate((target, target2), 0) + + # augment + img, boxes, labels, scale, offset = self.color_augment(img, target[:, :4], target[:, 4]) + + # load an image and target + else: + id_ = self.ids[index] + img, target, height, width = self.load_img_targets(id_) + if len(target) == 0: + target = np.zeros([1, 5]) + else: + target = np.array(target) + # augment + img, boxes, labels, scale, offset = self.transform(img, target[:, :4], target[:, 4]) + + target = np.hstack((boxes, np.expand_dims(labels, axis=1))) + + return img, target, height, width, scale, offset + + + def pull_image(self, index): + id_ = self.ids[index] + img_file = os.path.join(self.data_dir, self.image_set, + '{:012}'.format(id_) + '.jpg') + img = cv2.imread(img_file) + + if self.json_file == 'instances_val5k.json' and img is None: + img_file = os.path.join(self.data_dir, 'train2017', + '{:012}'.format(id_) + '.jpg') + img = cv2.imread(img_file) + + return img, id_ + + + def pull_anno(self, index): + id_ = self.ids[index] + 
+ anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) + annotations = self.coco.loadAnns(anno_ids) + + target = [] + for anno in annotations: + if 'bbox' in anno: + xmin = np.max((0, anno['bbox'][0])) + ymin = np.max((0, anno['bbox'][1])) + xmax = xmin + anno['bbox'][2] + ymax = ymin + anno['bbox'][3] + + if anno['area'] > 0 and xmax >= xmin and ymax >= ymin: + label_ind = anno['category_id'] + cls_id = self.class_ids.index(label_ind) + + target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] + else: + print('No bbox !!') + return target + + +if __name__ == "__main__": + from transforms import TrainTransforms, ColorTransforms, ValTransforms + + mean=(0.406, 0.456, 0.485) + std=(0.225, 0.224, 0.229) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + + img_size = 640 + dataset = COCODataset( + data_dir='/mnt/share/ssd2/dataset/COCO/', + img_size=img_size, + image_set='train2017', + transform=TrainTransforms(img_size), + color_augment=ColorTransforms(img_size), + mosaic=True, + mixup=True) + + np.random.seed(0) + class_colors = [(np.random.randint(255), + np.random.randint(255), + np.random.randint(255)) for _ in range(80)] + print('Data length: ', len(dataset)) + for i in range(1000): + image, target, _, _, _, _ = dataset.pull_item(i) + image = image.permute(1, 2, 0).numpy()[:, :, (2, 1, 0)] + image = ((image * std + mean)*255).astype(np.uint8) + image = image.copy() + + for box in target: + x1, y1, x2, y2, cls_id = box + cls_id = int(cls_id.item()) + color = class_colors[cls_id] + # class name + label = coco_class_labels[coco_class_index[cls_id]] + # bbox + x1 *= img_size + y1 *= img_size + x2 *= img_size + y2 *= img_size + image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2) + # put the test on the bbox + cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA) + cv2.imshow('gt', image) + # cv2.imwrite(str(i)+'.jpg', img) + cv2.waitKey(0) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/COCO2017.sh b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/COCO2017.sh new file mode 100644 index 0000000000..6adddfcb36 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/COCO2017.sh @@ -0,0 +1,20 @@ +mkdir COCO +cd COCO + +wget http://images.cocodataset.org/zips/train2017.zip +wget http://images.cocodataset.org/zips/val2017.zip +wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip +wget http://images.cocodataset.org/zips/test2017.zip +wget http://images.cocodataset.org/annotations/image_info_test2017.zip  + +unzip train2017.zip +unzip val2017.zip +unzip annotations_trainval2017.zip +unzip test2017.zip +unzip image_info_test2017.zip + +# rm -f train2017.zip +# rm -f val2017.zip +# rm -f annotations_trainval2017.zip +# rm -f test2017.zip +# rm -f image_info_test2017.zip diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2007.sh b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2007.sh new file mode 100644 index 0000000000..9d53c8e990 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2007.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Ellis Brown + +start=`date +%s` + +# handle optional download dir +if [ -z "$1" ] + then + # navigate to ~/data + echo "navigating to ~/data/ ..." + mkdir -p ~/data + cd ~/data/ + else + # check if is valid directory + if [ ! -d $1 ]; then + echo $1 "is not a valid directory" + exit 0 + fi + echo "navigating to" $1 "..." 
+ cd $1 +fi + +echo "Downloading VOC2007 trainval ..." +# Download the data. +curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar +echo "Downloading VOC2007 test data ..." +curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar +echo "Done downloading." + +# Extract data +echo "Extracting trainval ..." +tar -xvf VOCtrainval_06-Nov-2007.tar +echo "Extracting test ..." +tar -xvf VOCtest_06-Nov-2007.tar +echo "removing tars ..." +rm VOCtrainval_06-Nov-2007.tar +rm VOCtest_06-Nov-2007.tar + +end=`date +%s` +runtime=$((end-start)) + +echo "Completed in" $runtime "seconds" \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2012.sh b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2012.sh new file mode 100644 index 0000000000..0a00f38969 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/scripts/VOC2012.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Ellis Brown + +start=`date +%s` + +# handle optional download dir +if [ -z "$1" ] + then + # navigate to ~/data + echo "navigating to ~/data/ ..." + mkdir -p ~/data + cd ~/data/ + else + # check if is valid directory + if [ ! -d $1 ]; then + echo $1 "is not a valid directory" + exit 0 + fi + echo "navigating to" $1 "..." + cd $1 +fi + +echo "Downloading VOC2012 trainval ..." +# Download the data. +curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar +echo "Done downloading." + + +# Extract data +echo "Extracting trainval ..." +tar -xvf VOCtrainval_11-May-2012.tar +echo "removing tar ..." +rm VOCtrainval_11-May-2012.tar + +end=`date +%s` +runtime=$((end-start)) + +echo "Completed in" $runtime "seconds" \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/transforms.py b/PyTorch/contrib/cv/detection/YoloV2-640/data/transforms.py new file mode 100644 index 0000000000..09e2bd2c9f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/transforms.py @@ -0,0 +1,423 @@ +import cv2 +import torch +import numpy as np +from numpy import random +import torch_npu + + +def intersect(box_a, box_b): + max_xy = np.minimum(box_a[:, 2:], box_b[2:]) + min_xy = np.maximum(box_a[:, :2], box_b[:2]) + inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) + return inter[:, 0] * inter[:, 1] + + +def jaccard_numpy(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: Multiple bounding boxes, Shape: [num_boxes,4] + box_b: Single bounding box, Shape: [4] + Return: + jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])) # [A,B] + area_b = ((box_b[2]-box_b[0]) * + (box_b[3]-box_b[1])) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +class Compose(object): + """Composes several augmentations together. + Args: + transforms (List[Transform]): list of transforms to compose. 
+ Example: + >>> augmentations.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, boxes=None, labels=None, scale=None, offset=None): + for t in self.transforms: + img, boxes, labels, scale, offset = t(img, boxes, labels, scale, offset) + return img, boxes, labels, scale, offset + + +class ConvertFromInts(object): + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + return image.astype(np.float32), boxes, labels, scale, offset + + +class ToAbsoluteCoords(object): + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + height, width, channels = image.shape + boxes[:, 0] *= width + boxes[:, 2] *= width + boxes[:, 1] *= height + boxes[:, 3] *= height + + return image, boxes, labels, scale, offset + + +class ToPercentCoords(object): + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + height, width, channels = image.shape + boxes[:, 0] /= width + boxes[:, 2] /= width + boxes[:, 1] /= height + boxes[:, 3] /= height + + return image, boxes, labels, scale, offset + + +# ColorJitter +class ColorJitter(object): + def __init__(self): + self.pd = [ + RandomContrast(), + ConvertColor(transform='HSV'), + RandomSaturation(), + RandomHue(), + ConvertColor(current='HSV', transform='BGR'), + RandomContrast() + ] + self.rand_brightness = RandomBrightness() + + def __call__(self, image, boxes, labels, scale=None, offset=None): + im = image.copy() + im, boxes, labels, scale, offset = self.rand_brightness(im, boxes, labels, scale, offset) + if random.randint(2): + distort = Compose(self.pd[:-1]) + else: + distort = Compose(self.pd[1:]) + im, boxes, labels, scale, offset = distort(im, boxes, labels, scale, offset) + return im, boxes, labels, scale, offset + + +class RandomSaturation(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + if random.randint(2): + image[:, :, 1] *= random.uniform(self.lower, self.upper) + + return image, boxes, labels, scale, offset + + +class RandomHue(object): + def __init__(self, delta=18.0): + assert delta >= 0.0 and delta <= 360.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + if random.randint(2): + image[:, :, 0] += random.uniform(-self.delta, self.delta) + image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 + image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 + return image, boxes, labels, scale, offset + + +class ConvertColor(object): + def __init__(self, current='BGR', transform='HSV'): + self.transform = transform + self.current = current + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + if self.current == 'BGR' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + elif self.current == 'HSV' and self.transform == 'BGR': + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + else: + raise NotImplementedError + return image, boxes, labels, scale, offset + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." 
+ + # expects float image + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + if random.randint(2): + alpha = random.uniform(self.lower, self.upper) + image *= alpha + return image, boxes, labels, scale, offset + + +class RandomBrightness(object): + def __init__(self, delta=32): + assert delta >= 0.0 + assert delta <= 255.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + if random.randint(2): + delta = random.uniform(-self.delta, self.delta) + image += delta + return image, boxes, labels, scale, offset + + +# RandomCrop +class RandomSampleCrop(object): + """Crop + Arguments: + img (Image): the image being input during training + boxes (Tensor): the original bounding boxes in pt form + labels (Tensor): the class labels for each bbox + mode (float tuple): the min and max jaccard overlaps + Return: + (img, boxes, classes) + img (Image): the cropped image + boxes (Tensor): the adjusted bounding boxes in pt form + labels (Tensor): the class labels for each bbox + """ + def __init__(self): + self.sample_options = ( + # using entire original input image + None, + # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 + (0.1, None), + (0.3, None), + (0.7, None), + (0.9, None), + # randomly sample a patch + (None, None), + ) + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + height, width, _ = image.shape + while True: + # randomly choose a mode + sample_id = np.random.randint(len(self.sample_options)) + mode = self.sample_options[sample_id] + if mode is None: + return image, boxes, labels, scale, offset + + min_iou, max_iou = mode + if min_iou is None: + min_iou = float('-inf') + if max_iou is None: + max_iou = float('inf') + + # max trails (50) + for _ in range(50): + current_image = image + + w = random.uniform(0.3 * width, width) + h = random.uniform(0.3 * height, height) + + # aspect ratio constraint b/t .5 & 2 + if h / w < 0.5 or h / w > 2: + continue + + left = random.uniform(width - w) + top = random.uniform(height - h) + + # convert to integer rect x1,y1,x2,y2 + rect = np.array([int(left), int(top), int(left+w), int(top+h)]) + + # calculate IoU (jaccard overlap) b/t the cropped and gt boxes + overlap = jaccard_numpy(boxes, rect) + + # is min and max overlap constraint satisfied? if not try again + if overlap.min() < min_iou and max_iou < overlap.max(): + continue + + # cut the crop from the image + current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], + :] + + # keep overlap with gt box IF center in sampled patch + centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 + + # mask in all gt boxes that above and to the left of centers + m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) + + # mask in all gt boxes that under and to the right of centers + m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) + + # mask in that both m1 and m2 are true + mask = m1 * m2 + + # have any valid boxes? 
try again if not + if not mask.any(): + continue + + # take only matching gt boxes + current_boxes = boxes[mask, :].copy() + + # take only matching gt labels + current_labels = labels[mask] + + # should we use the box left and top corner or the crop's + current_boxes[:, :2] = np.maximum(current_boxes[:, :2], + rect[:2]) + # adjust to crop (by substracting crop's left,top) + current_boxes[:, :2] -= rect[:2] + + current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], + rect[2:]) + # adjust to crop (by substracting crop's left,top) + current_boxes[:, 2:] -= rect[:2] + + return current_image, current_boxes, current_labels, scale, offset + + +# RandomHFlip +class RandomHFlip(object): + def __call__(self, image, boxes, classes, scale=None, offset=None): + _, width, _ = image.shape + if random.randint(2): + image = image[:, ::-1] + boxes = boxes.copy() + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes, classes, scale, offset + + +# Normalize image +class Normalize(object): + def __init__(self, mean=None, std=None): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + image = image.astype(np.float32) + image /= 255. + image -= self.mean + image /= self.std + + return image, boxes, labels, scale, offset + + +# Resize +class Resize(object): + def __init__(self, size=640, mean=None): + self.size = size + self.mean = np.array([v*255 for v in mean]) + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + h0, w0, _ = image.shape + + if h0 > w0: + # resize + r = w0 / h0 + image = cv2.resize(image, (int(r * self.size), self.size)).astype(np.float32) + # zero padding + h, w, _ = image.shape + image_ = np.ones([h, h, 3]) * self.mean + dw = h - w + left = dw // 2 + image_[:, left:left+w, :] = image + offset = np.array([[ left / h, 0., left / h, 0.]]) + scale = np.array([[w / h, 1., w / h, 1.]]) + + elif h0 < w0: + # resize + r = h0 / w0 + image = cv2.resize(image, (self.size, int(r * self.size))).astype(np.float32) + # zero padding + h, w, _ = image.shape + image_ = np.ones([w, w, 3]) * self.mean + dh = w - h + top = dh // 2 + image_[top:top+h, :, :] = image + offset = np.array([[0., top / w, 0., top / w]]) + scale = np.array([1., h / w, 1., h / w]) + + else: + # resize + if h0 == self.size: + image_ = image + else: + image_ = cv2.resize(image, (self.size, self.size)).astype(np.float32) + offset = np.zeros([1, 4]) + scale = 1. 
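+ # Worked example (illustrative, not part of the original code): for a 480x640
+ # (h0 < w0) input with size=640, the image is resized to 640x480 and padded to
+ # 640x640 with top=80, giving offset = [0., 0.125, 0., 0.125] and
+ # scale = [1., 0.75, 1., 0.75]; the normalized boxes below are then mapped
+ # into the padded square via boxes * scale + offset (a y of 0.5 stays at 0.5).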
+ + if boxes is not None: + boxes = boxes * scale + offset + + return image_, boxes, labels, scale, offset + + +# convert ndarray image to tensor type +class ToTensor(object): + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + # to rgb + image = image[..., (2, 1, 0)] + return torch.from_numpy(image).permute(2, 0, 1).float(), boxes, labels, scale, offset + + +# TrainTransform +class TrainTransforms(object): + def __init__(self, size=640, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): + self.mean = mean + self.size = size + self.std = std + self.augment = Compose([ + ConvertFromInts(), + ToAbsoluteCoords(), + ColorJitter(), + RandomSampleCrop(), + RandomHFlip(), + ToPercentCoords(), + Resize(self.size, self.mean), + Normalize(self.mean, self.std), + ToTensor() + ]) + + def __call__(self, image, boxes, labels, scale=None, offset=None): + return self.augment(image, boxes, labels, scale, offset) + + +# ColorTransform +class ColorTransforms(object): + def __init__(self, size=640, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): + self.mean = mean + self.size = size + self.std = std + self.augment = Compose([ + ConvertFromInts(), + ToAbsoluteCoords(), + ColorJitter(), + RandomHFlip(), + ToPercentCoords(), + Resize(self.size, self.mean), + Normalize(self.mean, self.std), + ToTensor() + ]) + + def __call__(self, image, boxes, labels, scale=None, offset=None): + return self.augment(image, boxes, labels, scale, offset) + + +# ValTransform +class ValTransforms(object): + def __init__(self, size=640, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): + self.size = size + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.augment = Compose([ + Resize(self.size, self.mean), + Normalize(self.mean, self.std), + ToTensor() + ]) + + + def __call__(self, image, boxes=None, labels=None, scale=None, offset=None): + return self.augment(image, boxes, labels, scale, offset) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/data/voc.py b/PyTorch/contrib/cv/detection/YoloV2-640/data/voc.py new file mode 100644 index 0000000000..f98abe232d --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/data/voc.py @@ -0,0 +1,342 @@ +"""VOC Dataset Classes + +Original author: Francisco Massa +https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py + +Updated by: Ellis Brown, Max deGroot +""" +import os.path as osp +import torch.utils.data as data +import cv2 +import random +import numpy as np +import xml.etree.ElementTree as ET +import torch_npu + + +VOC_CLASSES = ( # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + +class VOCAnnotationTransform(object): + """Transforms a VOC annotation into a Tensor of bbox coords and label index + Initilized with a dictionary lookup of classnames to indexes + + Arguments: + class_to_ind (dict, optional): dictionary lookup of classnames -> indexes + (default: alphabetic indexing of VOC's 20 classes) + keep_difficult (bool, optional): keep difficult instances or not + (default: False) + height (int): height + width (int): width + """ + + def __init__(self, class_to_ind=None, keep_difficult=False): + self.class_to_ind = class_to_ind or dict( + zip(VOC_CLASSES, range(len(VOC_CLASSES)))) + self.keep_difficult = keep_difficult + + def __call__(self, target, width, height): + """ + Arguments: + target 
(annotation) : the target annotation to be made usable + will be an ET.Element + Returns: + a list containing lists of bounding boxes [bbox coords, class name] + """ + res = [] + for obj in target.iter('object'): + difficult = int(obj.find('difficult').text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find('name').text.lower().strip() + bbox = obj.find('bndbox') + + pts = ['xmin', 'ymin', 'xmax', 'ymax'] + bndbox = [] + for i, pt in enumerate(pts): + cur_pt = int(bbox.find(pt).text) - 1 + # scale height or width + cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height + bndbox.append(cur_pt) + label_idx = self.class_to_ind[name] + bndbox.append(label_idx) + res += [bndbox] # [x1, y1, x2, y2, label_ind] + # img_id = target.find('filename').text[:-4] + + return res # [[x1, y1, x2, y2, label_ind], ... ] + + +class VOCDetection(data.Dataset): + """VOC Detection Dataset Object + + input is image, target is annotation + + Arguments: + root (string): filepath to VOCdevkit folder. + image_set (string): imageset to use (eg. 'train', 'val', 'test') + transform (callable, optional): transformation to perform on the + input image + target_transform (callable, optional): transformation to perform on the + target `annotation` + (eg: take in caption string, return tensor of word indices) + dataset_name (string, optional): which dataset to load + (default: 'VOC2007') + """ + + def __init__(self, + data_dir=None, + img_size=640, + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + transform=None, + color_augment=None, + target_transform=VOCAnnotationTransform(), + mosaic=False, + mixup=False): + self.root = data_dir + self.img_size = img_size + self.image_set = image_sets + self.target_transform = target_transform + self._annopath = osp.join('%s', 'Annotations', '%s.xml') + self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') + self.ids = list() + for (year, name) in image_sets: + rootpath = osp.join(self.root, 'VOC' + year) + for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): + self.ids.append((rootpath, line.strip())) + # augmentation + self.transform = transform + self.mosaic = mosaic + self.mixup = mixup + self.color_augment = color_augment + if self.mosaic: + print('use Mosaic Augmentation ...') + if self.mixup: + print('use MixUp Augmentation ...') + + + def __getitem__(self, index): + im, gt, h, w, scale, offset = self.pull_item(index) + return im, gt + + + def __len__(self): + return len(self.ids) + + + def load_img_targets(self, img_id): + # load an image + img = cv2.imread(self._imgpath % img_id) + height, width, channels = img.shape + + # laod a target + target = ET.parse(self._annopath % img_id).getroot() + if self.target_transform is not None: + target = self.target_transform(target, width, height) + + return img, target, height, width + + + def load_mosaic(self, index): + ids_list_ = self.ids[:index] + self.ids[index+1:] + # random sample other indexs + id1 = self.ids[index] + id2, id3, id4 = random.sample(ids_list_, 3) + ids = [id1, id2, id3, id4] + + img_lists = [] + tg_lists = [] + # load image and target + for id_ in ids: + img_i, target_i, _, _ = self.load_img_targets(id_) + img_lists.append(img_i) + tg_lists.append(target_i) + + mean = np.array([v*255 for v in self.transform.mean]) + mosaic_img = np.ones([self.img_size*2, self.img_size*2, img_i.shape[2]], dtype=np.uint8) * mean + # mosaic center + yc, xc = [int(random.uniform(-x, 2*self.img_size + x)) for x in [-self.img_size // 2, -self.img_size // 2]] + # yc = xc = 
self.img_size + + mosaic_tg = [] + for i in range(4): + img_i, target_i = img_lists[i], tg_lists[i] + target_i = np.array(target_i) + h0, w0, _ = img_i.shape + + # resize + scale_range = np.arange(50, 210, 10) + s = np.random.choice(scale_range) / 100. + + if np.random.randint(2): + # keep aspect ratio + r = self.img_size / max(h0, w0) + if r != 1: + img_i = cv2.resize(img_i, (int(w0 * r * s), int(h0 * r * s))) + else: + img_i = cv2.resize(img_i, (int(self.img_size * s), int(self.img_size * s))) + h, w, _ = img_i.shape + + # place img in img4 + if i == 0: # top left + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, self.img_size * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(self.img_size * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, self.img_size * 2), min(self.img_size * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + mosaic_img[y1a:y2a, x1a:x2a] = img_i[y1b:y2b, x1b:x2b] + padw = x1a - x1b + padh = y1a - y1b + + # labels + target_i_ = target_i.copy() + if len(target_i) > 0: + # a valid target, and modify it. + target_i_[:, 0] = (w * (target_i[:, 0]) + padw) + target_i_[:, 1] = (h * (target_i[:, 1]) + padh) + target_i_[:, 2] = (w * (target_i[:, 2]) + padw) + target_i_[:, 3] = (h * (target_i[:, 3]) + padh) + # check boxes + valid_tgt = [] + for tgt in target_i_: + x1, y1, x2, y2, label = tgt + bw, bh = x2 - x1, y2 - y1 + if bw > 5. 
and bh > 5.: + valid_tgt.append([x1, y1, x2, y2, label]) + if len(valid_tgt) == 0: + valid_tgt.append([0., 0., 0., 0., 0.]) + + mosaic_tg.append(target_i_) + # check target + if len(mosaic_tg) == 0: + mosaic_tg = np.zeros([1, 5]) + else: + mosaic_tg = np.concatenate(mosaic_tg, axis=0) + # Cutout/Clip targets + np.clip(mosaic_tg[:, :4], 0, 2 * self.img_size, out=mosaic_tg[:, :4]) + # normalize + mosaic_tg[:, :4] /= (self.img_size * 2) + + return mosaic_img, mosaic_tg, self.img_size, self.img_size + + + def pull_item(self, index): + # load a mosaic image + if self.mosaic and np.random.randint(2): + # mosaic + img, target, height, width = self.load_mosaic(index) + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if self.mixup and np.random.randint(2): + img2, target2, height, width = self.load_mosaic(np.random.randint(0, len(self.ids))) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + target = np.concatenate((target, target2), 0) + + # augment + img, boxes, labels, scale, offset = self.color_augment(img, target[:, :4], target[:, 4]) + + # load an image and target + else: + img_id = self.ids[index] + img, target, height, width = self.load_img_targets(img_id) + if len(target) == 0: + target = np.zeros([1, 5]) + else: + target = np.array(target) + # augment + img, boxes, labels, scale, offset = self.transform(img, target[:, :4], target[:, 4]) + + target = np.hstack((boxes, np.expand_dims(labels, axis=1))) + + return img, target, height, width, scale, offset + + + def pull_image(self, index): + '''Returns the original image object at index in PIL form + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. + + Argument: + index (int): index of img to show + Return: + PIL img + ''' + img_id = self.ids[index] + return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id + + + def pull_anno(self, index): + '''Returns the original annotation of image at index + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. 
+ + Argument: + index (int): index of img to get annotation of + Return: + list: [img_id, [(label, bbox coords),...]] + eg: ('001718', [('dog', (96, 13, 438, 332))]) + ''' + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + gt = self.target_transform(anno, 1, 1) + return img_id[1], gt + + +if __name__ == "__main__": + from transforms import TrainTransforms, ColorTransforms, ValTransforms + + mean=(0.406, 0.456, 0.485) + std=(0.225, 0.224, 0.229) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + + img_size = 640 + dataset = VOCDetection( + data_dir='d:/datasets/VOCdevkit/', + img_size=img_size, + transform=ValTransforms(img_size), + color_augment=ColorTransforms(img_size), + mosaic=True, + mixup=True) + + np.random.seed(0) + class_colors = [(np.random.randint(255), + np.random.randint(255), + np.random.randint(255)) for _ in range(20)] + print('Data length: ', len(dataset)) + for i in range(len(dataset)): + image, target, _, _, _, _ = dataset.pull_item(i) + image = image.permute(1, 2, 0).numpy()[:, :, (2, 1, 0)] + image = ((image * std + mean)*255).astype(np.uint8) + image = image.copy() + + for box in target: + x1, y1, x2, y2, cls_id = box + cls_id = int(cls_id) + color = class_colors[cls_id] + # class name + label = VOC_CLASSES[cls_id] + x1 *= img_size + y1 *= img_size + x2 *= img_size + y2 *= img_size + image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2) + # put the test on the bbox + cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA) + cv2.imshow('gt', image) + # cv2.imwrite(str(i)+'.jpg', img) + cv2.waitKey(0) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/demo.py b/PyTorch/contrib/cv/detection/YoloV2-640/demo.py new file mode 100644 index 0000000000..0481695818 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/demo.py @@ -0,0 +1,249 @@ +import argparse +import os +import cv2 +import time +import numpy as np +import torch + +from config.yolo_config import yolo_config +from data.coco import coco_class_labels, coco_class_index +from data.transforms import ValTransforms +from models.yolo import build_model +import torch_npu + + + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO Demo Detection') + + # basic + parser.add_argument('--mode', default='image', + type=str, help='Use the data from image, video or camera') + parser.add_argument('--cuda', action='store_true', default=False, + help='Use cuda') + parser.add_argument('--path_to_img', default='data/demo/images/', + type=str, help='The path to image files') + parser.add_argument('--path_to_vid', default='data/demo/videos/', + type=str, help='The path to video files') + parser.add_argument('--path_to_save', default='det_results/images/', + type=str, help='The path to save the detection results') + parser.add_argument('--path_to_saveVid', default='data/video/result.avi', + type=str, help='The path to save the detection results video') + parser.add_argument('-vs', '--visual_threshold', default=0.3, + type=float, help='visual threshold') + + # model + parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') + parser.add_argument('--num_queries', type=int, default=4, + help='number of queris of YOLOQ') + parser.add_argument('--weight', default='weights/', + type=str, help='Trained state_dict file path to open') + parser.add_argument('-size', '--img_size', default=640, type=int, + 
help='img_size') + parser.add_argument('--conf_thresh', default=0.1, type=float, + help='NMS threshold') + parser.add_argument('--nms_thresh', default=0.45, type=float, + help='NMS threshold') + parser.add_argument('--center_sample', action='store_true', default=False, + help='center sample trick.') + + return parser.parse_args() + + +def plot_bbox_labels(img, bbox, label, cls_color, test_scale=0.4): + x1, y1, x2, y2 = bbox + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] + # plot bbox + cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) + # plot title bbox + cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * test_scale), y1), cls_color, -1) + # put the test on the title bbox + cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, test_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) + + return img + + +def visualize(img, bboxes, scores, cls_inds, class_colors, vis_thresh=0.3): + ts = 0.4 + for i, bbox in enumerate(bboxes): + if scores[i] > vis_thresh: + cls_color = class_colors[int(cls_inds[i])] + cls_id = coco_class_index[int(cls_inds[i])] + mess = '%s: %.2f' % (coco_class_labels[cls_id], scores[i]) + img = plot_bbox_labels(img, bbox, mess, cls_color, test_scale=ts) + + return img + + +def detect(net, + device, + transform, + vis_thresh, + mode='image', + path_to_img=None, + path_to_vid=None, + path_to_save=None): + # class color + class_colors = [(np.random.randint(255), + np.random.randint(255), + np.random.randint(255)) for _ in range(80)] + save_path = os.path.join(path_to_save, mode) + os.makedirs(save_path, exist_ok=True) + + # ------------------------- Camera ---------------------------- + if mode == 'camera': + print('use camera !!!') + cap = cv2.VideoCapture(0, cv2.CAP_DSHOW) + while True: + ret, frame = cap.read() + if ret: + if cv2.waitKey(1) == ord('q'): + break + img_h, img_w = frame.shape[:2] + size = np.array([[img_w, img_h, img_w, img_h]]) + # prepare + x, _, _, scale, offset = transform(frame) + x = x.unsqueeze(0).to(device) + # inference + t0 = time.time() + bboxes, scores, cls_inds = net(x) + t1 = time.time() + print("detection time used ", t1-t0, "s") + + # rescale + bboxes -= offset + bboxes /= scale + bboxes *= size + + frame_processed = visualize(img=frame, + bboxes=bboxes, + scores=scores, + cls_inds=cls_inds, + class_colors=class_colors, + vis_thresh=vis_thresh) + cv2.imshow('detection result', frame_processed) + cv2.waitKey(1) + else: + break + cap.release() + cv2.destroyAllWindows() + + # ------------------------- Image ---------------------------- + elif mode == 'image': + for i, img_id in enumerate(os.listdir(path_to_img)): + img = cv2.imread(path_to_img + '/' + img_id, cv2.IMREAD_COLOR) + img_h, img_w = img.shape[:2] + size = np.array([[img_w, img_h, img_w, img_h]]) + + # prepare + x, _, _, scale, offset = transform(img) + x = x.unsqueeze(0).to(device) + # inference + t0 = time.time() + bboxes, scores, cls_inds = net(x) + t1 = time.time() + print("detection time used ", t1-t0, "s") + + # rescale + bboxes -= offset + bboxes /= scale + bboxes *= size + + img_processed = visualize(img=img, + bboxes=bboxes, + scores=scores, + cls_inds=cls_inds, + class_colors=class_colors, + vis_thresh=vis_thresh) + + cv2.imshow('detection', img_processed) + cv2.imwrite(os.path.join(save_path, str(i).zfill(6)+'.jpg'), img_processed) + cv2.waitKey(0) + + # ------------------------- Video --------------------------- + elif mode == 'video': + video = cv2.VideoCapture(path_to_vid) + fourcc = 
cv2.VideoWriter_fourcc(*'XVID') + save_size = (640, 480) + save_path = os.path.join(save_path, 'det.avi') + fps = 15.0 + out = cv2.VideoWriter(save_path, fourcc, fps, save_size) + + while(True): + ret, frame = video.read() + + if ret: + # ------------------------- Detection --------------------------- + img_h, img_w = frame.shape[:2] + size = np.array([[img_w, img_h, img_w, img_h]]) + # prepare + x, _, _, scale, offset = transform(frame) + x = x.unsqueeze(0).to(device) + # inference + t0 = time.time() + bboxes, scores, cls_inds = net(x) + t1 = time.time() + print("detection time used ", t1-t0, "s") + + # rescale + bboxes -= offset + bboxes /= scale + bboxes *= size + + frame_processed = visualize(img=frame, + bboxes=bboxes, + scores=scores, + cls_inds=cls_inds, + class_colors=class_colors, + vis_thresh=vis_thresh) + + frame_processed_resize = cv2.resize(frame_processed, save_size) + out.write(frame_processed_resize) + cv2.imshow('detection', frame_processed) + cv2.waitKey(1) + else: + break + video.release() + out.release() + cv2.destroyAllWindows() + + +def run(): + args = parse_args() + + # use cuda + if args.cuda: + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO Config + cfg = yolo_config[args.model] + # build model + model = build_model(args=args, + cfg=cfg, + device=device, + num_classes=80, + trainable=False) + + # load weight + model.load_state_dict(torch.load(args.weight, map_location='cpu'), strict=False) + model = model.to(device).eval() + print('Finished loading model!') + + # run + detect(net=model, + device=device, + transform=ValTransforms(args.img_size), + mode=args.mode, + path_to_img=args.path_to_img, + path_to_vid=args.path_to_vid, + path_to_save=args.path_to_save, + vis_thresh=args.visual_threshold) + + +if __name__ == '__main__': + run() diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/env_npu.sh b/PyTorch/contrib/cv/detection/YoloV2-640/env_npu.sh new file mode 100644 index 0000000000..0c12c76322 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/env_npu.sh @@ -0,0 +1,79 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export 
OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |gawk '{print $1}') +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 +# HCCL默认超时时间120s较少,修改为1800s对齐PyTorch默认设置 +export HCCL_CONNECT_TIMEOUT=5400 +export HCCL_EXEC_TIMEOUT=5400 + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/eval.py b/PyTorch/contrib/cv/detection/YoloV2-640/eval.py new file mode 100644 index 
0000000000..e6e47646e4 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/eval.py @@ -0,0 +1,134 @@ +import argparse +import os + +import torch + +from config.yolo_config import yolo_config +from data.transforms import ValTransforms +from models.yolo import build_model +from utils.misc import TestTimeAugmentation + +from evaluator.vocapi_evaluator import VOCAPIEvaluator +from evaluator.cocoapi_evaluator import COCOAPIEvaluator +import torch_npu + + +parser = argparse.ArgumentParser(description='YOLO Detection') +# basic +parser.add_argument('-size', '--img_size', default=640, type=int, + help='img_size') +parser.add_argument('--cuda', action='store_true', default=False, + help='Use cuda') +# model +parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') +parser.add_argument('--weight', type=str, + default='weights/', + help='Trained state_dict file path to open') +parser.add_argument('--conf_thresh', default=0.001, type=float, + help='NMS threshold') +parser.add_argument('--nms_thresh', default=0.6, type=float, + help='NMS threshold') +parser.add_argument('--center_sample', action='store_true', default=False, + help='center sample trick.') +# dataset +parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') +parser.add_argument('-d', '--dataset', default='coco-val', + help='voc, coco-val, coco-test.') +# TTA +parser.add_argument('-tta', '--test_aug', action='store_true', default=False, + help='use test augmentation.') + +args = parser.parse_args() + + +def voc_test(model, data_dir, device, img_size): + evaluator = VOCAPIEvaluator(data_root=data_dir, + img_size=img_size, + device=device, + transform=ValTransforms(img_size), + display=True + ) + + # VOC evaluation + evaluator.evaluate(model) + + +def coco_test(model, data_dir, device, img_size, test=False): + if test: + # test-dev + print('test on test-dev 2017') + evaluator = COCOAPIEvaluator( + data_dir=data_dir, + img_size=img_size, + device=device, + testset=True, + transform=ValTransforms(img_size) + ) + + else: + # eval + evaluator = COCOAPIEvaluator( + data_dir=data_dir, + img_size=img_size, + device=device, + testset=False, + transform=ValTransforms(img_size) + ) + + # COCO evaluation + evaluator.evaluate(model) + + +if __name__ == '__main__': + # dataset + if args.dataset == 'voc': + print('eval on voc ...') + num_classes = 20 + data_dir = os.path.join(args.root, 'VOCdevkit') + elif args.dataset == 'coco-val': + print('eval on coco-val ...') + num_classes = 80 + data_dir = os.path.join(args.root, 'COCO') + elif args.dataset == 'coco-test': + print('eval on coco-test-dev ...') + num_classes = 80 + data_dir = os.path.join(args.root, 'COCO') + else: + print('unknow dataset !! 
we only support voc, coco-val, coco-test !!!') + exit(0) + + # cuda + if args.cuda: + print('use cuda') + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO Config + cfg = yolo_config[args.model] + # build model + model = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=False) + + # load weight + model.load_state_dict(torch.load(args.weight, map_location='cpu'), strict=False) + model = model.to(device).eval() + print('Finished loading model!') + + # TTA + test_aug = TestTimeAugmentation(num_classes=num_classes) if args.test_aug else None + + # evaluation + with torch.no_grad(): + if args.dataset == 'voc': + voc_test(model, data_dir, device, args.img_size) + elif args.dataset == 'coco-val': + coco_test(model, data_dir, device, args.img_size, test=False) + elif args.dataset == 'coco-test': + coco_test(model, data_dir, device, args.img_size, test=True) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/cocoapi_evaluator.py b/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/cocoapi_evaluator.py new file mode 100644 index 0000000000..9480ce3b59 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/cocoapi_evaluator.py @@ -0,0 +1,135 @@ +import torch.nn.functional as F +import json +import tempfile +import torch +from data.coco import * +import torch_npu +try: + from pycocotools.cocoeval import COCOeval +except: + print("It seems that the COCOAPI is not installed.") + + +class COCOAPIEvaluator(): + """ + COCO AP Evaluation class. + All the data in the val2017 dataset are processed \ + and evaluated by COCO API. + """ + def __init__(self, data_dir, img_size, device, testset=False, transform=None): + """ + Args: + data_dir (str): dataset root directory + img_size (int): image size after preprocess. images are resized \ + to squares whose shape is (img_size, img_size). + confthre (float): + confidence threshold ranging from 0 to 1, \ + which is defined in the config file. + nmsthre (float): + IoU threshold of non-max supression ranging from 0 to 1. + """ + self.testset = testset + if self.testset: + image_set = 'test2017' + else: + image_set = 'val2017' + + self.dataset = COCODataset( + data_dir=data_dir, + image_set=image_set, + img_size=img_size, + transform=None) + self.img_size = img_size + self.transform = transform + self.device = device + + self.map = 0. + self.ap50_95 = 0. + self.ap50 = 0. + + def evaluate(self, model): + """ + COCO average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. 
+ Args: + model : model object + Returns: + ap50_95 (float) : calculated COCO AP for IoU=50:95 + ap50 (float) : calculated COCO AP for IoU=50 + """ + model.eval() + ids = [] + data_dict = [] + num_images = len(self.dataset) + print('total number of images: %d' % (num_images)) + + # start testing + for index in range(num_images): # all the data in val2017 + if index % 500 == 0: + print('[Eval: %d / %d]'%(index, num_images)) + + # load an image + img, id_ = self.dataset.pull_image(index) + h, w, _ = img.shape + size = np.array([[w, h, w, h]]) + + # preprocess + x, _, _, scale, offset = self.transform(img) + x = x.unsqueeze(0).to(self.device) + + id_ = int(id_) + ids.append(id_) + # inference + with torch.no_grad(): + outputs = model(x) + bboxes, scores, cls_inds = outputs + # map the boxes to original image + bboxes -= offset + bboxes /= scale + bboxes *= size + + for i, box in enumerate(bboxes): + x1 = float(box[0]) + y1 = float(box[1]) + x2 = float(box[2]) + y2 = float(box[3]) + label = self.dataset.class_ids[int(cls_inds[i])] + + bbox = [x1, y1, x2 - x1, y2 - y1] + score = float(scores[i]) # object score * class score + A = {"image_id": id_, "category_id": label, "bbox": bbox, + "score": score} # COCO json format + data_dict.append(A) + + annType = ['segm', 'bbox', 'keypoints'] + + # Evaluate the Dt (detection) json comparing with the ground truth + if len(data_dict) > 0: + print('evaluating ......') + cocoGt = self.dataset.coco + # workaround: temporarily write data to json file because pycocotools can't process dict in py36. + if self.testset: + json.dump(data_dict, open('coco_test-dev.json', 'w')) + cocoDt = cocoGt.loadRes('coco_test-dev.json') + return -1, -1 + else: + _, tmp = tempfile.mkstemp() + json.dump(data_dict, open(tmp, 'w')) + cocoDt = cocoGt.loadRes(tmp) + cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) + cocoEval.params.imgIds = ids + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1] + print('ap50_95 : ', ap50_95) + print('ap50 : ', ap50) + self.map = ap50_95 + self.ap50_95 = ap50_95 + self.ap50 = ap50 + + return ap50, ap50_95 + else: + return 0, 0 + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/vocapi_evaluator.py b/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/vocapi_evaluator.py new file mode 100644 index 0000000000..43f09e93a8 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/evaluator/vocapi_evaluator.py @@ -0,0 +1,347 @@ +"""Adapted from: + @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch + @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn + Licensed under The MIT License [see LICENSE for details] +""" + +from data.voc import VOCDetection, VOC_CLASSES +import os +import time +import numpy as np +import pickle +import xml.etree.ElementTree as ET + + +class VOCAPIEvaluator(): + """ VOC AP Evaluation class """ + def __init__(self, + data_dir, + img_size, + device, + transform, + set_type='test', + year='2007', + display=False): + self.data_dir = data_dir + self.img_size = img_size + self.device = device + self.transform = transform + self.labelmap = VOC_CLASSES + self.set_type = set_type + self.year = year + self.display = display + + # path + self.devkit_path = os.path.join(data_dir, 'VOC' + year) + self.annopath = os.path.join(data_dir, 'VOC2007', 'Annotations', '%s.xml') + self.imgpath = os.path.join(data_dir, 'VOC2007', 'JPEGImages', '%s.jpg') + self.imgsetpath = os.path.join(data_dir, 'VOC2007', 
'ImageSets', 'Main', set_type+'.txt') + self.output_dir = self.get_output_dir('voc_eval/', self.set_type) + + # dataset + self.dataset = VOCDetection(data_dir=data_dir, + image_sets=[('2007', set_type)], + transform=transform) + + def evaluate(self, net): + net.eval() + num_images = len(self.dataset) + # all detections are collected into: + # all_boxes[cls][image] = N x 5 array of detections in + # (x1, y1, x2, y2, score) + self.all_boxes = [[[] for _ in range(num_images)] + for _ in range(len(self.labelmap))] + + # timers + det_file = os.path.join(self.output_dir, 'detections.pkl') + + for i in range(num_images): + im, _ = self.dataset.pull_image(i) + h, w, _ = im.shape + size = np.array([[w, h, w, h]]) + + # preprocess + x, _, _, scale, offset = self.transform(im) + x = x.unsqueeze(0).to(self.device) + + t0 = time.time() + # forward + bboxes, scores, cls_inds = net(x) + detect_time = time.time() - t0 + # map the boxes to original image + bboxes -= offset + bboxes /= scale + bboxes *= size + + for j in range(len(self.labelmap)): + inds = np.where(cls_inds == j)[0] + if len(inds) == 0: + self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_dets = np.hstack((c_bboxes, + c_scores[:, np.newaxis])).astype(np.float32, + copy=False) + self.all_boxes[j][i] = c_dets + + if i % 500 == 0: + print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) + + with open(det_file, 'wb') as f: + pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL) + + print('Evaluating detections') + self.evaluate_detections(self.all_boxes) + + print('Mean AP: ', self.map) + + + def parse_rec(self, filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text), + int(bbox.find('ymin').text), + int(bbox.find('xmax').text), + int(bbox.find('ymax').text)] + objects.append(obj_struct) + + return objects + + + def get_output_dir(self, name, phase): + """Return the directory where experimental artifacts are placed. + If the directory does not exist, it is created. + A canonical path is built using the name from an imdb and a network + (if not None). + """ + filedir = os.path.join(name, phase) + if not os.path.exists(filedir): + os.makedirs(filedir) + return filedir + + + def get_voc_results_file_template(self, cls): + # VOCdevkit/VOC2007/results/det_test_aeroplane.txt + filename = 'det_' + self.set_type + '_%s.txt' % (cls) + filedir = os.path.join(self.devkit_path, 'results') + if not os.path.exists(filedir): + os.makedirs(filedir) + path = os.path.join(filedir, filename) + return path + + + def write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(self.labelmap): + if self.display: + print('Writing {:s} VOC results file'.format(cls)) + filename = self.get_voc_results_file_template(cls) + with open(filename, 'wt') as f: + for im_ind, index in enumerate(self.dataset.ids): + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + # the VOCdevkit expects 1-based indices + for k in range(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
+ format(index[1], dets[k, -1], + dets[k, 0] + 1, dets[k, 1] + 1, + dets[k, 2] + 1, dets[k, 3] + 1)) + + + def do_python_eval(self, use_07=True): + cachedir = os.path.join(self.devkit_path, 'annotations_cache') + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = use_07 + print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) + if not os.path.isdir(self.output_dir): + os.mkdir(self.output_dir) + for i, cls in enumerate(self.labelmap): + filename = self.get_voc_results_file_template(cls) + rec, prec, ap = self.voc_eval(detpath=filename, + classname=cls, + cachedir=cachedir, + ovthresh=0.5, + use_07_metric=use_07_metric + ) + aps += [ap] + print('AP for {} = {:.4f}'.format(cls, ap)) + with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f: + pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) + if self.display: + self.map = np.mean(aps) + print('Mean AP = {:.4f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('Results:') + for ap in aps: + print('{:.3f}'.format(ap)) + print('{:.3f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('') + print('--------------------------------------------------------------') + print('Results computed with the **unofficial** Python eval code.') + print('Results should be very close to the official MATLAB eval code.') + print('--------------------------------------------------------------') + else: + self.map = np.mean(aps) + print('Mean AP = {:.4f}'.format(np.mean(aps))) + + + def voc_ap(self, rec, prec, use_07_metric=True): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:True). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
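+ # Illustrative check (not part of the original code): with rec = [0.5, 1.0]
+ # and prec = [1.0, 0.5], the max precision is 1.0 at recall thresholds
+ # 0.0-0.5 (six points) and 0.5 at 0.6-1.0 (five points), so
+ # ap = (6*1.0 + 5*0.5)/11 ~= 0.773.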
+ else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + + def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True): + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, 'annots.pkl') + # read list of images + with open(self.imgsetpath, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = self.parse_rec(self.annopath % (imagename)) + if i % 100 == 0 and self.display: + print('Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + if self.display: + print('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'wb') as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, 'rb') as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + if any(lines) == 1: + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin, 0.) + ih = np.maximum(iymax - iymin, 0.) + inters = iw * ih + uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + + (BBGT[:, 2] - BBGT[:, 0]) * + (BBGT[:, 3] - BBGT[:, 1]) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = self.voc_ap(rec, prec, use_07_metric) + else: + rec = -1. + prec = -1. + ap = -1. 
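+ # No detection file entries exist for this class, so rec/prec/ap are returned
+ # as -1 sentinels; note that do_python_eval above still averages this value
+ # into the reported mean AP.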
+ + return rec, prec, ap + + + def evaluate_detections(self, box_list): + self.write_voc_results_file(box_list) + self.do_python_eval() + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/__init__.py new file mode 100644 index 0000000000..270c40cb74 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/__init__.py @@ -0,0 +1,84 @@ +from .resnet import resnet18, resnet50, resnet101 +from .darknet import darknet53 +from .cspdarknet_tiny import cspdarknet_tiny +from .cspdarknet53 import cspdarknet53 +from .yolox_backbone import yolox_cspdarknet_s, yolox_cspdarknet_m, yolox_cspdarknet_l, \ + yolox_cspdarknet_x, yolox_cspdarknet_tiny, yolox_cspdarknet_nano +from .shufflenetv2 import shufflenetv2 +from .vit import vit_base_patch16_224 + + +def build_backbone(model_name='r18', pretrained=False, freeze=None, img_size=224): + if model_name == 'r18': + print('Backbone: ResNet-18 ...') + model = resnet18(pretrained=pretrained) + feature_channels = [128, 256, 512] + strides = [8, 16, 32] + elif model_name == 'r50': + print('Backbone: ResNet-50 ...') + model = resnet50(pretrained=pretrained) + feature_channels = [512, 1024, 2048] + strides = [8, 16, 32] + elif model_name == 'r101': + print('Backbone: ResNet-101 ...') + model = resnet101(pretrained=pretrained) + feature_channels = [512, 1024, 2048] + strides = [8, 16, 32] + elif model_name == 'd53': + print('Backbone: DarkNet-53 ...') + model = darknet53(pretrained=pretrained) + feature_channels = [256, 512, 1024] + strides = [8, 16, 32] + elif model_name == 'cspd53': + print('Backbone: CSPDarkNet-53 ...') + model = cspdarknet53(pretrained=pretrained) + feature_channels = [256, 512, 1024] + strides = [8, 16, 32] + elif model_name == 'cspd_tiny': + print('Backbone: CSPDarkNet-Tiny ...') + model = cspdarknet_tiny(pretrained=pretrained) + feature_channels = [128, 256, 512] + strides = [8, 16, 32] + elif model_name == 'sfnet_v2': + print('Backbone: ShuffleNet-V2 ...') + model = shufflenetv2(pretrained=pretrained) + feature_channels = [116, 232, 464] + strides = [8, 16, 32] + elif model_name == 'vit_base_16': + print('Backbone: ViT_Base_16 ...') + model = vit_base_patch16_224(img_size=img_size, pretrained=pretrained) + feature_channels = [None, None, 768] + strides = [None, None, 16] + # YOLOX backbone + elif model_name == 'csp_s': + print('Backbone: YOLOX-CSPDarkNet-S ...') + model = yolox_cspdarknet_s(pretrained=pretrained, freeze=freeze) + feature_channels = [128, 256, 512] + strides = [8, 16, 32] + elif model_name == 'csp_m': + print('Backbone: YOLOX-CSPDarkNet-M ...') + model = yolox_cspdarknet_m(pretrained=pretrained, freeze=freeze) + feature_channels = [192, 384, 768] + strides = [8, 16, 32] + elif model_name == 'csp_l': + print('Backbone: YOLOX-CSPDarkNet-L ...') + model = yolox_cspdarknet_l(pretrained=pretrained, freeze=freeze) + feature_channels = [256, 512, 1024] + strides = [8, 16, 32] + elif model_name == 'csp_x': + print('Backbone: YOLOX-CSPDarkNet-X ...') + model = yolox_cspdarknet_x(pretrained=pretrained, freeze=freeze) + feature_channels = [320, 640, 1280] + strides = [8, 16, 32] + elif model_name == 'csp_t': + print('Backbone: YOLOX-CSPDarkNet-Tiny ...') + model = 
yolox_cspdarknet_tiny(pretrained=pretrained, freeze=freeze) + feature_channels = [96, 192, 384] + strides = [8, 16, 32] + elif model_name == 'csp_n': + print('Backbone: YOLOX-CSPDarkNet-Nano ...') + model = yolox_cspdarknet_nano(pretrained=pretrained, freeze=freeze) + feature_channels = [64, 128, 256] + strides = [8, 16, 32] + + return model, feature_channels, strides diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet53.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet53.py new file mode 100644 index 0000000000..40dcf833de --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet53.py @@ -0,0 +1,296 @@ +""" + This is a CSPDarkNet-53 with Mish. +""" +import os +import torch +import torch.nn as nn +import torch_npu + + +def ConvNormActivation(inplanes, + planes, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + groups=1): + """ + A help function to build a 'conv-bn-activation' module + """ + layers = [] + layers.append(nn.Conv2d(inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=False)) + layers.append(nn.BatchNorm2d(planes, eps=1e-4, momentum=0.03)) + layers.append(nn.Mish(inplace=True)) + return nn.Sequential(*layers) + + +def make_cspdark_layer(block, + inplanes, + planes, + num_blocks, + is_csp_first_stage, + dilation=1): + downsample = ConvNormActivation( + inplanes=planes, + planes=planes if is_csp_first_stage else inplanes, + kernel_size=1, + stride=1, + padding=0 + ) + + layers = [] + for i in range(0, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes if is_csp_first_stage else inplanes, + downsample=downsample if i == 0 else None, + dilation=dilation + ) + ) + return nn.Sequential(*layers) + + +class DarkBlock(nn.Module): + + def __init__(self, + inplanes, + planes, + dilation=1, + downsample=None): + """Residual Block for DarkNet. + This module has the dowsample layer (optional), + 1x1 conv layer and 3x3 conv layer. + """ + super(DarkBlock, self).__init__() + + self.downsample = downsample + + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-4, momentum=0.03) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-4, momentum=0.03) + + self.conv1 = nn.Conv2d( + planes, + inplanes, + kernel_size=1, + stride=1, + padding=0, + bias=False + ) + + self.conv2 = nn.Conv2d( + inplanes, + planes, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + bias=False + ) + + self.activation = nn.Mish(inplace=True) + + def forward(self, x): + if self.downsample is not None: + x = self.downsample(x) + + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.activation(out) + + out += identity + + return out + + +class CrossStagePartialBlock(nn.Module): + """CSPNet: A New Backbone that can Enhance Learning Capability of CNN. + Refer to the paper for more details: https://arxiv.org/abs/1911.11929. + In this module, the inputs go throuth the base conv layer at the first, + and then pass the two partial transition layers. + 1. go throuth basic block (like DarkBlock) + and one partial transition layer. + 2. go throuth the other partial transition layer. + At last, They are concat into fuse transition layer. + Args: + inplanes (int): number of input channels. + planes (int): number of output channels + stage_layers (nn.Module): the basic block which applying CSPNet. + is_csp_first_stage (bool): Is the first stage or not. 
+ The number of input and output channels in the first stage of + CSPNet is different from other stages. + dilation (int): conv dilation + stride (int): stride for the base layer + """ + + def __init__(self, + inplanes, + planes, + stage_layers, + is_csp_first_stage, + dilation=1, + stride=2): + super(CrossStagePartialBlock, self).__init__() + + self.base_layer = ConvNormActivation( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation + ) + self.partial_transition1 = ConvNormActivation( + inplanes=planes, + planes=inplanes if not is_csp_first_stage else planes, + kernel_size=1, + stride=1, + padding=0 + ) + self.stage_layers = stage_layers + + self.partial_transition2 = ConvNormActivation( + inplanes=inplanes if not is_csp_first_stage else planes, + planes=inplanes if not is_csp_first_stage else planes, + kernel_size=1, + stride=1, + padding=0 + ) + self.fuse_transition = ConvNormActivation( + inplanes=planes if not is_csp_first_stage else planes * 2, + planes=planes, + kernel_size=1, + stride=1, + padding=0 + ) + + def forward(self, x): + x = self.base_layer(x) + + out1 = self.partial_transition1(x) + + out2 = self.stage_layers(x) + out2 = self.partial_transition2(out2) + + out = torch.cat([out2, out1], dim=1) + out = self.fuse_transition(out) + + return out + + +class CSPDarkNet53(nn.Module): + """CSPDarkNet backbone. + Refer to the paper for more details: https://arxiv.org/pdf/1804.02767 + Args: + depth (int): Depth of Darknet, from {53}. + num_stages (int): Darknet stages, normally 5. + with_csp (bool): Use cross stage partial connection or not. + out_features (List[str]): Output features. + norm_type (str): type of normalization layer. + res5_dilation (int): dilation for the last stage + """ + + def __init__(self): + super(CSPDarkNet53, self).__init__() + + self.block = DarkBlock + self.stage_blocks = (1, 2, 8, 8, 4) + self.with_csp = True + self.inplanes = 32 + + self.backbone = nn.ModuleDict() + self.layer_names = [] + # First stem layer + self.backbone["conv1"] = nn.Conv2d(3, self.inplanes, kernel_size=3, padding=1, bias=False) + self.backbone["bn1"] = nn.BatchNorm2d(self.inplanes, eps=1e-4, momentum=0.03) + self.backbone["act1"] = nn.Mish(inplace=True) + + for i, num_blocks in enumerate(self.stage_blocks): + planes = 64 * 2 ** i + dilation = 1 + stride = 2 + layer = make_cspdark_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + is_csp_first_stage=True if i == 0 else False, + dilation=dilation + ) + layer = CrossStagePartialBlock( + self.inplanes, + planes, + stage_layers=layer, + is_csp_first_stage=True if i == 0 else False, + dilation=dilation, + stride=stride + ) + self.inplanes = planes + layer_name = 'layer{}'.format(i + 1) + self.backbone[layer_name]=layer + self.layer_names.append(layer_name) + + + def forward(self, x): + outputs = [] + x = self.backbone["conv1"](x) + x = self.backbone["bn1"](x) + x = self.backbone["act1"](x) + + for i, layer_name in enumerate(self.layer_names): + layer = self.backbone[layer_name] + x = layer(x) + outputs.append(x) + return outputs[-3:] # C3, C4, C5 + + +def cspdarknet53(pretrained=False): + """ + Create a CSPDarkNet. 
+ """ + model = CSPDarkNet53() + if pretrained: + print('Loading the pretrained model ...') + path_to_weight = os.path.dirname(os.path.abspath(__file__)) + '/weights/cspdarknet53/cspdarknet53.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("model") + # model state dict + model_state_dict = model.state_dict() + # check + for k in list(checkpoint_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(checkpoint_state_dict[k].shape) + if shape_model != shape_checkpoint: + checkpoint_state_dict.pop(k) + else: + print(k) + + model.load_state_dict(checkpoint_state_dict, strict=False) + return model + + +if __name__=='__main__': + img_size = 512 + input = torch.ones(1, 3, img_size, img_size) + + model = cspdarknet53(pretrained=True) + output = model(input) + for y in output: + print(y.size()) + print(output[-1]) + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet_tiny.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet_tiny.py new file mode 100644 index 0000000000..1d0ae5e8f1 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/cspdarknet_tiny.py @@ -0,0 +1,128 @@ +""" + This is a CSPDarkNet-53 with LaekyReLU. +""" +import os +import torch +import torch.nn as nn +import torch_npu + + +__all__ = ['cspdarkner53'] + + +class Conv(nn.Module): + def __init__(self, c1, c2, k, s=1, p=0, d=1, g=1, act=True): + super(Conv, self).__init__() + if act: + self.convs = nn.Sequential( + nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=False), + nn.BatchNorm2d(c2), + nn.LeakyReLU(0.1, inplace=True) + ) + else: + self.convs = nn.Sequential( + nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=False), + nn.BatchNorm2d(c2) + ) + + def forward(self, x): + return self.convs(x) + + +class ResidualBlock(nn.Module): + """ + basic residual block for CSP-Darknet + """ + def __init__(self, in_ch): + super(ResidualBlock, self).__init__() + self.conv1 = Conv(in_ch, in_ch, k=1) + self.conv2 = Conv(in_ch, in_ch, k=3, p=1, act=False) + self.act = nn.LeakyReLU(0.1, inplace=True) + + def forward(self, x): + h = self.conv2(self.conv1(x)) + out = self.act(x + h) + + return out + + +class CSPStage(nn.Module): + def __init__(self, c1, n=1): + super(CSPStage, self).__init__() + c_ = c1 // 2 # hidden channels + self.cv1 = Conv(c1, c_, k=1) + self.cv2 = Conv(c1, c_, k=1) + self.res_blocks = nn.Sequential(*[ResidualBlock(in_ch=c_) for _ in range(n)]) + self.cv3 = Conv(2 * c_, c1, k=1) + + def forward(self, x): + y1 = self.cv1(x) + y2 = self.res_blocks(self.cv2(x)) + + return self.cv3(torch.cat([y1, y2], dim=1)) + + +# CSPDarkNet-Tiny +class CSPDarknetTiny(nn.Module): + """ + CSPDarknet_Tiny. 
+ """ + def __init__(self): + super(CSPDarknetTiny, self).__init__() + + self.layer_1 = nn.Sequential( + Conv(3, 16, k=3, p=1), + Conv(16, 32, k=3, p=1, s=2), + CSPStage(c1=32, n=1) # p1/2 + ) + self.layer_2 = nn.Sequential( + Conv(32, 64, k=3, p=1, s=2), + CSPStage(c1=64, n=1) # P2/4 + ) + self.layer_3 = nn.Sequential( + Conv(64, 128, k=3, p=1, s=2), + CSPStage(c1=128, n=1) # P3/8 + ) + self.layer_4 = nn.Sequential( + Conv(128, 256, k=3, p=1, s=2), + CSPStage(c1=256, n=1) # P4/16 + ) + self.layer_5 = nn.Sequential( + Conv(256, 512, k=3, p=1, s=2), + CSPStage(c1=512, n=1) # P5/32 + ) + + + def forward(self, x): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + return c3, c4, c5 + + +def cspdarknet_tiny(pretrained=False, **kwargs): + """Constructs a CSPDarknet53 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = CSPDarknetTiny() + if pretrained: + print('Loading the pretrained model ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + checkpoint = torch.load(path_to_dir + '/weights/cspdarknet_tiny/cspdarknet_tiny.pth', map_location='cpu') + model.load_state_dict(checkpoint, strict=False) + return model + + +if __name__ == '__main__': + import time + net = cspdarknet_tiny(pretrained=True) + x = torch.randn(1, 3, 224, 224) + t0 = time.time() + y = net(x) + t1 = time.time() + print('Time: ', t1 - t0) \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/darknet.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/darknet.py new file mode 100644 index 0000000000..5ce99fd305 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/darknet.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import os +import torch_npu + + +__all__ = ['darknet53'] + + +class Conv_BN_LeakyReLU(nn.Module): + def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): + super(Conv_BN_LeakyReLU, self).__init__() + self.convs = nn.Sequential( + nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), + nn.BatchNorm2d(out_channels), + nn.LeakyReLU(0.1, inplace=True) + ) + + def forward(self, x): + return self.convs(x) + + +class resblock(nn.Module): + def __init__(self, ch, nblocks=1): + super().__init__() + self.module_list = nn.ModuleList() + for _ in range(nblocks): + resblock_one = nn.Sequential( + Conv_BN_LeakyReLU(ch, ch//2, 1), + Conv_BN_LeakyReLU(ch//2, ch, 3, padding=1) + ) + self.module_list.append(resblock_one) + + def forward(self, x): + for module in self.module_list: + x = module(x) + x + return x + + +class DarkNet_53(nn.Module): + """ + DarkNet-53. 
+ """ + def __init__(self, num_classes=1000): + super(DarkNet_53, self).__init__() + # stride = 2 + self.layer_1 = nn.Sequential( + Conv_BN_LeakyReLU(3, 32, 3, padding=1), + Conv_BN_LeakyReLU(32, 64, 3, padding=1, stride=2), + resblock(64, nblocks=1) + ) + # stride = 4 + self.layer_2 = nn.Sequential( + Conv_BN_LeakyReLU(64, 128, 3, padding=1, stride=2), + resblock(128, nblocks=2) + ) + # stride = 8 + self.layer_3 = nn.Sequential( + Conv_BN_LeakyReLU(128, 256, 3, padding=1, stride=2), + resblock(256, nblocks=8) + ) + # stride = 16 + self.layer_4 = nn.Sequential( + Conv_BN_LeakyReLU(256, 512, 3, padding=1, stride=2), + resblock(512, nblocks=8) + ) + # stride = 32 + self.layer_5 = nn.Sequential( + Conv_BN_LeakyReLU(512, 1024, 3, padding=1, stride=2), + resblock(1024, nblocks=4) + ) + + # self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + # self.fc = nn.Linear(1024, num_classes) + + def forward(self, x, targets=None): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + return c3, c4, c5 + + +def darknet53(pretrained=False, **kwargs): + """Constructs a darknet-53 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = DarkNet_53() + if pretrained: + try: + print('Loading the pretrained model ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + checkpoint = torch.load(path_to_dir + '/weights/darknet53/darknet53.pth', map_location='cpu') + model.load_state_dict(checkpoint, strict=False) + except: + print('The pretrained weight can not be found ...') + pass + return model diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/resnet.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/resnet.py new file mode 100644 index 0000000000..1f4df9b046 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/resnet.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torch_npu +import torch.nn.functional as F + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + 
+ return out + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = conv1x1(inplanes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = conv1x1(planes, planes * self.expansion) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class ResNet(nn.Module): + + def __init__(self, block, layers, zero_init_residual=False): + super(ResNet, self).__init__() + self.inplanes = 64 + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + c2 = self.layer1(x) + c3 = self.layer2(c2) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + + return c3, c4, c5 + +def resnet18(pretrained=False, **kwargs): + """Constructs a ResNet-18 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + if pretrained: + print('Loading the pretrained model ...') + # strict = False as we don't need fc layer params. + model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False) + return model + +def resnet34(pretrained=False, **kwargs): + """Constructs a ResNet-34 model. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + if pretrained: + print('Loading the pretrained model ...') + model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False) + return model + +def resnet50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + print('Loading the pretrained model ...') + model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False) + return model + +def resnet101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + if pretrained: + print('Loading the pretrained model ...') + model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False) + return model + +def resnet152(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + if pretrained: + print('Loading the pretrained model ...') + model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) + return model + +if __name__=='__main__': + #model = torchvision.models.resnet50() + print("found ", torch_npu.npu.device_count(), " GPU(s)") + device = torch.device("npu") + model = resnet101(detection=True).to(device) + print(model) + + input = torch.randn(1, 3, 512, 512).to(device) + output = model(input) \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/shufflenetv2.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/shufflenetv2.py new file mode 100644 index 0000000000..f7637cb60e --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/shufflenetv2.py @@ -0,0 +1,194 @@ +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torch_npu + + +model_urls = { + 'shufflenetv2_0.5x': 'https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth', + 'shufflenetv2_1.0x': 'https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth', + 'shufflenetv2_1.5x': None, + 'shufflenetv2_2.0x': None, +} + + +def channel_shuffle(x, groups): + # type: (torch.Tensor, int) -> torch.Tensor + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + + # reshape + x = x.view(batchsize, groups, + channels_per_group, height, width) + + x = torch.transpose(x, 1, 2).contiguous() + + # flatten + x = x.view(batchsize, -1, height, width) + + return x + + +class ShuffleV2Block(nn.Module): + def __init__(self, inp, oup, stride): + super(ShuffleV2Block, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else 
branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + +class ShuffleNetV2(nn.Module): + def __init__(self, + model_size='1.0x', + out_stages=(2, 3, 4), + with_last_conv=False, + kernal_size=3): + super(ShuffleNetV2, self).__init__() + print('model size is ', model_size) + + self.stage_repeats = [4, 8, 4] + self.model_size = model_size + self.out_stages = out_stages + self.with_last_conv = with_last_conv + self.kernal_size = kernal_size + if model_size == '0.5x': + self._stage_out_channels = [24, 48, 96, 192, 1024] + elif model_size == '1.0x': + self._stage_out_channels = [24, 116, 232, 464, 1024] + elif model_size == '1.5x': + self._stage_out_channels = [24, 176, 352, 704, 1024] + elif model_size == '2.0x': + self._stage_out_channels = [24, 244, 488, 976, 2048] + else: + raise NotImplementedError + + # building first layer + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + stage_names = ['stage{}'.format(i) for i in [2, 3, 4]] + for name, repeats, output_channels in zip( + stage_names, self.stage_repeats, self._stage_out_channels[1:]): + seq = [ShuffleV2Block(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(ShuffleV2Block(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + output_channels = self._stage_out_channels[-1] + + self._initialize_weights() + + + def _initialize_weights(self, pretrain=True): + print('init weights...') + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'first' in name: + nn.init.normal_(m.weight, 0, 0.01) + else: + nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1]) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + output = [] + for i in range(2, 5): + stage = getattr(self, 'stage{}'.format(i)) + x = stage(x) + if i in self.out_stages: + output.append(x) + + return tuple(output) + + +def shufflenetv2(model_size='1.0x', pretrained=False, 
**kwargs): + """Constructs a shufflenetv2 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ShuffleNetV2(model_size=model_size) + if pretrained: + print('Loading the pretrained model ...') + url = model_urls['shufflenetv2_{}'.format(model_size)] + print('=> loading pretrained model {}'.format(url)) + model.load_state_dict(model_zoo.load_url(url), strict=False) + + return model + + +if __name__ == "__main__": + model = shufflenetv2(model_size='0.5x', pretrained=True) + print(model) + test_data = torch.rand(5, 3, 320, 320) + c3, c4, c5 = model(test_data) + print(c3.size()) + print(c4.size()) + print(c5.size()) \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/vit.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/vit.py new file mode 100644 index 0000000000..73eb7383b8 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/vit.py @@ -0,0 +1,378 @@ +# -------------------------------------------------------- +# Based on BEiT, timm, DINO and DeiT code bases +# https://github.com/microsoft/unilm/tree/master/beit +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import os +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from functools import partial + +from timm.models.layers import drop_path, to_2tuple +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_ as __call_trunc_normal_ +import torch_npu + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + **kwargs + } + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
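+    With the timm `drop_path` used here, the residual branch of each sample is
+    zeroed with probability drop_prob during training (survivors are rescaled by
+    1 / (1 - drop_prob)); at inference time the module is a no-op.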
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values > 0: + self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) + return x + +# sin-cos position encoding +# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 +def get_sinusoid_encoding_table(n_position, d_hid): + ''' Sinusoid position encoding table ''' + # TODO: make it with torch instead of numpy + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + +def trunc_normal_(tensor, mean=0., std=1.): + __call_trunc_normal_(tensor, mean=mean, std=std, a=-std, b=std) + + +__all__ = [ + 'pretrain_mae_base_patch16_224', + 'pretrain_mae_large_patch16_224', +] + + +class PretrainVisionTransformerEncoder(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, + use_learnable_pos_emb=False): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + # TODO: Add the cls token + # self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_learnable_pos_emb: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + else: + # sine-cosine positional embeddings + self.pos_embed = 
get_sinusoid_encoding_table(num_patches, embed_dim) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + init_values=init_values) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if use_learnable_pos_emb: + trunc_normal_(self.pos_embed, std=.02) + + # trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + + x = x + self.pos_embed.type_as(x).to(x.device).clone().detach() + + B, _, C = x.shape + x = x.reshape(B, -1, C) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + +class PretrainVisionTransformer(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, + img_size=224, + patch_size=16, + encoder_in_chans=3, + encoder_num_classes=0, + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + init_values=0., + use_learnable_pos_emb=False, + num_classes=0, # avoid the error from create_fn in timm + in_chans=0, # avoid the error from create_fn in timm + ): + super().__init__() + self.encoder = PretrainVisionTransformerEncoder( + img_size=img_size, + patch_size=patch_size, + in_chans=encoder_in_chans, + num_classes=encoder_num_classes, + embed_dim=encoder_embed_dim, + depth=encoder_depth, + num_heads=encoder_num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + norm_layer=norm_layer, + init_values=init_values, + use_learnable_pos_emb=use_learnable_pos_emb) + + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token', 'mask_token'} + + def forward(self, x): + fmap_list = [] + x = self.encoder(x) # [B, N, C] + + fmp_h = self.encoder.patch_embed.img_size[0] // self.encoder.patch_embed.patch_size[0] + fmp_w = self.encoder.patch_embed.img_size[1] // self.encoder.patch_embed.patch_size[1] + # [B, N, C] -> [B, C, N] -> 
[B, C, H, W] + x = x.permute(0, 2, 1).contiguous().view(x.size(0), x.size(-1), fmp_h, fmp_w) + fmap_list.append(x) + + return fmap_list + + +@register_model +def vit_base_patch16_224(img_size=224, pretrained=False, **kwargs): + model = PretrainVisionTransformer( + img_size=img_size, + patch_size=16, + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + encoder_num_classes=0, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + **kwargs) + model.default_cfg = _cfg() + if pretrained: + try: + print('Loading the pretrained weights ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + checkpoint = torch.load(path_to_dir + '/weights/vit/pretrain_mae_vit_base_mask_0.75_400e.pth', map_location='cpu') + model.load_state_dict(checkpoint['model'], strict=False) + except: + print('The pretrained weight can not be found ...') + pass + return model + +if __name__ == '__main__': + x = torch.ones(2, 3, 224, 224) + model = vit_base_patch16_224(pretrained=True) + outputs = model(x) + for y in outputs: + print(y.size()) + print(y) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/weights/README.md b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/weights/README.md new file mode 100644 index 0000000000..ce687a1c95 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/weights/README.md @@ -0,0 +1,15 @@ +# darknet19, darknet53, darknet-tiny, darknet-light +darknet-tiny is designed by myself. It is a very simple and lightweight backbone. + +darknet-light is same to the backbone used in official TinyYOLOv3. + +For researchers in China, you can download them from BaiduYunDisk: + +link:https://pan.baidu.com/s/1Rm87Fcj1RXZFmeTUrDWANA + +password:qgzn + + +Also, you can download them from Google Drive: + +link: https://drive.google.com/drive/folders/15saMtvYiz3yfFNu5EnC7GSltEAvTImMB?usp=sharing diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/yolox_backbone.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/yolox_backbone.py new file mode 100644 index 0000000000..28f9ac719d --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/backbone/yolox_backbone.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
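+# CSPDarknet as used by YOLOX: a Focus stem followed by four "dark" stages.
+# dep_mul / wid_mul scale depth and width per model size; e.g. the "s" variant
+# (dep_mul=0.33, wid_mul=0.5) gives base_channels=32 and base_depth=1, so the
+# returned C3/C4/C5 maps carry 128/256/512 channels at strides 8/16/32.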
+import os +import torch +import torch.nn as nn +import torch_npu + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name="silu", inplace=True): + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__( + self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu" + ): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__( + self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu" + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act="silu", + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. 
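+            shortcut (bool): add the residual connection inside each Bottleneck.
+            expansion (float): ratio of hidden channels to out_channels. Default value: 0.5.
+            depthwise (bool): use depthwise separable 3x3 convs in the Bottlenecks.
+            act (str): activation name, one of "silu", "relu" or "lrelu".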
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act + ) + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +# CSPDarkNet +class CSPDarknet(nn.Module): + def __init__( + self, + dep_mul, + wid_mul, + out_features=("dark3", "dark4", "dark5"), + depthwise=False, + act="silu", + ): + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + self.stem = Focus(3, base_channels, ksize=3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck(base_channels * 16, base_channels * 16, activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + ), + ) + + + def freeze_stage(self): + # Because the YOLOX-Backbone has been trained on COCO, we freeze all stages to save computation. 
+ print('freeze all stage of YOLOX-Backbone ...') + for m in self.parameters(): + m.requires_grad = False + + + def forward(self, x): + outputs = {} + c1 = self.stem(x) + c2 = self.dark2(c1) + c3 = self.dark3(c2) + c4 = self.dark4(c3) + c5 = self.dark5(c4) + + return c3, c4, c5 + + +def yolox_cspdarknet_s(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=0.33, wid_mul=0.5, depthwise=False, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_s ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_s.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +def yolox_cspdarknet_m(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=0.67, wid_mul=0.75, depthwise=False, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_m ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_m.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +def yolox_cspdarknet_l(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=1.0, wid_mul=1.0, depthwise=False, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_l ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_l.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +def yolox_cspdarknet_x(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=1.33, wid_mul=1.25, depthwise=False, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_x ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_x.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +def yolox_cspdarknet_tiny(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=0.33, wid_mul=0.375, depthwise=False, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_tiny ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_tiny.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') + backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +def yolox_cspdarknet_nano(pretrained=False, freeze=False): + # build backbone + backbone = CSPDarknet(dep_mul=0.33, wid_mul=0.25, depthwise=True, act='silu') + + # load weight + if pretrained: + print('Loading pretrained cspdarknet_nano ...') + path_to_dir = os.path.dirname(os.path.abspath(__file__)) + path_to_weight = path_to_dir + '/weights/yolox_backbone/yolox_cspdarknet_nano.pth' + checkpoint = torch.load(path_to_weight, map_location='cpu') 
+ backbone.load_state_dict(checkpoint) + + # freeze stage + if freeze: + backbone.freeze_stage() + + return backbone + + +if __name__ == '__main__': + import time + net = yolox_cspdarknet_nano(pretrained=True) + x = torch.randn(1, 3, 224, 224) + t0 = time.time() + outputs = net(x) + t1 = time.time() + print('Time: ', t1 - t0) + for y in outputs: + print(y.shape) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/bottleneck_csp.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/bottleneck_csp.py new file mode 100644 index 0000000000..b246d68e83 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/bottleneck_csp.py @@ -0,0 +1,30 @@ +import torch +import torch.nn as nn +from .conv import Conv +import torch_npu + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__(self, c1, c2, shortcut=True, d=1, e=0.5, depthwise=False, act='lrelu'): # ch_in, ch_out, shortcut, groups, expansion + super(Bottleneck, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, k=1, act=act) + self.cv2 = Conv(c_, c2, k=3, p=d, d=d, act=act, depthwise=depthwise) + self.add = shortcut and c1 == c2 + + def forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class BottleneckCSP(nn.Module): + def __init__(self, c1, c2, n=1, shortcut=True, e=0.5, depthwise=False, act='lrelu'): + super(BottleneckCSP, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, k=1, act=act) + self.cv2 = Conv(c1, c_, k=1, act=act) + self.cv3 = Conv(2 * c_, c2, k=1, act=act) + self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, e=1.0, depthwise=depthwise, act=act) for _ in range(n)]) + + def forward(self, x): + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1)) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/conv.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/conv.py new file mode 100644 index 0000000000..4811d455f9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/conv.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn +import torch_npu + + +def get_activation(name="lrelu", inplace=True): + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name is None: + module = nn.Identity() + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +# Basic conv layer +class Conv(nn.Module): + def __init__(self, c1, c2, k=1, p=0, s=1, d=1, g=1, act='lrelu', depthwise=False, bias=False): + super(Conv, self).__init__() + if depthwise: + assert c1 == c2 + self.convs = nn.Sequential( + nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=c1, bias=bias), + nn.BatchNorm2d(c2), + get_activation(name=act), + nn.Conv2d(c2, c2, kernel_size=1, bias=bias), + nn.BatchNorm2d(c2), + get_activation(name=act) + ) + else: + self.convs = nn.Sequential( + nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias), + nn.BatchNorm2d(c2), + get_activation(name=act) + ) + + def forward(self, x): + return self.convs(x) + + +# ConvBlocks +class ConvBlocks(nn.Module): + def __init__(self, c1, c2, act='lrelu'): # in_channels, inner_channels + super().__init__() + c_ = c2 *2 
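+        # YOLOv3-style 5-conv stack: alternate 1x1 convs (c2 channels) and
+        # 3x3 convs (c_ = 2 * c2 channels) before the detection head.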
+ self.convs = nn.Sequential( + Conv(c1, c2, k=1, act=act), + Conv(c2, c_, k=3, p=1, act=act), + Conv(c_, c2, k=1, act=act), + Conv(c2, c_, k=3, p=1, act=act), + Conv(c_, c2, k=1, act=act) + ) + + def forward(self, x): + return self.convs(x) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/upsample.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/upsample.py new file mode 100644 index 0000000000..3e78bc8407 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/basic/upsample.py @@ -0,0 +1,20 @@ +import torch +import torch.nn as nn +import torch_npu + + +class UpSample(nn.Module): + def __init__(self, size=None, scale_factor=None, mode='nearest', align_corner=None): + super(UpSample, self).__init__() + self.size = size + self.scale_factor = scale_factor + self.mode = mode + self.align_corner = align_corner + + def forward(self, x): + return torch.nn.functional.interpolate(input=x, + size=self.size, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corner + ) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/head/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/head/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/head/coupled_head.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/head/coupled_head.py new file mode 100644 index 0000000000..777a5721b3 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/head/coupled_head.py @@ -0,0 +1,100 @@ +import torch +import torch.nn as nn + +from ..basic.conv import Conv +import torch_npu + + +class CoupledHead(nn.Module): + def __init__(self, + in_dim=[256, 512, 1024], + stride=[8, 16, 32], + kernel_size=3, + padding=1, + width=1.0, + num_classes=80, + num_anchors=3, + depthwise=False, + act='silu', + init_bias=True, + center_sample=False): + super().__init__() + self.num_classes = num_classes + self.num_anchors = num_anchors + self.width = width + self.stride = stride + self.center_sample = center_sample + + self.head_feat = nn.ModuleList() + self.head_pred = nn.ModuleList() + + for c in in_dim: + head_dim = int(c * width) + self.head_feat.append( + nn.Sequential( + Conv(head_dim, head_dim, k=kernel_size, p=padding, act=act, depthwise=depthwise), + Conv(head_dim, head_dim, k=kernel_size, p=padding, act=act, depthwise=depthwise), + ) + ) + self.head_pred.append( + nn.Conv2d(head_dim, num_anchors * (1 + num_classes + 4), kernel_size=1) + ) + + if init_bias: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) + for head_pred in self.head_pred: + nn.init.constant_(head_pred.bias[..., :self.num_anchors], bias_value) + + + def forward(self, features, grid_cell=None, anchors_wh=None): + """ + features: (List of Tensor) of multiple feature maps + """ + B = features[0].size(0) + obj_preds = [] + cls_preds = [] + box_preds = [] + for i in range(len(features)): + feat = features[i] + head_feat = self.head_feat[i](feat) + head_pred = self.head_pred[i](head_feat) + # obj_pred / cls_pred / reg_pred + obj_pred = head_pred[:, :self.num_anchors, :, :] + cls_pred = head_pred[:, self.num_anchors:self.num_anchors*(1+self.num_classes), :, :] + reg_pred = head_pred[:, self.num_anchors*(1+self.num_classes):, :, :] + + # [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_preds.append(obj_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)) + # [[B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_preds.append(cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)) + # [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_anchors, 4) + + # decode box + ## txty -> xy + if self.center_sample: + xy_pred = (grid_cell[i] + reg_pred[..., :2].sigmoid() * 2.0 - 1.0) * self.stride[i] + else: + xy_pred = (grid_cell[i] + reg_pred[..., :2].sigmoid()) * self.stride[i] + ## twth -> wh + if anchors_wh is not None: + wh_pred = reg_pred[..., 2:].exp() * anchors_wh[i] + else: + wh_pred = reg_pred[..., 2:].exp() * self.stride[i] + ## xywh -> x1y1x2y2 + x1y1_pred = xy_pred - wh_pred * 0.5 + x2y2_pred = xy_pred + wh_pred * 0.5 + box_preds.append(torch.cat([x1y1_pred, x2y2_pred], dim=-1).view(B, -1, 4)) + + obj_preds = torch.cat(obj_preds, dim=1) # [B, N, 1] + cls_preds = torch.cat(cls_preds, dim=1) # [B, N, C] + box_preds = torch.cat(box_preds, dim=1) # [B, N, 4] + + return obj_preds, cls_preds, box_preds diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/head/decoupled_head.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/head/decoupled_head.py new file mode 100644 index 0000000000..0e0ace6a70 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/head/decoupled_head.py @@ -0,0 +1,120 @@ +import torch +import torch.nn as nn + +from ..basic.conv import Conv +import torch_npu + + +class DecoupledHead(nn.Module): + def __init__(self, + in_dim=[256, 512, 1024], + stride=[8, 16, 32], + head_dim=256, + kernel_size=3, + padding=1, + width=1.0, + num_classes=80, + num_anchors=3, + depthwise=False, + act='silu', + init_bias=True, + center_sample=False): + super().__init__() + self.num_classes = num_classes + self.num_anchors = num_anchors + self.head_dim = int(head_dim * width) + self.width = width + self.stride = stride + self.center_sample = center_sample + + self.input_proj = nn.ModuleList() + self.cls_feat = nn.ModuleList() + self.reg_feat = nn.ModuleList() + self.obj_pred = nn.ModuleList() + self.cls_pred = nn.ModuleList() + self.reg_pred = nn.ModuleList() + + for c in in_dim: + self.input_proj.append( + Conv(c, self.head_dim, k=1, act=act) + ) + self.cls_feat.append( + nn.Sequential( + Conv(self.head_dim, self.head_dim, k=kernel_size, p=padding, act=act, depthwise=depthwise), + Conv(self.head_dim, self.head_dim, k=kernel_size, p=padding, act=act, depthwise=depthwise) + ) + ) + self.reg_feat.append( + nn.Sequential( + Conv(self.head_dim, self.head_dim, k=kernel_size, p=padding, act=act, depthwise=depthwise), + Conv(self.head_dim, self.head_dim, k=kernel_size, p=padding, act=act, 
depthwise=depthwise) + ) + ) + self.obj_pred.append( + nn.Conv2d(self.head_dim, num_anchors * 1, kernel_size=1) + ) + self.cls_pred.append( + nn.Conv2d(self.head_dim, num_anchors * num_classes, kernel_size=1) + ) + self.reg_pred.append( + nn.Conv2d(self.head_dim, num_anchors * 4, kernel_size=1) + ) + + if init_bias: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) + for obj_pred in self.obj_pred: + nn.init.constant_(obj_pred.bias, bias_value) + + + def forward(self, features, grid_cell=None, anchors_wh=None): + """ + features: (List of Tensor) of multiple feature maps + """ + B = features[0].size(0) + obj_preds = [] + cls_preds = [] + box_preds = [] + for i in range(len(features)): + feat = features[i] + feat = self.input_proj[i](feat) + cls_feat = self.cls_feat[i](feat) + reg_feat = self.reg_feat[i](feat) + obj_pred = self.obj_pred[i](reg_feat) + cls_pred = self.cls_pred[i](cls_feat) + reg_pred = self.reg_pred[i](reg_feat) + + # [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_preds.append(obj_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)) + # [[B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_preds.append(cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)) + # [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_anchors, 4) + + # decode box + ## txty -> xy + if self.center_sample: + xy_pred = (grid_cell[i] + reg_pred[..., :2].sigmoid() * 2.0 - 1.0) * self.stride[i] + else: + xy_pred = (grid_cell[i] + reg_pred[..., :2].sigmoid()) * self.stride[i] + ## twth -> wh + if anchors_wh is not None: + wh_pred = reg_pred[..., 2:].exp() * anchors_wh[i] + else: + wh_pred = reg_pred[..., 2:].exp() * self.stride[i] + ## xywh -> x1y1x2y2 + x1y1_pred = xy_pred - wh_pred * 0.5 + x2y2_pred = xy_pred + wh_pred * 0.5 + box_preds.append(torch.cat([x1y1_pred, x2y2_pred], dim=-1).view(B, -1, 4)) + + obj_preds = torch.cat(obj_preds, dim=1) # [B, N, 1] + cls_preds = torch.cat(cls_preds, dim=1) # [B, N, C] + box_preds = torch.cat(box_preds, dim=1) # [B, N, 4] + + return obj_preds, cls_preds, box_preds diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/__init__.py new file mode 100644 index 0000000000..39c4c1a379 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/__init__.py @@ -0,0 +1,23 @@ +from .spp import SPPBlock, SPPBlockCSP, SPPBlockDW +from .dilated_encoder import DilatedEncoder +from ..basic.conv import ConvBlocks + + +def build_neck(model, in_ch, out_ch, act='lrelu'): + if model == 'conv_blocks': + print("Neck: ConvBlocks") + neck = ConvBlocks(c1=in_ch, c2=out_ch, act=act) + elif model == 'spp': + print("Neck: SPP") + neck = SPPBlock(c1=in_ch, c2=out_ch, act=act) + elif model == 'spp-csp': + print("Neck: SPP-CSP") + neck = SPPBlockCSP(c1=in_ch, c2=out_ch, act=act) + elif model == 'spp-dw': + print("Neck: SPP-DW") + neck = SPPBlockDW(c1=in_ch, c2=out_ch, act=act) + elif model == 'dilated_encoder': + print("Neck: Dilated Encoder") + neck = DilatedEncoder(c1=in_ch, c2=out_ch, act=act) + + return neck diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/dilated_encoder.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/dilated_encoder.py new file mode 100644 index 0000000000..e544a997d2 --- /dev/null +++ 
b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/dilated_encoder.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +from ..basic.conv import Conv +import torch_npu + + +# Dilated Encoder +class DilatedBottleneck(nn.Module): + def __init__(self, c, d=1, e=0.5, act='lrelu'): + super(DilatedBottleneck, self).__init__() + c_ = int(c * e) + self.branch = nn.Sequential( + Conv(c, c_, k=1, act=act), + Conv(c_, c_, k=3, p=d, d=d, act=act), + Conv(c_, c, k=1, act=act) + ) + + def forward(self, x): + return x + self.branch(x) + + +class DilatedEncoder(nn.Module): + """ DilateEncoder """ + def __init__(self, c1, c2, act='lrelu', dilation_list=[2, 4, 6, 8]): + super(DilatedEncoder, self).__init__() + self.projector = nn.Sequential( + Conv(c1, c2, k=1, act=None), + Conv(c2, c2, k=3, p=1, act=None) + ) + encoders = [] + for d in dilation_list: + encoders.append(DilatedBottleneck(c=c2, d=d, act=act)) + self.encoders = nn.Sequential(*encoders) + + def forward(self, x): + x = self.projector(x) + x = self.encoders(x) + + return x diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/fpn.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/fpn.py new file mode 100644 index 0000000000..437d6c8980 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/fpn.py @@ -0,0 +1,120 @@ +import torch +import torch.nn as nn +from ..basic.conv import Conv, ConvBlocks +from ..basic.upsample import UpSample +from ..basic.bottleneck_csp import BottleneckCSP +import torch_npu + + +# YoloFPN +class YoloFPN(nn.Module): + def __init__(self, in_dim=[512, 1024, 2048]): + super(YoloFPN, self).__init__() + c3, c4, c5 = in_dim + # head + # P3/8-small + self.head_convblock_0 = ConvBlocks(c5, c5//2) + self.head_conv_0 = Conv(c5//2, c4//2, k=1) + self.head_upsample_0 = UpSample(scale_factor=2) + self.head_conv_1 = Conv(c5//2, c5, k=3, p=1) + + # P4/16-medium + self.head_convblock_1 = ConvBlocks(c4 + c4//2, c4//2) + self.head_conv_2 = Conv(c4//2, c3//2, k=1) + self.head_upsample_1 = UpSample(scale_factor=2) + self.head_conv_3 = Conv(c4//2, c4, k=3, p=1) + + # P8/32-large + self.head_convblock_2 = ConvBlocks(c3 + c3//2, c3//2) + self.head_conv_4 = Conv(c3//2, c3, k=3, p=1) + + + def forward(self, features): + c3, c4, c5 = features + + # p5/32 + p5 = self.head_convblock_0(c5) + p5_up = self.head_upsample_0(self.head_conv_0(p5)) + p5 = self.head_conv_1(p5) + + # p4/16 + p4 = self.head_convblock_1(torch.cat([c4, p5_up], dim=1)) + p4_up = self.head_upsample_1(self.head_conv_2(p4)) + p4 = self.head_conv_3(p4) + + # P3/8 + p3 = self.head_convblock_2(torch.cat([c3, p4_up], dim=1)) + p3 = self.head_conv_4(p3) + + return [p3, p4, p5] + + +# YoloPaFPN +class YoloPaFPN(nn.Module): + def __init__(self, + in_dim=[256, 512, 1024], + depth=1.0, + depthwise=False, + act='silu'): + super(YoloPaFPN, self).__init__() + c3, c4, c5 = in_dim + nblocks = int(3 * depth) + self.head_conv_0 = Conv(c5, c5//2, k=1, act=act) # 10 + self.head_upsample_0 = UpSample(scale_factor=2) + self.head_csp_0 = BottleneckCSP(c4 + c5//2, c4, n=nblocks, shortcut=False, depthwise=depthwise, act=act) + + # P3/8-small + self.head_conv_1 = Conv(c4, c4//2, k=1, act=act) # 14 + self.head_upsample_1 = UpSample(scale_factor=2) + self.head_csp_1 = BottleneckCSP(c3 + c4//2, c3, n=nblocks, shortcut=False, depthwise=depthwise, act=act) + + # P4/16-medium + self.head_conv_2 = Conv(c3, c3, k=3, p=1, s=2, depthwise=depthwise, act=act) + self.head_csp_2 = BottleneckCSP(c3 + c4//2, c4, n=nblocks, shortcut=False, depthwise=depthwise, act=act) + + # 
P8/32-large + self.head_conv_3 = Conv(c4, c4, k=3, p=1, s=2, depthwise=depthwise, act=act) + self.head_csp_3 = BottleneckCSP(c4 + c5//2, c5, n=nblocks, shortcut=False, depthwise=depthwise) + + + def forward(self, features): + c3, c4, c5 = features + + c6 = self.head_conv_0(c5) + c7 = self.head_upsample_0(c6) # s32->s16 + c8 = torch.cat([c7, c4], dim=1) + c9 = self.head_csp_0(c8) + # P3/8 + c10 = self.head_conv_1(c9) + c11 = self.head_upsample_1(c10) # s16->s8 + c12 = torch.cat([c11, c3], dim=1) + c13 = self.head_csp_1(c12) # to det + # p4/16 + c14 = self.head_conv_2(c13) + c15 = torch.cat([c14, c10], dim=1) + c16 = self.head_csp_2(c15) # to det + # p5/32 + c17 = self.head_conv_3(c16) + c18 = torch.cat([c17, c6], dim=1) + c19 = self.head_csp_3(c18) # to det + + return [c13, c16, c19] # [P3, P4, P5] + + +# build Head +def build_fpn(model_name='yolofpn', + in_dim=[256, 512, 1024], + depth=1.0, + depthwise=False, + act='silu'): + if model_name == 'yolofpn': + print("Head: YoloFPN ...") + return YoloFPN(in_dim) + + elif model_name == 'yolopafpn': + print('Head: YoloPaFPN ...') + return YoloPaFPN(in_dim, depth, depthwise, act) + + else: + print("Unknown FPN version ...") + exit() diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/spp.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/spp.py new file mode 100644 index 0000000000..652124089c --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/neck/spp.py @@ -0,0 +1,95 @@ +import torch +import torch.nn as nn + +from ..basic.conv import Conv +import torch_npu + + +# Spatial Pyramid Pooling +class SPP(nn.Module): + """ + Spatial Pyramid Pooling + """ + def __init__(self, c1, c2, e=0.5, kernel_sizes=[5, 9, 13], act='lrelu'): + super(SPP, self).__init__() + c_ = int(c1 * e) + self.cv1 = Conv(c1, c_, k=1, act=act) + self.m = nn.ModuleList( + [ + nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + for k in kernel_sizes + ] + ) + + self.cv2 = Conv(c_*(len(kernel_sizes) + 1), c2, k=1, act=act) + + def forward(self, x): + x = self.cv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.cv2(x) + + return x + + +class SPPBlock(nn.Module): + """ + Spatial Pyramid Pooling Block + """ + def __init__(self, c1, c2, e=0.5, kernel_sizes=[5, 9, 13], act='lrelu'): + super(SPPBlock, self).__init__() + self.m = nn.Sequential( + Conv(c1, c1//2, k=1, act=act), + Conv(c1//2, c1, k=3, p=1, act=act), + SPP(c1, c1//2, e=e, kernel_sizes=kernel_sizes, act=act), + Conv(c1//2, c1, k=3, p=1, act=act), + Conv(c1, c2, k=1, act=act) + ) + + + def forward(self, x): + x = self.m(x) + + return x + + +class SPPBlockCSP(nn.Module): + """ + CSP Spatial Pyramid Pooling Block + """ + def __init__(self, c1, c2, e=0.5, kernel_sizes=[5, 9, 13], act='lrelu'): + super(SPPBlockCSP, self).__init__() + self.cv1 = Conv(c1, c1//2, k=1, act=act) + self.cv2 = Conv(c1, c1//2, k=1, act=act) + self.m = nn.Sequential( + Conv(c1//2, c1//2, k=3, p=1, act=act), + SPP(c1//2, c1//2, e=e, kernel_sizes=kernel_sizes, act=act), + Conv(c1//2, c1//2, k=3, p=1, act=act) + ) + self.cv3 = Conv(c1, c2, k=1, act=act) + + + def forward(self, x): + x1 = self.cv1(x) + x2 = self.cv2(x) + x3 = self.m(x2) + y = self.cv3(torch.cat([x1, x3], dim=1)) + + return y + + +class SPPBlockDW(nn.Module): + """ + Depth-wise Spatial Pyramid Pooling Block + """ + def __init__(self, c1, c2, e=0.5, kernel_sizes=[5, 9, 13], act='lrelu'): + super(SPPBlockDW, self).__init__() + self.m = nn.Sequential( + Conv(c1, c1//2, k=1, act=act), + Conv(c1//2, c1//2, k=3, p=1, g=c1//2, act=act), + 
SPP(c1//2, c1//2, e=e, kernel_sizes=kernel_sizes, act=act), + Conv(c1//2, c1//2, k=3, p=1, g=c1//2, act=act), + Conv(c1//2, c2, k=1, act=act) + ) + + def forward(self, x): + return self.m(x) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/__init__.py new file mode 100644 index 0000000000..6a154046dc --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/__init__.py @@ -0,0 +1,92 @@ +from .yolov1 import YOLOv1 +from .yolov2 import YOLOv2 +from .yolov3 import YOLOv3 +from .yolov4 import YOLOv4 +from .yolo_tiny import YOLOTiny +from .yolo_nano import YOLONano + + +# build YOLO detector +def build_model(args, cfg, device, num_classes=80, trainable=False): + + if args.model == 'yolov1': + print('Build YOLOv1 ...') + model = YOLOv1(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolov2': + print('Build YOLOv2 ...') + model = YOLOv2(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolov3': + print('Build YOLOv3 ...') + model = YOLOv3(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolov3_spp': + print('Build YOLOv3 with SPP ...') + model = YOLOv3(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolov3_de': + print('Build YOLOv3 with DilatedEncoder ...') + model = YOLOv3(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolov4': + print('Build YOLOv4 ...') + model = YOLOv4(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolo_tiny': + print('Build YOLO-Tiny ...') + model = YOLOTiny(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + elif args.model == 'yolo_nano': + print('Build YOLO-Nano ...') + model = YOLONano(cfg=cfg, + device=device, + img_size=args.img_size, + num_classes=num_classes, + trainable=trainable, + conf_thresh=args.conf_thresh, + nms_thresh=args.nms_thresh, + center_sample=args.center_sample) + return model diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_nano.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_nano.py new file mode 100644 index 0000000000..fbe8fb99ab --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_nano.py @@ -0,0 +1,340 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from ..backbone import build_backbone +from ..neck.spp import SPP +from ..basic.conv import Conv +from utils import box_ops 
+import torch_npu + + +class YOLONano(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=640, + num_classes=80, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.60, + center_sample=False): + super(YOLONano, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg["backbone"], + pretrained=trainable) + self.stride = strides + anchor_size = cfg["anchor_size"] + self.anchor_size = torch.tensor(anchor_size).reshape(len(self.stride), len(anchor_size) // 3, 2).float() + self.num_anchors = self.anchor_size.size(1) + c3, c4, c5 = feature_channels + + # build grid cell + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + # neck + self.neck = SPP(c5, c5) + + # FPN+PAN + self.conv1x1_0 = Conv(c3, 96, k=1) + self.conv1x1_1 = Conv(c4, 96, k=1) + self.conv1x1_2 = Conv(c5, 96, k=1) + + self.smooth_0 = Conv(96, 96, k=3, p=1) + self.smooth_1 = Conv(96, 96, k=3, p=1) + self.smooth_2 = Conv(96, 96, k=3, p=1) + self.smooth_3 = Conv(96, 96, k=3, p=1) + + # det head + self.head_conv_1 = nn.Sequential( + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1), + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1) + ) + self.head_conv_2 = nn.Sequential( + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1), + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1) + ) + self.head_conv_3 = nn.Sequential( + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1), + Conv(96, 96, k=3, p=1, g=96), + Conv(96, 96, k=1) + ) + + # det conv + self.head_det_1 = nn.Conv2d(96, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_2 = nn.Conv2d(96, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_3 = nn.Conv2d(96, self.num_anchors * (1 + self.num_classes + 4), 1) + + if self.trainable: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) + nn.init.constant_(self.head_det_1.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_2.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_3.bias[..., :self.num_anchors], bias_value) + + + def create_grid(self, img_size): + total_grid_xy = [] + total_anchor_wh = [] + w, h = img_size, img_size + for ind, s in enumerate(self.stride): + # generate grid cells + fmp_w, fmp_h = w // s, h // s + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 1, 2] + grid_xy = grid_xy[None, :, None, :].to(self.device) + # [1, HW, 1, 2] + anchor_wh = self.anchor_size[ind].repeat(fmp_h*fmp_w, 1, 1).unsqueeze(0).to(self.device) + + total_grid_xy.append(grid_xy) + total_anchor_wh.append(anchor_wh) + + return total_grid_xy, total_anchor_wh + + + def set_grid(self, img_size): + self.img_size = img_size + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + + def nms(self, dets, scores): + """"Pure Python NMS YOLOv4.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) # the size of bbox + order = scores.argsort()[::-1] # sort bounding boxes by decreasing order + + keep = [] # store the final bounding boxes + while order.size > 0: + i = order[0] #the index of the bbox with highest confidence + keep.append(i) #save it to keep + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (N, 4), bsize = 1 + scores: (N, C), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=np.int) + for i in range(self.num_classes): + inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + @torch.no_grad() + def inference_single_image(self, x): + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # neck + c5 = self.neck(c5) + + # head + p3 = self.conv1x1_0(c3) + p4 = self.conv1x1_1(c4) + p5 = self.conv1x1_2(c5) + + # top-down + p4 = self.smooth_0(p4 + F.interpolate(p5, scale_factor=2.0)) + p3 = self.smooth_1(p3 + F.interpolate(p4, scale_factor=2.0)) + + # bottom-up + p4 = self.smooth_2(p4 + F.interpolate(p3, scale_factor=0.5)) + p5 = self.smooth_3(p5 + F.interpolate(p4, scale_factor=0.5)) + + # det head + pred_s = self.head_det_1(self.head_conv_1(p3))[0] + pred_m = self.head_det_2(self.head_conv_2(p4))[0] + pred_l = self.head_det_3(self.head_conv_3(p5))[0] + + preds = [pred_s, pred_m, pred_l] + 
obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [KA*(1 + C + 4), H, W] -> [KA*1, H, W] -> [H, W, KA*1] -> [HW*KA, 1] + obj_pred_i = pred[:KA, :, :].permute(1, 2, 0).contiguous().view(-1, 1) + # [KA*(1 + C + 4), H, W] -> [KA*C, H, W] -> [H, W, KA*C] -> [HW*KA, C] + cls_pred_i = pred[KA:KA*(1+C), :, :].permute(1, 2, 0).contiguous().view(-1, C) + # [KA*(1 + C + 4), H, W] -> [KA*4, H, W] -> [H, W, KA*4] -> [HW, KA, 4] + reg_pred_i = pred[KA*(1+C):, :, :].permute(1, 2, 0).contiguous().view(-1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[None, ..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1)[0].view(-1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=0) + cls_pred = torch.cat(cls_pred_list, dim=0) + box_pred = torch.cat(box_pred_list, dim=0) + + # normalize bbox + bboxes = torch.clamp(box_pred / self.img_size, 0., 1.) + + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # neck + c5 = self.neck(c5) + + p3 = self.conv1x1_0(c3) + p4 = self.conv1x1_1(c4) + p5 = self.conv1x1_2(c5) + + # top-down + p4 = self.smooth_0(p4 + F.interpolate(p5, scale_factor=2.0)) + p3 = self.smooth_1(p3 + F.interpolate(p4, scale_factor=2.0)) + + # bottom-up + p4 = self.smooth_2(p4 + F.interpolate(p3, scale_factor=0.5)) + p5 = self.smooth_3(p5 + F.interpolate(p4, scale_factor=0.5)) + + # det head + pred_s = self.head_det_1(self.head_conv_1(p3)) + pred_m = self.head_det_2(self.head_conv_2(p4)) + pred_l = self.head_det_3(self.head_conv_3(p5)) + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [B, KA*(1 + C + 4), H, W] -> [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_pred_i = pred[:, :KA, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, 1) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_pred_i = pred[:, KA:KA*(1+C), :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, C) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred_i = pred[:, KA*(1+C):, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, 
x2y2_pred_i], dim=-1).view(B, -1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=1) + cls_pred = torch.cat(cls_pred_list, dim=1) + box_pred = torch.cat(box_pred_list, dim=1) + + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # giou: [B, HW,] + giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + + # we set giou as the target of the objectness + targets = torch.cat([0.5 * (giou_pred[..., None].clone().detach() + 1.0), targets], dim=-1) + + return obj_pred, cls_pred, giou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_tiny.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_tiny.py new file mode 100644 index 0000000000..42bec87756 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolo_tiny.py @@ -0,0 +1,335 @@ +import numpy as np +import torch +import torch.nn as nn + +from utils import box_ops + +from ..backbone import build_backbone +from ..neck import build_neck +from ..basic.conv import Conv +from ..basic.upsample import UpSample +from ..basic.bottleneck_csp import BottleneckCSP +import torch_npu + + +class YOLOTiny(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=640, + num_classes=80, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.60, + center_sample=False): + super(YOLOTiny, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg['backbone'], pretrained=trainable) + self.stride = strides + anchor_size = cfg["anchor_size"] + self.anchor_size = torch.tensor(anchor_size).reshape(len(self.stride), len(anchor_size) // 3, 2).float() + self.num_anchors = self.anchor_size.size(1) + c3, c4, c5 = feature_channels + + # build grid cell + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + # head + self.head_conv_0 = build_neck(model=cfg["neck"], in_ch=c5, out_ch=c5//2) # 10 + self.head_upsample_0 = UpSample(scale_factor=2) + self.head_csp_0 = BottleneckCSP(c4 + c5//2, c4, n=1, shortcut=False) + + # P3/8-small + self.head_conv_1 = Conv(c4, c4//2, k=1) # 14 + self.head_upsample_1 = UpSample(scale_factor=2) + self.head_csp_1 = BottleneckCSP(c3 + c4//2, c3, n=1, shortcut=False) + + # P4/16-medium + self.head_conv_2 = Conv(c3, c3, k=3, p=1, s=2) + self.head_csp_2 = BottleneckCSP(c3 + c4//2, c4, n=1, shortcut=False) + + # P8/32-large + self.head_conv_3 = Conv(c4, c4, k=3, p=1, s=2) + self.head_csp_3 = BottleneckCSP(c4 + c5//2, c5, n=1, shortcut=False) + + # det conv + self.head_det_1 = nn.Conv2d(c3, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_2 = nn.Conv2d(c4, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_3 = nn.Conv2d(c5, self.num_anchors * (1 + self.num_classes + 4), 1) + + if self.trainable: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) + nn.init.constant_(self.head_det_1.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_2.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_3.bias[..., :self.num_anchors], bias_value) + + + def create_grid(self, img_size): + total_grid_xy = [] + total_anchor_wh = [] + w, h = img_size, img_size + for ind, s in enumerate(self.stride): + # generate grid cells + fmp_w, fmp_h = w // s, h // s + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 1, 2] + grid_xy = grid_xy[None, :, None, :].to(self.device) + # [1, HW, 1, 2] + anchor_wh = self.anchor_size[ind].repeat(fmp_h*fmp_w, 1, 1).unsqueeze(0).to(self.device) + + total_grid_xy.append(grid_xy) + total_anchor_wh.append(anchor_wh) + + return total_grid_xy, total_anchor_wh + + + def set_grid(self, img_size): + self.img_size = img_size + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + + def nms(self, dets, scores): + """"Pure Python NMS YOLOv4.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (HxW, 4), bsize = 1 + scores: (HxW, num_classes), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=np.int) + for i in range(self.num_classes): + inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + @torch.no_grad() + def inference_single_image(self, x): + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # FPN + PAN + # head + c6 = self.head_conv_0(c5) + c7 = self.head_upsample_0(c6) # s32->s16 + c8 = torch.cat([c7, c4], dim=1) + c9 = self.head_csp_0(c8) + # P3/8 + c10 = self.head_conv_1(c9) + c11 = self.head_upsample_1(c10) # s16->s8 + c12 = torch.cat([c11, c3], dim=1) + c13 = self.head_csp_1(c12) # to det + # p4/16 + c14 = self.head_conv_2(c13) + c15 = torch.cat([c14, c10], dim=1) + c16 = self.head_csp_2(c15) # to det + # p5/32 + c17 = self.head_conv_3(c16) + c18 = torch.cat([c17, c6], dim=1) + c19 = self.head_csp_3(c18) # to det + + # det + pred_s = self.head_det_1(c13)[0] + pred_m = self.head_det_2(c16)[0] + pred_l = self.head_det_3(c19)[0] + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, 
pred in enumerate(preds): + # [KA*(1 + C + 4), H, W] -> [KA*1, H, W] -> [H, W, KA*1] -> [HW*KA, 1] + obj_pred_i = pred[:KA, :, :].permute(1, 2, 0).contiguous().view(-1, 1) + # [KA*(1 + C + 4), H, W] -> [KA*C, H, W] -> [H, W, KA*C] -> [HW*KA, C] + cls_pred_i = pred[KA:KA*(1+C), :, :].permute(1, 2, 0).contiguous().view(-1, C) + # [KA*(1 + C + 4), H, W] -> [KA*4, H, W] -> [H, W, KA*4] -> [HW, KA, 4] + reg_pred_i = pred[KA*(1+C):, :, :].permute(1, 2, 0).contiguous().view(-1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[None, ..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1)[0].view(-1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=0) + cls_pred = torch.cat(cls_pred_list, dim=0) + box_pred = torch.cat(box_pred_list, dim=0) + + # normalize bbox + bboxes = torch.clamp(box_pred / self.img_size, 0., 1.) + + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # FPN + PAN + # head + c6 = self.head_conv_0(c5) + c7 = self.head_upsample_0(c6) # s32->s16 + c8 = torch.cat([c7, c4], dim=1) + c9 = self.head_csp_0(c8) + # P3/8 + c10 = self.head_conv_1(c9) + c11 = self.head_upsample_1(c10) # s16->s8 + c12 = torch.cat([c11, c3], dim=1) + c13 = self.head_csp_1(c12) # to det + # p4/16 + c14 = self.head_conv_2(c13) + c15 = torch.cat([c14, c10], dim=1) + c16 = self.head_csp_2(c15) # to det + # p5/32 + c17 = self.head_conv_3(c16) + c18 = torch.cat([c17, c6], dim=1) + c19 = self.head_csp_3(c18) # to det + + # det + pred_s = self.head_det_1(c13) + pred_m = self.head_det_2(c16) + pred_l = self.head_det_3(c19) + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [B, KA*(1 + C + 4), H, W] -> [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_pred_i = pred[:, :KA, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, 1) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_pred_i = pred[:, KA:KA*(1+C), :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, C) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred_i = pred[:, KA*(1+C):, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = 
torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1).view(B, -1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=1) + cls_pred = torch.cat(cls_pred_list, dim=1) + box_pred = torch.cat(box_pred_list, dim=1) + + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # giou: [B, HW,] + giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + + # we set giou as the target of the objectness + targets = torch.cat([0.5 * (giou_pred[..., None].clone().detach() + 1.0), targets], dim=-1) + + return obj_pred, cls_pred, giou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov1.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov1.py new file mode 100644 index 0000000000..bf3160505f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov1.py @@ -0,0 +1,260 @@ +import numpy as np +import torch +import torch.nn as nn + +from utils import box_ops + +from ..basic.conv import Conv +from ..neck import build_neck +from ..backbone import build_backbone +import torch_npu + + +class YOLOv1(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=None, + num_classes=20, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.6, + center_sample=False): + super(YOLOv1, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg['backbone'], + pretrained=trainable) + self.stride = [strides[-1]] + feature_dim = feature_channels[-1] + head_dim = 512 + + # build grid cell + self.grid_xy = self.create_grid(img_size) + + # neck + self.neck = build_neck(model=cfg['neck'], in_ch=feature_dim, out_ch=head_dim) + + # head + self.cls_feat = nn.Sequential( + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1) + ) + self.reg_feat = nn.Sequential( + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1) + ) + + # head + self.obj_pred = nn.Conv2d(head_dim, 1, kernel_size=1) + self.cls_pred = nn.Conv2d(head_dim, self.num_classes, kernel_size=1) + self.reg_pred = nn.Conv2d(head_dim, 4, kernel_size=1) + + if self.trainable: + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) + nn.init.constant_(self.obj_pred.bias, bias_value) + + + def create_grid(self, img_size): + """img_size: [H, W]""" + img_h = img_w = img_size + # generate grid cells + fmp_h, fmp_w = img_h // self.stride[0], img_w // self.stride[0] + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 2] + grid_xy = grid_xy.unsqueeze(0).to(self.device) + + return grid_xy + + + def set_grid(self, img_size): + self.grid_xy = self.create_grid(img_size) + self.img_size = img_size + + + def decode_bbox(self, reg_pred): + """reg_pred: [B, N, 4]""" + # txty -> xy + if self.center_sample: + xy_pred = reg_pred[..., :2].sigmoid() * 2.0 - 1.0 + self.grid_xy + else: + xy_pred = reg_pred[..., :2].sigmoid() + self.grid_xy + # twth -> wh + wh_pred = reg_pred[..., 2:].exp() + xywh_pred = torch.cat([xy_pred, wh_pred], dim=-1) + # xywh -> x1y1x2y2 + x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] / 2 + x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] / 2 + box_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) + # rescale bbox + box_pred = box_pred * self.stride[0] + + return box_pred + + + def nms(self, dets, scores): + """Pure Python NMS.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + # keep only the boxes whose overlap (ovr) is below the NMS threshold + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (N, 4), bsize = 1 + scores: (N, C), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=int) + for i in range(self.num_classes): + inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + @torch.no_grad() + def inference_single_image(self, x): + # backbone + x = self.backbone(x)[-1] + + # neck + x = self.neck(x) + + # head + cls_feat = self.cls_feat(x) + reg_feat = self.reg_feat(x) + + # pred + obj_pred = self.obj_pred(reg_feat)[0] + cls_pred = self.cls_pred(cls_feat)[0] + reg_pred = self.reg_pred(reg_feat)[0] + + # [1, H, W] -> [1, HW] -> [HW, 1] + obj_pred = obj_pred.flatten(1).permute(1, 0).contiguous() + # [C, H, W] -> [C, HW] -> [HW, C] + cls_pred = cls_pred.flatten(1).permute(1, 0).contiguous() + # [4, H, W] -> [4, HW] -> [HW, 4] + reg_pred = reg_pred.flatten(1).permute(1, 0).contiguous() + box_pred = self.decode_bbox(reg_pred[None])[0] # [B, HW, 4] -> [HW, 4] + # normalize bbox + bboxes =
torch.clamp(box_pred / self.img_size, 0., 1.) + + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + C = self.num_classes + # backbone + x = self.backbone(x)[-1] + + # neck + x = self.neck(x) + + # head + cls_feat = self.cls_feat(x) + reg_feat = self.reg_feat(x) + + # pred + obj_pred = self.obj_pred(reg_feat) + cls_pred = self.cls_pred(cls_feat) + reg_pred = self.reg_pred(reg_feat) + + # [B, 1, H, W] -> [B, 1, HW] -> [B, HW, 1] + obj_pred =obj_pred.flatten(2).permute(0, 2, 1).contiguous() + # [B, C, H, W] -> [B, C, HW] -> [B, HW, C] + cls_pred =cls_pred.flatten(2).permute(0, 2, 1).contiguous() + # [B, 4, H, W] -> [B, 4, HW] -> [B, HW, 4] + reg_pred = reg_pred.flatten(2).permute(0, 2, 1).contiguous() + box_pred = self.decode_bbox(reg_pred) + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # giou: [B, HW,] + giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + + # we set giou as the target of the objectness + targets = torch.cat([0.5 * (giou_pred[..., None].clone().detach() + 1.0), targets], dim=-1) + + return obj_pred, cls_pred, giou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov2.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov2.py new file mode 100644 index 0000000000..47d386c09a --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov2.py @@ -0,0 +1,271 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from utils import box_ops +from utils import criterion + +from ..basic.conv import Conv +from ..neck import build_neck +from ..backbone import build_backbone +import torch_npu + + + +class YOLOv2(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=None, + num_classes=20, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.6, + center_sample=False): + super(YOLOv2, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + self.anchor_size = torch.tensor(cfg["anchor_size"]) # [KA, 2] + self.num_anchors = len(cfg["anchor_size"]) + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg['backbone'], + pretrained=trainable) + self.stride = [strides[-1]] + feature_dim = feature_channels[-1] + head_dim = 512 + + # build grid cell + self.grid_xy, self.anchor_wh = self.create_grid(img_size) + + # neck + self.neck = build_neck(model=cfg['neck'], in_ch=feature_dim, out_ch=head_dim) + + # head + self.cls_feat = nn.Sequential( + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1) + ) + self.reg_feat = nn.Sequential( + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1), + Conv(head_dim, head_dim, k=3, p=1, s=1) + ) + + # head + self.obj_pred = nn.Conv2d(head_dim, self.num_anchors * 1, kernel_size=1) + self.cls_pred 
= nn.Conv2d(head_dim, self.num_anchors * self.num_classes, kernel_size=1) + self.reg_pred = nn.Conv2d(head_dim, self.num_anchors * 4, kernel_size=1) + + if self.trainable: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) + nn.init.constant_(self.obj_pred.bias, bias_value) + + + def create_grid(self, img_size): + """img_size: [H, W]""" + img_h = img_w = img_size + # generate grid cells + fmp_h, fmp_w = img_h // self.stride[0], img_w // self.stride[0] + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 1, 2] + grid_xy = grid_xy[None, :, None, :].to(self.device) + # [1, HW, 1, 2] + anchor_wh = self.anchor_size.repeat(fmp_h*fmp_w, 1, 1).unsqueeze(0).to(self.device) + + return grid_xy, anchor_wh + + + def set_grid(self, img_size): + self.grid_xy, self.anchor_wh = self.create_grid(img_size) + self.img_size = img_size + + + def nms(self, dets, scores): + """Pure Python NMS.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + # keep only the boxes whose overlap (ovr) is below the NMS threshold + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (N, 4), bsize = 1 + scores: (N, C), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=int) + for i in range(self.num_classes): + inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + def decode_bbox(self, reg_pred): + """reg_pred: [B, N, KA, 4]""" + B = reg_pred.size(0) + # txty -> cxcy + if self.center_sample: + xy_pred = (reg_pred[..., :2].sigmoid() * 2.0 - 1.0 + self.grid_xy) * self.stride[0] + else: + xy_pred = (reg_pred[..., :2].sigmoid() + self.grid_xy) * self.stride[0] + # twth -> wh + wh_pred = reg_pred[..., 2:].exp() * self.anchor_wh + xywh_pred = torch.cat([xy_pred, wh_pred], dim=-1).view(B, -1, 4) + # xywh -> x1y1x2y2 + x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] / 2 + x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] / 2 + box_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) + + return box_pred + + + @torch.no_grad() + def inference_single_image(self, x): + KA = self.num_anchors + C = self.num_classes + # backbone + x = self.backbone(x)[-1] + + # neck + x = self.neck(x) + + # head + cls_feat = self.cls_feat(x) + reg_feat = self.reg_feat(x) + + # pred + obj_pred =
self.obj_pred(reg_feat)[0] + cls_pred = self.cls_pred(cls_feat)[0] + reg_pred = self.reg_pred(reg_feat)[0] + + # [KA*1, H, W] -> [H, W, KA*1] -> [HW*KA, 1] + obj_pred = obj_pred.permute(1, 2, 0).contiguous().view(-1, 1) + # [KA*C, H, W] -> [H, W, KA*C] -> [HW*KA, C] + cls_pred = cls_pred.permute(1, 2, 0).contiguous().view(-1, C) + # [KA*4, H, W] -> [H, W, KA*4] -> [HW, KA, 4] + reg_pred = reg_pred.permute(1, 2, 0).contiguous().view(-1, KA, 4) + # [HW, KA, 4] -> [HW*KA, 4] + box_pred = self.decode_bbox(reg_pred[None])[0] + # normalize bbox + bboxes = torch.clamp(box_pred / self.img_size, 0., 1.) + + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + KA = self.num_anchors + C = self.num_classes + # backbone + x = self.backbone(x)[-1] + + # neck + x = self.neck(x) + + # head + cls_feat = self.cls_feat(x) + reg_feat = self.reg_feat(x) + + # pred + obj_pred = self.obj_pred(reg_feat) + cls_pred = self.cls_pred(cls_feat) + reg_pred = self.reg_pred(reg_feat) + + # [B, KA*1, H, W] -> [B, H, W, KA*1] -> [B, H*W*KA, 1] + obj_pred = obj_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1) + # [B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) + # [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, KA, 4) + # [B, HW, KA, 4] -> [B, HW*KA, 4] + box_pred = self.decode_bbox(reg_pred) + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # giou: [B, HW,] + giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + + # we set giou as the target of the objectness + targets = torch.cat([0.5 * (giou_pred[..., None].clone().detach() + 1.0), targets], dim=-1) + + return obj_pred, cls_pred, giou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov3.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov3.py new file mode 100644 index 0000000000..0b89db12b1 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov3.py @@ -0,0 +1,327 @@ +import numpy as np +import torch +import torch.nn as nn + +from utils import box_ops + +from ..backbone import build_backbone +from ..neck import build_neck +from ..basic.conv import Conv, ConvBlocks +from ..basic.upsample import UpSample +import torch_npu + + +class YOLOv3(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=640, + num_classes=80, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.60, + center_sample=False): + + super(YOLOv3, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg["backbone"], + pretrained=trainable) + self.stride = strides + anchor_size = cfg["anchor_size"] + # [S, KA, 2], S is equal to number of stride + self.anchor_size = 
torch.tensor(anchor_size).reshape(len(self.stride), len(anchor_size) // 3, 2).float() + self.num_anchors = self.anchor_size.size(1) + c3, c4, c5 = feature_channels + + # build grid cell + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + # head + # P3/8-small + self.head_convblock_0 = build_neck(model=cfg["neck"], in_ch=c5, out_ch=c5//2) + self.head_conv_0 = Conv(c5//2, c4//2, k=1) + self.head_upsample_0 = UpSample(scale_factor=2) + self.head_conv_1 = Conv(c5//2, c5, k=3, p=1) + + # P4/16-medium + self.head_convblock_1 = ConvBlocks(c4 + c4//2, c4//2) + self.head_conv_2 = Conv(c4//2, c3//2, k=1) + self.head_upsample_1 = UpSample(scale_factor=2) + self.head_conv_3 = Conv(c4//2, c4, k=3, p=1) + + # P8/32-large + self.head_convblock_2 = ConvBlocks(c3 + c3//2, c3//2) + self.head_conv_4 = Conv(c3//2, c3, k=3, p=1) + + # det conv + self.head_det_1 = nn.Conv2d(c3, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_2 = nn.Conv2d(c4, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_3 = nn.Conv2d(c5, self.num_anchors * (1 + self.num_classes + 4), 1) + + + if self.trainable: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) + nn.init.constant_(self.head_det_1.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_2.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_3.bias[..., :self.num_anchors], bias_value) + + + def create_grid(self, img_size): + total_grid_xy = [] + total_anchor_wh = [] + w, h = img_size, img_size + for ind, s in enumerate(self.stride): + # generate grid cells + fmp_w, fmp_h = w // s, h // s + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 1, 2] + grid_xy = grid_xy[None, :, None, :].to(self.device) + # [1, HW, 1, 2] + anchor_wh = self.anchor_size[ind].repeat(fmp_h*fmp_w, 1, 1).unsqueeze(0).to(self.device) + + total_grid_xy.append(grid_xy) + total_anchor_wh.append(anchor_wh) + + return total_grid_xy, total_anchor_wh + + + def set_grid(self, img_size): + self.img_size = img_size + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + + def nms(self, dets, scores): + """"Pure Python NMS YOLOv4.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (N, 4), bsize = 1 + scores: (N, C), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=np.int) + for i in range(self.num_classes): + 
inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + @torch.no_grad() + def inference_single_image(self, x): + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # head + # p5/32 + p5 = self.head_convblock_0(c5) + p5_up = self.head_upsample_0(self.head_conv_0(p5)) + p5 = self.head_conv_1(p5) + + # p4/16 + p4 = self.head_convblock_1(torch.cat([c4, p5_up], dim=1)) + p4_up = self.head_upsample_1(self.head_conv_2(p4)) + p4 = self.head_conv_3(p4) + + # P3/8 + p3 = self.head_convblock_2(torch.cat([c3, p4_up], dim=1)) + p3 = self.head_conv_4(p3) + + # det + pred_s = self.head_det_1(p3)[0] + pred_m = self.head_det_2(p4)[0] + pred_l = self.head_det_3(p5)[0] + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [KA*(1 + C + 4), H, W] -> [KA*1, H, W] -> [H, W, KA*1] -> [HW*KA, 1] + obj_pred_i = pred[:KA, :, :].permute(1, 2, 0).contiguous().view(-1, 1) + # [KA*(1 + C + 4), H, W] -> [KA*C, H, W] -> [H, W, KA*C] -> [HW*KA, C] + cls_pred_i = pred[KA:KA*(1+C), :, :].permute(1, 2, 0).contiguous().view(-1, C) + # [KA*(1 + C + 4), H, W] -> [KA*4, H, W] -> [H, W, KA*4] -> [HW, KA, 4] + reg_pred_i = pred[KA*(1+C):, :, :].permute(1, 2, 0).contiguous().view(-1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[None, ..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1)[0].view(-1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=0) + cls_pred = torch.cat(cls_pred_list, dim=0) + box_pred = torch.cat(box_pred_list, dim=0) + + # normalize bbox + bboxes = torch.clamp(box_pred / self.img_size, 0., 1.) 
+ + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # head + # p5/32 + p5 = self.head_convblock_0(c5) + p5_up = self.head_upsample_0(self.head_conv_0(p5)) + p5 = self.head_conv_1(p5) + + # p4/16 + p4 = self.head_convblock_1(torch.cat([c4, p5_up], dim=1)) + p4_up = self.head_upsample_1(self.head_conv_2(p4)) + p4 = self.head_conv_3(p4) + + # P3/8 + p3 = self.head_convblock_2(torch.cat([c3, p4_up], dim=1)) + p3 = self.head_conv_4(p3) + + # det + pred_s = self.head_det_1(p3) + pred_m = self.head_det_2(p4) + pred_l = self.head_det_3(p5) + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [B, KA*(1 + C + 4), H, W] -> [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_pred_i = pred[:, :KA, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, 1) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_pred_i = pred[:, KA:KA*(1+C), :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, C) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred_i = pred[:, KA*(1+C):, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1).view(B, -1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=1) + cls_pred = torch.cat(cls_pred_list, dim=1) + box_pred = torch.cat(box_pred_list, dim=1) + + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # giou: [B, HW,] + giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + + # we set giou as the target of the objectness + targets = torch.cat([0.5 * (giou_pred[..., None].clone().detach() + 1.0), targets], dim=-1) + + return obj_pred, cls_pred, giou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov4.py b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov4.py new file mode 100644 index 0000000000..67d5aa1e6f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/models/yolo/yolov4.py @@ -0,0 +1,345 @@ +import numpy as np +import torch +import torch.nn as nn + +from utils import box_ops + +from ..backbone import build_backbone +from ..neck import build_neck +from ..basic.conv import Conv +from ..basic.upsample import UpSample +from ..basic.bottleneck_csp import BottleneckCSP +import torch_npu + + +class YOLOv4(nn.Module): + def __init__(self, + cfg=None, + device=None, + img_size=640, 
+ num_classes=80, + trainable=False, + conf_thresh=0.001, + nms_thresh=0.60, + center_sample=False): + + super(YOLOv4, self).__init__() + self.cfg = cfg + self.device = device + self.img_size = img_size + self.num_classes = num_classes + self.trainable = trainable + self.conf_thresh = conf_thresh + self.nms_thresh = nms_thresh + self.center_sample = center_sample + + # backbone + self.backbone, feature_channels, strides = build_backbone(model_name=cfg["backbone"], + pretrained=trainable) + self.stride = strides + anchor_size = cfg["anchor_size"] + self.anchor_size = torch.tensor(anchor_size).reshape(len(self.stride), len(anchor_size) // 3, 2).float() + self.num_anchors = self.anchor_size.size(1) + c3, c4, c5 = feature_channels + + # build grid cell + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + # head + self.head_conv_0 = build_neck(model=cfg["neck"], in_ch=c5, out_ch=c5//2) # 10 + self.head_upsample_0 = UpSample(scale_factor=2) + self.head_csp_0 = BottleneckCSP(c4 + c5//2, c4, n=3, shortcut=False) + + # P3/8-small + self.head_conv_1 = Conv(c4, c4//2, k=1) # 14 + self.head_upsample_1 = UpSample(scale_factor=2) + self.head_csp_1 = BottleneckCSP(c3 + c4//2, c3, n=3, shortcut=False) + + # P4/16-medium + self.head_conv_2 = Conv(c3, c3, k=3, p=1, s=2) + self.head_csp_2 = BottleneckCSP(c3 + c4//2, c4, n=3, shortcut=False) + + # P8/32-large + self.head_conv_3 = Conv(c4, c4, k=3, p=1, s=2) + self.head_csp_3 = BottleneckCSP(c4 + c5//2, c5, n=3, shortcut=False) + + # det conv + self.head_det_1 = nn.Conv2d(c3, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_2 = nn.Conv2d(c4, self.num_anchors * (1 + self.num_classes + 4), 1) + self.head_det_3 = nn.Conv2d(c5, self.num_anchors * (1 + self.num_classes + 4), 1) + + if self.trainable: + # init bias + self.init_bias() + + + def init_bias(self): + # init bias + init_prob = 0.01 + bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) + nn.init.constant_(self.head_det_1.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_2.bias[..., :self.num_anchors], bias_value) + nn.init.constant_(self.head_det_3.bias[..., :self.num_anchors], bias_value) + + + def create_grid(self, img_size): + total_grid_xy = [] + total_anchor_wh = [] + w, h = img_size, img_size + for ind, s in enumerate(self.stride): + # generate grid cells + fmp_w, fmp_h = w // s, h // s + grid_y, grid_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)]) + # [H, W, 2] -> [HW, 2] + grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) + # [HW, 2] -> [1, HW, 1, 2] + grid_xy = grid_xy[None, :, None, :].to(self.device) + # [1, HW, 1, 2] + anchor_wh = self.anchor_size[ind].repeat(fmp_h*fmp_w, 1, 1).unsqueeze(0).to(self.device) + + total_grid_xy.append(grid_xy) + total_anchor_wh.append(anchor_wh) + + return total_grid_xy, total_anchor_wh + + + def set_grid(self, img_size): + self.img_size = img_size + self.grid_cell, self.anchors_wh = self.create_grid(img_size) + + + def nms(self, dets, scores): + """"Pure Python NMS.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(ovr <= self.nms_thresh)[0] + order = order[inds + 1] + + return keep + + + def postprocess(self, bboxes, scores): + """ + bboxes: (HxW, 4), bsize = 1 + scores: (HxW, num_classes), bsize = 1 + """ + + cls_inds = np.argmax(scores, axis=1) + scores = scores[(np.arange(scores.shape[0]), cls_inds)] + + # threshold + keep = np.where(scores >= self.conf_thresh) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + # NMS + keep = np.zeros(len(bboxes), dtype=np.int) + for i in range(self.num_classes): + inds = np.where(cls_inds == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + cls_inds = cls_inds[keep] + + return bboxes, scores, cls_inds + + + @torch.no_grad() + def inference_single_image(self, x): + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # FPN + PAN + # head + c6 = self.head_conv_0(c5) + c7 = self.head_upsample_0(c6) # s32->s16 + c8 = torch.cat([c7, c4], dim=1) + c9 = self.head_csp_0(c8) + # P3/8 + c10 = self.head_conv_1(c9) + c11 = self.head_upsample_1(c10) # s16->s8 + c12 = torch.cat([c11, c3], dim=1) + c13 = self.head_csp_1(c12) # to det + # p4/16 + c14 = self.head_conv_2(c13) + c15 = torch.cat([c14, c10], dim=1) + c16 = self.head_csp_2(c15) # to det + # p5/32 + c17 = self.head_conv_3(c16) + c18 = torch.cat([c17, c6], dim=1) + c19 = self.head_csp_3(c18) # to det + + # det + pred_s = self.head_det_1(c13)[0] + pred_m = self.head_det_2(c16)[0] + pred_l = self.head_det_3(c19)[0] + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in 
enumerate(preds): + # [KA*(1 + C + 4), H, W] -> [KA*1, H, W] -> [H, W, KA*1] -> [HW*KA, 1] + obj_pred_i = pred[:KA, :, :].permute(1, 2, 0).contiguous().view(-1, 1) + # [KA*(1 + C + 4), H, W] -> [KA*C, H, W] -> [H, W, KA*C] -> [HW*KA, C] + cls_pred_i = pred[KA:KA*(1+C), :, :].permute(1, 2, 0).contiguous().view(-1, C) + # [KA*(1 + C + 4), H, W] -> [KA*4, H, W] -> [H, W, KA*4] -> [HW, KA, 4] + reg_pred_i = pred[KA*(1+C):, :, :].permute(1, 2, 0).contiguous().view(-1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() * 2.0 - 1.0 + self.grid_cell[i]) * self.stride[i] + else: + xy_pred_i = (reg_pred_i[None, ..., :2].sigmoid() + self.grid_cell[i]) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[None, ..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1)[0].view(-1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=0) + cls_pred = torch.cat(cls_pred_list, dim=0) + box_pred = torch.cat(box_pred_list, dim=0) + + # normalize bbox + bboxes = torch.clamp(box_pred / self.img_size, 0., 1.) + + # scores + scores = torch.sigmoid(obj_pred) * torch.softmax(cls_pred, dim=-1) + + # to cpu + scores = scores.to('cpu').numpy() + bboxes = bboxes.to('cpu').numpy() + + # post-process + bboxes, scores, cls_inds = self.postprocess(bboxes, scores) + + return bboxes, scores, cls_inds + + + def forward(self, x, targets=None): + if not self.trainable: + return self.inference_single_image(x) + else: + B = x.size(0) + KA = self.num_anchors + C = self.num_classes + # backbone + c3, c4, c5 = self.backbone(x) + + # FPN + PAN + # head + c6 = self.head_conv_0(c5) + c7 = self.head_upsample_0(c6) # s32->s16 + c8 = torch.cat([c7, c4], dim=1) + c9 = self.head_csp_0(c8) + # P3/8 + c10 = self.head_conv_1(c9) + c11 = self.head_upsample_1(c10) # s16->s8 + c12 = torch.cat([c11, c3], dim=1) + c13 = self.head_csp_1(c12) # to det + # p4/16 + c14 = self.head_conv_2(c13) + c15 = torch.cat([c14, c10], dim=1) + c16 = self.head_csp_2(c15) # to det + # p5/32 + c17 = self.head_conv_3(c16) + c18 = torch.cat([c17, c6], dim=1) + c19 = self.head_csp_3(c18) # to det + + # det + pred_s = self.head_det_1(c13) + pred_m = self.head_det_2(c16) + pred_l = self.head_det_3(c19) + + preds = [pred_s, pred_m, pred_l] + obj_pred_list = [] + cls_pred_list = [] + box_pred_list = [] + + for i, pred in enumerate(preds): + # [B, KA*(1 + C + 4), H, W] -> [B, KA, H, W] -> [B, H, W, KA] -> [B, HW*KA, 1] + obj_pred_i = pred[:, :KA, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, 1) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*C, H, W] -> [B, H, W, KA*C] -> [B, H*W*KA, C] + cls_pred_i = pred[:, KA:KA*(1+C), :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, C) + # [B, KA*(1 + C + 4), H, W] -> [B, KA*4, H, W] -> [B, H, W, KA*4] -> [B, HW, KA, 4] + reg_pred_i = pred[:, KA*(1+C):, :, :].permute(0, 2, 3, 1).contiguous().view(B, -1, KA, 4) + # txty -> xy + if self.center_sample: + xy_pred_i = (self.grid_cell[i] + reg_pred_i[..., :2].sigmoid() * 2.0 - 1.0) * self.stride[i] + else: + xy_pred_i = (self.grid_cell[i] + reg_pred_i[..., :2].sigmoid()) * self.stride[i] + # twth -> wh + wh_pred_i = reg_pred_i[..., 2:].exp() * self.anchors_wh[i] + # xywh -> x1y1x2y2 + x1y1_pred_i = xy_pred_i - wh_pred_i * 0.5 + x2y2_pred_i = xy_pred_i + wh_pred_i * 0.5 + box_pred_i = 
torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1).view(B, -1, 4) + + obj_pred_list.append(obj_pred_i) + cls_pred_list.append(cls_pred_i) + box_pred_list.append(box_pred_i) + + obj_pred = torch.cat(obj_pred_list, dim=1) + cls_pred = torch.cat(cls_pred_list, dim=1) + box_pred = torch.cat(box_pred_list, dim=1) + + # normalize bbox + box_pred = box_pred / self.img_size + + # compute giou between prediction bbox and target bbox + x1y1x2y2_pred = box_pred.view(-1, 4) + x1y1x2y2_gt = targets[..., 2:6].view(-1, 4) + + # iou: [B, HW,] + if self.cfg['loss_box'] == 'iou': + iou_pred = box_ops.iou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + obj_tgt = iou_pred[..., None].clone().detach().clamp(0.) # [0, 1] + elif self.cfg['loss_box'] == 'giou': + iou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + obj_tgt = 0.5 * (iou_pred[..., None].clone().detach() + 1.0) # [-1, 1] -> [0, 1] + elif self.cfg['loss_box'] == 'ciou': + iou_pred = box_ops.ciou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) + obj_tgt = iou_pred[..., None].clone().detach().clamp(0.) # [0, 1] + + # we set iou as the target of the objectness + targets = torch.cat([obj_tgt, targets], dim=-1) + + return obj_pred, cls_pred, iou_pred, targets diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/requirements.txt b/PyTorch/contrib/cv/detection/YoloV2-640/requirements.txt new file mode 100644 index 0000000000..4c64801e7f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/requirements.txt @@ -0,0 +1,19 @@ +torch==1.8.1 + +torch_npu==1.8.1 + +torchvision==0.9.1 + +opencv-python + +thop + +scipy + +matplotlib + +numpy + +pycocotools + +timm diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/test.py b/PyTorch/contrib/cv/detection/YoloV2-640/test.py new file mode 100644 index 0000000000..9c58c234f5 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/test.py @@ -0,0 +1,233 @@ +import argparse +import cv2 +import os +import time +import numpy as np +import torch + +from config.yolo_config import yolo_config +from data.voc import VOC_CLASSES, VOCDetection +from data.coco import coco_class_index, coco_class_labels, COCODataset +from data.transforms import ValTransforms +from utils.misc import TestTimeAugmentation + +from models.yolo import build_model +import torch_npu + + +parser = argparse.ArgumentParser(description='YOLO Detection') +# basic +parser.add_argument('-size', '--img_size', default=640, type=int, + help='img_size') +parser.add_argument('--show', action='store_true', default=False, + help='show the visulization results.') +parser.add_argument('-vs', '--visual_threshold', default=0.35, type=float, + help='Final confidence threshold') +parser.add_argument('--cuda', action='store_true', default=False, + help='use cuda.') +parser.add_argument('--save_folder', default='det_results/', type=str, + help='Dir to save results') +# model +parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') +parser.add_argument('--weight', default='weight/', + type=str, help='Trained state_dict file path to open') +parser.add_argument('--conf_thresh', default=0.1, type=float, + help='NMS threshold') +parser.add_argument('--nms_thresh', default=0.45, type=float, + help='NMS threshold') +parser.add_argument('--center_sample', action='store_true', default=False, + help='center sample trick.') +# dataset +parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') +parser.add_argument('-d', '--dataset', 
default='coco', + help='coco.') +# TTA +parser.add_argument('-tta', '--test_aug', action='store_true', default=False, + help='use test augmentation.') + +args = parser.parse_args() + + + +def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): + x1, y1, x2, y2 = bbox + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] + # plot bbox + cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) + + if label is not None: + # plot title bbox + cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) + # put the test on the title bbox + cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) + + return img + + +def visualize(img, + bboxes, + scores, + cls_inds, + vis_thresh, + class_colors, + class_names, + class_indexs=None, + dataset_name='voc'): + ts = 0.4 + for i, bbox in enumerate(bboxes): + if scores[i] > vis_thresh: + cls_id = int(cls_inds[i]) + if dataset_name == 'coco': + cls_color = class_colors[cls_id] + cls_id = class_indexs[cls_id] + else: + cls_color = class_colors[cls_id] + + if len(class_names) > 1: + mess = '%s: %.2f' % (class_names[cls_id], scores[i]) + else: + cls_color = [255, 0, 0] + mess = None + img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts) + + return img + + +def test(args, + net, + device, + dataset, + transforms=None, + vis_thresh=0.4, + class_colors=None, + class_names=None, + class_indexs=None, + show=False, + test_aug=None, + dataset_name='coco'): + num_images = len(dataset) + save_path = os.path.join('det_results/', args.dataset, args.model) + os.makedirs(save_path, exist_ok=True) + + for index in range(num_images): + print('Testing image {:d}/{:d}....'.format(index+1, num_images)) + image, _ = dataset.pull_image(index) + + h, w, _ = image.shape + size = np.array([[w, h, w, h]]) + + # prepare + x, _, _, scale, offset = transforms(image) + x = x.unsqueeze(0).to(device) + + t0 = time.time() + # forward + # test augmentation: + if test_aug is not None: + bboxes, scores, cls_inds = test_aug(x, net) + else: + # inference + bboxes, scores, cls_inds = net(x) + print("detection time used ", time.time() - t0, "s") + + # rescale + bboxes -= offset + bboxes /= scale + bboxes *= size + + # vis detection + img_processed = visualize( + img=image, + bboxes=bboxes, + scores=scores, + cls_inds=cls_inds, + vis_thresh=vis_thresh, + class_colors=class_colors, + class_names=class_names, + class_indexs=class_indexs, + dataset_name=dataset_name + ) + if show: + cv2.imshow('detection', img_processed) + cv2.waitKey(0) + # save result + cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed) + + +if __name__ == '__main__': + args = parser.parse_args() + # cuda + if args.cuda: + print('use cuda') + device = torch.device("npu") + else: + device = torch.device("cpu") + + model_name = args.model + print('Model: ', model_name) + + # dataset and evaluator + if args.dataset == 'voc': + data_dir = os.path.join(args.root, 'VOCdevkit') + class_names = VOC_CLASSES + class_indexs = None + num_classes = 20 + dataset = VOCDetection( + data_dir=data_dir, + img_size=args.img_size, + image_sets=[('2007', 'test')]) + + elif args.dataset == 'coco': + data_dir = os.path.join(args.root, 'COCO') + class_names = coco_class_labels + class_indexs = coco_class_index + num_classes = 80 + dataset = COCODataset( + data_dir=data_dir, + img_size=args.img_size, + image_set='val2017') + + else: + print('unknow dataset 
!! Only support voc and coco !!') + exit(0) + + np.random.seed(0) + class_colors = [(np.random.randint(255), + np.random.randint(255), + np.random.randint(255)) for _ in range(num_classes)] + + # YOLO Config + cfg = yolo_config[args.model] + # build model + model = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=False) + + # load weight + model.load_state_dict(torch.load(args.weight, map_location='cpu'), strict=False) + model = model.to(device).eval() + print('Finished loading model!') + + # TTA + test_aug = TestTimeAugmentation(num_classes=num_classes) if args.test_aug else None + + + # run + test(args=args, + net=model, + device=device, + dataset=dataset, + transforms=ValTransforms(args.img_size), + vis_thresh=args.visual_threshold, + class_colors=class_colors, + class_names=class_names, + class_indexs=class_indexs, + show=args.show, + test_aug=test_aug, + dataset_name=args.dataset) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh new file mode 100644 index 0000000000..b0d155ece7 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh @@ -0,0 +1,13 @@ +python3 train3.py \ + --npu \ + -d coco \ + -m yolov2 \ + --root /home/normal58/zhang/zzb_msft \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh new file mode 100644 index 0000000000..8c1bcdb666 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh @@ -0,0 +1,96 @@ +#!/bin/bash +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
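+ # after leaving the test directory, refresh cur_path so it points one level up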
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +#集合通信参数,不需要修改 +export RANK_SIZE=8 +RANK_ID_START=0 +export WORLD_SIZE=8 +#训练开始时间,不需要修改 +start_time=$(date +%s) +#训练batch_size,,需要模型审视修改 +batch_size=32 +#设置环境变量,不需要修改 +RANK_ID=0 +echo "Decive ID: $RANK_ID" +export RANK_ID=$RANK_ID +export ASCEND_DEVICE_ID=$RANK_ID +ASCEND_DEVICE_ID=$RANK_ID +#创建DeviceID输出目录,不需要修改 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +fi +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +export RANK_SIZE=8 + +KERNEL_NUM=$(($(nproc)/8)) +for((RANK_ID=0;RANK_ID ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -m torch.distributed.launch --nproc_per_node=8 train8p.py \ + --npu \ + -d coco \ + -m yolov2 \ + --root /forDocker/dataset \ + --batch_size 32 \ + --lr 0.002 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ + -dist \ + --sybn \ + --num_gpu 8 \ + --local_rank 0 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + +#8p情况下仅0卡(主节点)有完整日志,因此后续日志提取仅涉及0卡 +ASCEND_DEVICE_ID=0 + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +time=`grep -a 'Epoch ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}'|awk -F "," '{print $1}'|awk 'END {print}'|sed 's/.$//'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${RANK_SIZE}'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py new file mode 100644 index 0000000000..4a50a26de8 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py @@ -0,0 +1,545 @@ +from __future__ import division + +import os +import argparse +import time +import math +import random +from copy import deepcopy +import apex +from apex import amp +import torch +import torch_npu +import torch.optim as optim +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import sys +from config.yolo_config import yolo_config +from data.voc import VOCDetection +from data.coco import COCODataset +from data.transforms import TrainTransforms, ColorTransforms, ValTransforms + +from utils import distributed_utils +from utils import create_labels +from utils.vis import vis_data, vis_targets +from utils.com_flops_params import FLOPs_and_Params +from utils.criterion import build_criterion +from utils.misc import detection_collate +from utils.misc import ModelEMA +from utils.criterion import build_criterion + +from models.yolo import build_model + +from evaluator.cocoapi_evaluator import COCOAPIEvaluator +from evaluator.vocapi_evaluator import VOCAPIEvaluator + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO Detection') + # basic + parser.add_argument('--npu', action='store_true', default=False, + help='use npu.') + parser.add_argument('--batch_size', default=16, type=int, + help='Batch size for training') + parser.add_argument('--lr', default=1e-3, type=float, + help='initial learning rate') + parser.add_argument('--img_size', type=int, default=640, + help='The upper 
bound of warm-up') + parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, + help='lr epoch to decay') + parser.add_argument('--max_epoch', type=int, default=200, + help='The upper bound of warm-up') + parser.add_argument('--lr_epoch', nargs='+', default=[100, 150], type=int, + help='lr epoch to decay') + parser.add_argument('--wp_epoch', type=int, default=2, + help='The upper bound of warm-up') + parser.add_argument('--start_epoch', type=int, default=0, + help='start epoch to train') + parser.add_argument('-r', '--resume', default=None, type=str, + help='keep training') + parser.add_argument('--num_workers', default=8, type=int, + help='Number of workers used in dataloading') + parser.add_argument('--num_gpu', default=1, type=int, + help='Number of GPUs to train') + parser.add_argument('--eval_epoch', type=int, + default=10, help='interval between evaluations') + parser.add_argument('--tfboard', action='store_true', default=False, + help='use tensorboard') + parser.add_argument('--save_folder', default='weights/', type=str, + help='path to save weight') + parser.add_argument('--vis_data', action='store_true', default=False, + help='visualize images and labels.') + parser.add_argument('--vis_targets', action='store_true', default=False, + help='visualize assignment.') + + # Optimizer & Schedule + parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, + help='sgd, adamw') + parser.add_argument('--lr_schedule', default='step', type=str, + help='step, cos') + parser.add_argument('--grad_clip', default=None, type=float, + help='clip gradient') + + # model + parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') + parser.add_argument('--conf_thresh', default=0.001, type=float, + help='NMS threshold') + parser.add_argument('--nms_thresh', default=0.5, type=float, + help='NMS threshold') + + # dataset + parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') + parser.add_argument('-d', '--dataset', default='coco', + help='coco, widerface, crowdhuman') + + # Loss + parser.add_argument('--loss_obj_weight', default=1.0, type=float, + help='weight of obj loss') + parser.add_argument('--loss_cls_weight', default=1.0, type=float, + help='weight of cls loss') + parser.add_argument('--loss_reg_weight', default=1.0, type=float, + help='weight of reg loss') + parser.add_argument('--scale_loss', default='batch', type=str, + help='scale loss: batch or positive samples') + + # train trick + parser.add_argument('--no_warmup', action='store_true', default=False, + help='do not use warmup') + parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, + help='use multi-scale trick') + parser.add_argument('--ema', action='store_true', default=False, + help='use ema training trick') + parser.add_argument('--mosaic', action='store_true', default=False, + help='use Mosaic Augmentation trick') + parser.add_argument('--mixup', action='store_true', default=False, + help='use MixUp Augmentation trick') + parser.add_argument('--multi_anchor', action='store_true', default=False, + help='use multiple anchor boxes as the positive samples') + parser.add_argument('--center_sample', action='store_true', default=False, + help='use center sample for labels') + parser.add_argument('--accumulate', type=int, default=1, + help='accumulate gradient') + # DDP train + parser.add_argument('-dist', '--distributed', action='store_true', default=False, + 
help='distributed training') + parser.add_argument('--local_rank', type=int, default=0, + help='local_rank') + parser.add_argument('--sybn', action='store_true', default=False, + help='use sybn.') + parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using in amp, default O1') + + return parser.parse_args() + + +def train(): + args = parse_args() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # torch.npu.set_compile_mode(jit_compile=False) + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"]="enable" + option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" + option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" + print("option:",option) + # torch.npu.set_option(option) + print("Setting Arguments.. : ", args) + print("----------------------------------------------------------") + + # path to save model + path_to_save = os.path.join(args.save_folder, args.dataset, args.model) + os.makedirs(path_to_save, exist_ok=True) + + # set distributed + local_rank = 0 + if args.distributed: + dist.init_process_group(backend="hccl", #init_method="env://" + ) + local_rank = torch.distributed.get_rank() + print(local_rank) + torch_npu.npu.set_device(local_rank) + + # cuda + if args.npu: + print('use npu') + cudnn.benchmark = True + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO config + cfg = yolo_config[args.model] + train_size = val_size = args.img_size + + # dataset and evaluator + dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) + # dataloader + dataloader = build_dataloader(args, dataset, detection_collate) + # criterioin + criterion = build_criterion(args, cfg, num_classes) + + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + print("----------------------------------------------------------") + + # build model + net = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=True) + model = net + + # SyncBatchNorm + # if args.sybn and args.npu and args.num_gpu > 1: + # print('use SyncBatchNorm ...') + # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + model = model.to(device).train() + # compute FLOPs and Params + # if local_rank == 0: + # model_copy = deepcopy(model) + # model_copy.trainable = False + # model_copy.eval() + # FLOPs_and_Params(model=model_copy, size=train_size) + # model_copy.trainable = True + # model_copy.train() + # keep training + if args.resume is not None: + print('keep training model: %s' % (args.resume)) + model.load_state_dict(torch.load(args.resume, map_location=device)) + + # EMA + ema = ModelEMA(model) if args.ema else None + # use tfboard + tblogger = None + if args.tfboard: + print('use tensorboard') + from torch.utils.tensorboard import SummaryWriter + c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + log_path = os.path.join('log/', args.dataset, c_time) + os.makedirs(log_path, exist_ok=True) + + tblogger = SummaryWriter(log_path) + # optimizer setup + base_lr = args.lr + tmp_lr = args.lr + if args.optimizer == 'NpuFusedSGD': + print('use SGD with momentum ...') + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) + # optimizer = optim.SGD(model.parameters(), + # lr=tmp_lr, + # momentum=0.9, + # weight_decay=5e-4) + elif args.optimizer == 'adamw': + print('use AdamW ...') + optimizer = optim.AdamW(model.parameters(), + lr=tmp_lr, + weight_decay=5e-4) + + model, 
optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) + + # DDP + if args.distributed and args.num_gpu > 1: + print('using DDP ...') + model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) + + + + + batch_size = args.batch_size + epoch_size = len(dataset) // (batch_size * args.num_gpu) + best_map = -100. + warmup = not args.no_warmup + + t0 = time.time() + # start training loop + for epoch in range(args.start_epoch, args.max_epoch): + if args.distributed: + dataloader.sampler.set_epoch(epoch) + + # use step lr decay + if args.lr_schedule == 'step': + if epoch in args.lr_epoch: + tmp_lr = tmp_lr * 0.1 + set_lr(optimizer, tmp_lr) + # use cos lr decay + elif args.lr_schedule == 'cos' and not warmup: + T_max = args.max_epoch - 15 + lr_min = base_lr * 0.1 * 0.1 + if epoch > T_max: + # Cos decay is done + print('Cosine annealing is over !!') + args.lr_schedule == None + tmp_lr = lr_min + set_lr(optimizer, tmp_lr) + else: + tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) + set_lr(optimizer, tmp_lr) + fps_sum=0 + # train one epoch + # pre_flag = False + # start_time = time.time() + for iter_i, (images, targets) in enumerate(dataloader): + # if iter_i == 5: + # start_time = time.time() + # with torch.autograd.profiler.profile(use_npu=True) as prof: + ni = iter_i + epoch * epoch_size + # warmup + if epoch < args.wp_epoch and warmup: + nw = args.wp_epoch * epoch_size + tmp_lr = base_lr * pow(ni / nw, 4) + set_lr(optimizer, tmp_lr) + + elif epoch == args.wp_epoch and iter_i == 0 and warmup: + # warmup is over + print('Warmup is over !!') + warmup = False + tmp_lr = base_lr + set_lr(optimizer, tmp_lr) + + # multi-scale trick + if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: + # randomly choose a new size + r = args.multi_scale_range + train_size = random.randint(r[0], r[1]) * 32 + model.set_grid(train_size) + if args.multi_scale: + # interpolate + images = torch.nn.functional.interpolate( + input=images, + size=train_size, + mode='bilinear', + align_corners=False) + + targets = [label.tolist() for label in targets] + # visualize target + if args.vis_data: + vis_data(images, targets) + continue + # make labels + targets = create_labels.gt_creator( + img_size=train_size, + strides=net.stride, + label_lists=targets, + anchor_size=cfg["anchor_size"], + multi_anchor=args.multi_anchor, + center_sample=args.center_sample) + # visualize assignment + if args.vis_targets: + vis_targets(images, targets, cfg["anchor_size"], net.stride) + continue + + # to device + images = images.to(device) + targets = targets.to(device) + + # inference + pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) + + # compute loss + loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) + + # check loss + if torch.isnan(total_loss): + continue + + loss_dict = dict( + loss_obj=loss_obj, + loss_cls=loss_cls, + loss_reg=loss_reg, + total_loss=total_loss + ) + loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) + + total_loss = total_loss / args.accumulate + # Backward and Optimize + with amp.scale_loss(total_loss , optimizer) as scaled_loss: + scaled_loss.backward() + if ni % args.accumulate == 0: + if args.grad_clip is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + optimizer.zero_grad() + + if args.ema: + ema.update(model) + + # display + # if iter_i % 10 == 0: + if args.tfboard: + # viz loss + 
tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) + tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) + tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) + + t1 = time.time() + print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]' + % (epoch+1, + args.max_epoch, + iter_i, + epoch_size, + tmp_lr, + loss_dict['loss_obj'].item(), + loss_dict['loss_cls'].item(), + loss_dict['loss_reg'].item(), + train_size, + t1-t0), + flush=True) + fps_sum = fps_sum + (batch_size*8 / (t1 - t0)) + t0 = time.time() + # if local_rank in [-1, 0]: + # epoch_time = time.time() - start_time + # if iter_i >= 5: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time))) + # else: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time))) + if iter_i > 0 and iter_i == 461: + fps_avg = fps_sum / 461 + print("fps:",fps_avg) + fps_sum = 0 + + # evaluation + if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch: + if evaluator is None: + print('No evaluator ...') + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '.pth')) + print('Keep training ...') + else: + print('eval ...') + # check ema + if args.ema: + model_eval = ema.ema + else: + model_eval = model.module if args.distributed else model + + # set eval mode + model_eval.trainable = False + model_eval.set_grid(val_size) + model_eval.eval() + + if local_rank == 0: + # evaluate + evaluator.evaluate(model_eval) + + cur_map = evaluator.map + if cur_map > best_map: + # update best-map + best_map = cur_map + # save model + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth')) + if args.tfboard: + if args.dataset == 'voc': + tblogger.add_scalar('07test/mAP', evaluator.map, epoch) + elif args.dataset == 'coco': + tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) + tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) + + if args.distributed: + # wait for all processes to synchronize + dist.barrier() + + # set train mode. 
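+ # restore training behaviour on the evaluated model copy before the next epoch starts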
+ model_eval.trainable = True + model_eval.set_grid(train_size) + model_eval.train() + + # close mosaic augmentation + if args.mosaic and args.max_epoch - epoch == 15: + print('close Mosaic Augmentation ...') + dataloader.dataset.mosaic = False + # close mixup augmentation + if args.mixup and args.max_epoch - epoch == 15: + print('close Mixup Augmentation ...') + dataloader.dataset.mixup = False + + if args.tfboard: + tblogger.close() + + +def build_dataset(args, train_size, val_size, device): + if args.dataset == 'voc': + data_dir = os.path.join(args.root, 'VOCdevkit') + num_classes = 20 + dataset = VOCDetection( + data_dir=data_dir, + img_size=train_size, + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = VOCAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size)) + + elif args.dataset == 'coco': + data_dir = os.path.join(args.root, 'COCO') + num_classes = 80 + dataset = COCODataset( + data_dir=data_dir, + img_size=train_size, + image_set='train2017', + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = COCOAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size) + ) + + else: + print('unknow dataset !! Only support voc and coco !!') + exit(0) + + return dataset, evaluator, num_classes + + +def build_dataloader(args, dataset, collate_fn=None): + # distributed + if args.distributed and args.num_gpu > 1: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True, + sampler=torch.utils.data.distributed.DistributedSampler(dataset) + ) + + else: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=True, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True + ) + return dataloader + + +def set_lr(optimizer, lr): + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +if __name__ == '__main__': + train() + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py new file mode 100644 index 0000000000..b34ec2f26f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py @@ -0,0 +1,545 @@ +from __future__ import division + +import os +import argparse +import time +import math +import random +from copy import deepcopy +import apex +from apex import amp +import torch +import torch_npu +import torch.optim as optim +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import sys +from config.yolo_config import yolo_config +from data.voc import VOCDetection +from data.coco import COCODataset +from data.transforms import TrainTransforms, ColorTransforms, ValTransforms + +from utils import distributed_utils +from utils import create_labels +from utils.vis import vis_data, vis_targets +from utils.com_flops_params import FLOPs_and_Params +from utils.criterion import build_criterion +from utils.misc import detection_collate +from utils.misc import ModelEMA +from utils.criterion import build_criterion + +from models.yolo import build_model + +from evaluator.cocoapi_evaluator import COCOAPIEvaluator +from evaluator.vocapi_evaluator import 
VOCAPIEvaluator + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO Detection') + # basic + parser.add_argument('--npu', action='store_true', default=False, + help='use npu.') + parser.add_argument('--batch_size', default=16, type=int, + help='Batch size for training') + parser.add_argument('--lr', default=1e-3, type=float, + help='initial learning rate') + parser.add_argument('--img_size', type=int, default=640, + help='The upper bound of warm-up') + parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, + help='lr epoch to decay') + parser.add_argument('--max_epoch', type=int, default=200, + help='The upper bound of warm-up') + parser.add_argument('--lr_epoch', nargs='+', default=[100, 150], type=int, + help='lr epoch to decay') + parser.add_argument('--wp_epoch', type=int, default=2, + help='The upper bound of warm-up') + parser.add_argument('--start_epoch', type=int, default=0, + help='start epoch to train') + parser.add_argument('-r', '--resume', default=None, type=str, + help='keep training') + parser.add_argument('--num_workers', default=8, type=int, + help='Number of workers used in dataloading') + parser.add_argument('--num_gpu', default=1, type=int, + help='Number of GPUs to train') + parser.add_argument('--eval_epoch', type=int, + default=10, help='interval between evaluations') + parser.add_argument('--tfboard', action='store_true', default=False, + help='use tensorboard') + parser.add_argument('--save_folder', default='weights/', type=str, + help='path to save weight') + parser.add_argument('--vis_data', action='store_true', default=False, + help='visualize images and labels.') + parser.add_argument('--vis_targets', action='store_true', default=False, + help='visualize assignment.') + + # Optimizer & Schedule + parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, + help='sgd, adamw') + parser.add_argument('--lr_schedule', default='step', type=str, + help='step, cos') + parser.add_argument('--grad_clip', default=None, type=float, + help='clip gradient') + + # model + parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') + parser.add_argument('--conf_thresh', default=0.001, type=float, + help='NMS threshold') + parser.add_argument('--nms_thresh', default=0.5, type=float, + help='NMS threshold') + + # dataset + parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') + parser.add_argument('-d', '--dataset', default='coco', + help='coco, widerface, crowdhuman') + + # Loss + parser.add_argument('--loss_obj_weight', default=1.0, type=float, + help='weight of obj loss') + parser.add_argument('--loss_cls_weight', default=1.0, type=float, + help='weight of cls loss') + parser.add_argument('--loss_reg_weight', default=1.0, type=float, + help='weight of reg loss') + parser.add_argument('--scale_loss', default='batch', type=str, + help='scale loss: batch or positive samples') + + # train trick + parser.add_argument('--no_warmup', action='store_true', default=False, + help='do not use warmup') + parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, + help='use multi-scale trick') + parser.add_argument('--ema', action='store_true', default=False, + help='use ema training trick') + parser.add_argument('--mosaic', action='store_true', default=False, + help='use Mosaic Augmentation trick') + parser.add_argument('--mixup', action='store_true', default=False, + help='use MixUp Augmentation 
trick') + parser.add_argument('--multi_anchor', action='store_true', default=False, + help='use multiple anchor boxes as the positive samples') + parser.add_argument('--center_sample', action='store_true', default=False, + help='use center sample for labels') + parser.add_argument('--accumulate', type=int, default=1, + help='accumulate gradient') + # DDP train + parser.add_argument('-dist', '--distributed', action='store_true', default=False, + help='distributed training') + parser.add_argument('--local_rank', type=int, default=0, + help='local_rank') + parser.add_argument('--sybn', action='store_true', default=False, + help='use sybn.') + parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using in amp, default O1') + + return parser.parse_args() + + +def train(): + args = parse_args() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # torch.npu.set_compile_mode(jit_compile=False) + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"]="enable" + option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" + option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" + print("option:",option) + # torch.npu.set_option(option) + print("Setting Arguments.. : ", args) + print("----------------------------------------------------------") + + # path to save model + path_to_save = os.path.join(args.save_folder, args.dataset, args.model) + os.makedirs(path_to_save, exist_ok=True) + + # set distributed + local_rank = 0 + if args.distributed: + dist.init_process_group(backend="hccl", #init_method="env://" + ) + local_rank = torch.distributed.get_rank() + print(local_rank) + torch_npu.npu.set_device(local_rank) + + # cuda + if args.npu: + print('use npu') + cudnn.benchmark = True + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO config + cfg = yolo_config[args.model] + train_size = val_size = args.img_size + + # dataset and evaluator + dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) + # dataloader + dataloader = build_dataloader(args, dataset, detection_collate) + # criterioin + criterion = build_criterion(args, cfg, num_classes) + + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + print("----------------------------------------------------------") + + # build model + net = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=True) + model = net + + # SyncBatchNorm + # if args.sybn and args.npu and args.num_gpu > 1: + # print('use SyncBatchNorm ...') + # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + model = model.to(device).train() + # compute FLOPs and Params + # if local_rank == 0: + # model_copy = deepcopy(model) + # model_copy.trainable = False + # model_copy.eval() + # FLOPs_and_Params(model=model_copy, size=train_size) + # model_copy.trainable = True + # model_copy.train() + # keep training + if args.resume is not None: + print('keep training model: %s' % (args.resume)) + model.load_state_dict(torch.load(args.resume, map_location=device)) + + # EMA + ema = ModelEMA(model) if args.ema else None + # use tfboard + tblogger = None + if args.tfboard: + print('use tensorboard') + from torch.utils.tensorboard import SummaryWriter + c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + log_path = os.path.join('log/', args.dataset, c_time) + os.makedirs(log_path, exist_ok=True) + + tblogger = SummaryWriter(log_path) + # optimizer setup + base_lr 
= args.lr + tmp_lr = args.lr + if args.optimizer == 'NpuFusedSGD': + print('use SGD with momentum ...') + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) + # optimizer = optim.SGD(model.parameters(), + # lr=tmp_lr, + # momentum=0.9, + # weight_decay=5e-4) + elif args.optimizer == 'adamw': + print('use AdamW ...') + optimizer = optim.AdamW(model.parameters(), + lr=tmp_lr, + weight_decay=5e-4) + + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) + + # DDP + if args.distributed and args.num_gpu > 1: + print('using DDP ...') + model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) + + + + + batch_size = args.batch_size + epoch_size = len(dataset) // (batch_size * args.num_gpu) + best_map = -100. + warmup = not args.no_warmup + + t0 = time.time() + # start training loop + for epoch in range(args.start_epoch, args.max_epoch): + if args.distributed: + dataloader.sampler.set_epoch(epoch) + + # use step lr decay + if args.lr_schedule == 'step': + if epoch in args.lr_epoch: + tmp_lr = tmp_lr * 0.1 + set_lr(optimizer, tmp_lr) + # use cos lr decay + elif args.lr_schedule == 'cos' and not warmup: + T_max = args.max_epoch - 15 + lr_min = base_lr * 0.1 * 0.1 + if epoch > T_max: + # Cos decay is done + print('Cosine annealing is over !!') + args.lr_schedule == None + tmp_lr = lr_min + set_lr(optimizer, tmp_lr) + else: + tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) + set_lr(optimizer, tmp_lr) + fps_sum=0 + # train one epoch + # pre_flag = False + # start_time = time.time() + for iter_i, (images, targets) in enumerate(dataloader): + # if iter_i == 5: + # start_time = time.time() + # with torch.autograd.profiler.profile(use_npu=True) as prof: + ni = iter_i + epoch * epoch_size + # warmup + if epoch < args.wp_epoch and warmup: + nw = args.wp_epoch * epoch_size + tmp_lr = base_lr * pow(ni / nw, 4) + set_lr(optimizer, tmp_lr) + + elif epoch == args.wp_epoch and iter_i == 0 and warmup: + # warmup is over + print('Warmup is over !!') + warmup = False + tmp_lr = base_lr + set_lr(optimizer, tmp_lr) + + # multi-scale trick + if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: + # randomly choose a new size + r = args.multi_scale_range + train_size = random.randint(r[0], r[1]) * 32 + model.module.set_grid(train_size) + if args.multi_scale: + # interpolate + images = torch.nn.functional.interpolate( + input=images, + size=train_size, + mode='bilinear', + align_corners=False) + + targets = [label.tolist() for label in targets] + # visualize target + if args.vis_data: + vis_data(images, targets) + continue + # make labels + targets = create_labels.gt_creator( + img_size=train_size, + strides=net.stride, + label_lists=targets, + anchor_size=cfg["anchor_size"], + multi_anchor=args.multi_anchor, + center_sample=args.center_sample) + # visualize assignment + if args.vis_targets: + vis_targets(images, targets, cfg["anchor_size"], net.stride) + continue + + # to device + images = images.to(device) + targets = targets.to(device) + + # inference + pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) + + # compute loss + loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) + + # check loss + if torch.isnan(total_loss): + continue + + loss_dict = dict( + loss_obj=loss_obj, + loss_cls=loss_cls, + loss_reg=loss_reg, + total_loss=total_loss + ) + loss_dict_reduced = 
distributed_utils.reduce_loss_dict(loss_dict) + + total_loss = total_loss / args.accumulate + # Backward and Optimize + with amp.scale_loss(total_loss , optimizer) as scaled_loss: + scaled_loss.backward() + if ni % args.accumulate == 0: + if args.grad_clip is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + optimizer.zero_grad() + + if args.ema: + ema.update(model) + + # display + # if iter_i % 10 == 0: + if args.tfboard: + # viz loss + tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) + tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) + tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) + + t1 = time.time() + print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]' + % (epoch+1, + args.max_epoch, + iter_i, + epoch_size, + tmp_lr, + loss_dict['loss_obj'].item(), + loss_dict['loss_cls'].item(), + loss_dict['loss_reg'].item(), + train_size, + t1-t0), + flush=True) + fps_sum = fps_sum + (batch_size*8 / (t1 - t0)) + t0 = time.time() + # if local_rank in [-1, 0]: + # epoch_time = time.time() - start_time + # if iter_i >= 5: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time))) + # else: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time))) + if iter_i > 0 and iter_i == 461: + fps_avg = fps_sum / 461 + print("fps:",fps_avg) + fps_sum = 0 + + # evaluation + if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch: + if evaluator is None: + print('No evaluator ...') + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '.pth')) + print('Keep training ...') + else: + print('eval ...') + # check ema + if args.ema: + model_eval = ema.ema + else: + model_eval = model.module if args.distributed else model + + # set eval mode + model_eval.trainable = False + model_eval.set_grid(val_size) + model_eval.eval() + + if local_rank == 0: + # evaluate + evaluator.evaluate(model_eval) + + cur_map = evaluator.map + if cur_map > best_map: + # update best-map + best_map = cur_map + # save model + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth')) + if args.tfboard: + if args.dataset == 'voc': + tblogger.add_scalar('07test/mAP', evaluator.map, epoch) + elif args.dataset == 'coco': + tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) + tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) + + if args.distributed: + # wait for all processes to synchronize + dist.barrier() + + # set train mode. 
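+ # restore training behaviour on the evaluated model copy before the next epoch starts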
+ model_eval.trainable = True + model_eval.set_grid(train_size) + model_eval.train() + + # close mosaic augmentation + if args.mosaic and args.max_epoch - epoch == 15: + print('close Mosaic Augmentation ...') + dataloader.dataset.mosaic = False + # close mixup augmentation + if args.mixup and args.max_epoch - epoch == 15: + print('close Mixup Augmentation ...') + dataloader.dataset.mixup = False + + if args.tfboard: + tblogger.close() + + +def build_dataset(args, train_size, val_size, device): + if args.dataset == 'voc': + data_dir = os.path.join(args.root, 'VOCdevkit') + num_classes = 20 + dataset = VOCDetection( + data_dir=data_dir, + img_size=train_size, + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = VOCAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size)) + + elif args.dataset == 'coco': + data_dir = os.path.join(args.root, 'COCO') + num_classes = 80 + dataset = COCODataset( + data_dir=data_dir, + img_size=train_size, + image_set='train2017', + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = COCOAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size) + ) + + else: + print('unknow dataset !! Only support voc and coco !!') + exit(0) + + return dataset, evaluator, num_classes + + +def build_dataloader(args, dataset, collate_fn=None): + # distributed + if args.distributed and args.num_gpu > 1: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True, + sampler=torch.utils.data.distributed.DistributedSampler(dataset) + ) + + else: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=True, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True + ) + return dataloader + + +def set_lr(optimizer, lr): + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +if __name__ == '__main__': + train() + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolonano.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolonano.sh new file mode 100644 index 0000000000..60e0967766 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolonano.sh @@ -0,0 +1,15 @@ +python train.py \ + --cuda \ + -d coco \ + -m yolo_nano \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 64 \ + --lr 0.001 \ + --img_size 512 \ + --max_epoch 160 \ + --lr_epoch 100 130 \ + --multi_scale \ + --multi_scale_range 10 16 \ + --multi_anchor \ + --ema + \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov1.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov1.sh new file mode 100644 index 0000000000..f02d280e1c --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov1.sh @@ -0,0 +1,16 @@ +python train.py \ + --cuda \ + -d coco \ + -m yolov1 \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --ema +FPS=`grep FPS $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "FPS:" '{print $2}'|tail -n +2|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` +#FPS=`awk 
'BEGIN{printf "%.2f\n",'${batch_size}'*'${perf}'}'` +echo "Final Performance images/sec : $FPS" \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3.sh new file mode 100644 index 0000000000..f6fc52db37 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3.sh @@ -0,0 +1,15 @@ +python train.py \ + --npu \ + -d coco \ + -m yolov3 \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ + --ema + \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_de.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_de.sh new file mode 100644 index 0000000000..0c4bc871b5 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_de.sh @@ -0,0 +1,15 @@ +python train.py \ + --cuda \ + -d coco \ + -m yolov3_de \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ + --ema + \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_spp.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_spp.sh new file mode 100644 index 0000000000..0494d09827 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov3_spp.sh @@ -0,0 +1,15 @@ +python train.py \ + --cuda \ + -d coco \ + -m yolov3_spp \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ + --ema + \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov4.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov4.sh new file mode 100644 index 0000000000..2258bb927b --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train_yolov4.sh @@ -0,0 +1,19 @@ +python train.py \ + --cuda \ + -d coco \ + -m yolov4 \ + --root /home/zzb/PyTorch_YOLO-Family-master \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 608 \ + --max_epoch 250 \ + --lr_epoch 130 180 \ + --multi_scale \ + --multi_scale_range 10 19 \ + --scale_loss batch \ + --accumulate 1 \ + --mosaic \ + --mixup \ + --multi_anchor \ + --ema + \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/__init__.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/box_ops.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/box_ops.py new file mode 100644 index 0000000000..6176d23478 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/box_ops.py @@ -0,0 +1,101 @@ +import math +import torch_npu +import torch + + +def iou_score(bboxes_a, bboxes_b, batch_size): + """ + Input:\n + bboxes_a : [B*N, 4] = [x1, y1, x2, y2] \n + bboxes_b : [B*N, 4] = [x1, y1, x2, y2] \n + + Output:\n + iou : [B, N] = [iou, ...] 
\n + """ + tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + + en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) + iou = area_i / (area_a + area_b - area_i + 1e-14) + + return iou.view(batch_size, -1) + + +def giou_score(bboxes_a, bboxes_b, batch_size): + """ + bbox_1 : [B*N, 4] = [x1, y1, x2, y2] + bbox_2 : [B*N, 4] = [x1, y1, x2, y2] + """ + # iou + tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + + en = (tl < br).float().prod(dim=1) + #en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) + area_u = area_a + area_b - area_i + iou = (area_i / (area_u + 1e-14)).clamp(0) + + # giou + tl = torch.min(bboxes_a[:, :2], bboxes_b[:, :2]) + br = torch.max(bboxes_a[:, 2:], bboxes_b[:, 2:]) + en = (tl < br).float().prod(dim=1) + #en = (tl < br).type(tl.type()).prod(dim=1) + area_c = torch.prod(br - tl, 1) * en # * ((tl < br).all()) + + giou = (iou - (area_c - area_u) / (area_c + 1e-14)) + + return giou.view(batch_size, -1) + + +def ciou_score(bboxes_a, bboxes_b, batch_size): + """ + Input:\n + bboxes_a : [B*N, 4] = [x1, y1, x2, y2] \n + bboxes_b : [B*N, 4] = [x1, y1, x2, y2] \n + + Output:\n + iou : [B, N] = [ciou, ...] \n + """ + tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + + en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) + iou = area_i / (area_a + area_b - area_i + 1e-7) + + cw = torch.max(bboxes_a[..., 2], bboxes_b[..., 2]) - torch.min(bboxes_a[..., 0], bboxes_b[..., 0]) + ch = torch.max(bboxes_a[..., 3], bboxes_b[..., 3]) - torch.min(bboxes_a[..., 1], bboxes_b[..., 1]) + + c2 = cw ** 2 + ch ** 2 + 1e-7 + rho2 = ((bboxes_b[..., 0] + bboxes_b[..., 2] - bboxes_a[..., 0] - bboxes_a[..., 2]) ** 2 + + (bboxes_b[..., 1] + bboxes_b[..., 3] - bboxes_a[..., 1] - bboxes_a[..., 3]) ** 2) / 4 + w1 = bboxes_a[..., 2] - bboxes_a[..., 0] + h1 = bboxes_a[..., 3] - bboxes_a[..., 1] + w2 = bboxes_b[..., 2] - bboxes_b[..., 0] + h2 = bboxes_b[..., 3] - bboxes_b[..., 1] + v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + with torch.no_grad(): + alpha = v / (v - iou + (1. 
+ 1e-7)) + + ciou = iou - (rho2 / c2 + v * alpha) + + return ciou.view(batch_size, -1) + + +if __name__ == '__main__': + bboxes_a = torch.tensor([[10, 10, 20, 20]]) + bboxes_b = torch.tensor([[13, 15, 27, 25]]) + iou = iou_score(bboxes_a, bboxes_b, 1) + print(iou) + giou = giou_score(bboxes_a, bboxes_b, 1) + print(giou) + ciou = ciou_score(bboxes_a, bboxes_b, 1) + print(ciou) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/com_flops_params.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/com_flops_params.py new file mode 100644 index 0000000000..be7efad152 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/com_flops_params.py @@ -0,0 +1,17 @@ +import torch +from thop import profile +import torch_npu +import torch.nn.functional as F + + +def FLOPs_and_Params(model, size): + device = model.device + x = torch.randn(1, 3, size, size).to(device) + + flops, params = profile(model, inputs=(x, )) + print('FLOPs : ', flops / 1e9, ' B') + print('Params : ', params / 1e6, ' M') + + +if __name__ == "__main__": + pass diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/create_labels.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/create_labels.py new file mode 100644 index 0000000000..3b32df3134 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/create_labels.py @@ -0,0 +1,240 @@ +import numpy as np +import torch +import torch_npu + + +def compute_iou(anchor_boxes, gt_box): + """ + Input: + anchor_boxes : ndarray -> [[xc_s, yc_s, anchor_w, anchor_h], ..., [xc_s, yc_s, anchor_w, anchor_h]]. + gt_box : ndarray -> [xc_s, yc_s, anchor_w, anchor_h]. + Output: + iou : ndarray -> [iou_1, iou_2, ..., iou_m], and m is equal to the number of anchor boxes. + """ + # compute the iou between anchor box and gt box + # First, change [xc_s, yc_s, anchor_w, anchor_h] -> [x1, y1, x2, y2] + # anchor box : + ab_x1y1_x2y2 = np.zeros([len(anchor_boxes), 4]) + ab_x1y1_x2y2[:, 0] = anchor_boxes[:, 0] - anchor_boxes[:, 2] / 2 # x1 + ab_x1y1_x2y2[:, 1] = anchor_boxes[:, 1] - anchor_boxes[:, 3] / 2 # y1 + ab_x1y1_x2y2[:, 2] = anchor_boxes[:, 0] + anchor_boxes[:, 2] / 2 # x2 + ab_x1y1_x2y2[:, 3] = anchor_boxes[:, 1] + anchor_boxes[:, 3] / 2 # y2 + w_ab, h_ab = anchor_boxes[:, 2], anchor_boxes[:, 3] + + # gt_box : + # We need to expand gt_box(ndarray) to the shape of anchor_boxes(ndarray), in order to compute IoU easily. + gt_box_expand = np.repeat(gt_box, len(anchor_boxes), axis=0) + + gb_x1y1_x2y2 = np.zeros([len(anchor_boxes), 4]) + gb_x1y1_x2y2[:, 0] = gt_box_expand[:, 0] - gt_box_expand[:, 2] / 2 # x1 + gb_x1y1_x2y2[:, 1] = gt_box_expand[:, 1] - gt_box_expand[:, 3] / 2 # y1 + gb_x1y1_x2y2[:, 2] = gt_box_expand[:, 0] + gt_box_expand[:, 2] / 2 # x2 + gb_x1y1_x2y2[:, 3] = gt_box_expand[:, 1] + gt_box_expand[:, 3] / 2 # y1 + w_gt, h_gt = gt_box_expand[:, 2], gt_box_expand[:, 3] + + # Then we compute IoU between anchor_box and gt_box + S_gt = w_gt * h_gt + S_ab = w_ab * h_ab + I_w = np.minimum(gb_x1y1_x2y2[:, 2], ab_x1y1_x2y2[:, 2]) - np.maximum(gb_x1y1_x2y2[:, 0], ab_x1y1_x2y2[:, 0]) + I_h = np.minimum(gb_x1y1_x2y2[:, 3], ab_x1y1_x2y2[:, 3]) - np.maximum(gb_x1y1_x2y2[:, 1], ab_x1y1_x2y2[:, 1]) + S_I = I_h * I_w + U = S_gt + S_ab - S_I + 1e-20 + IoU = S_I / U + + return IoU + + +def set_anchors(anchor_size): + """ + Input: + anchor_size : list -> [[h_1, w_1], [h_2, w_2], ..., [h_n, w_n]]. + Output: + anchor_boxes : ndarray -> [[0, 0, anchor_w, anchor_h], + [0, 0, anchor_w, anchor_h], + ... + [0, 0, anchor_w, anchor_h]]. 
+ """ + num_anchors = len(anchor_size) + anchor_boxes = np.zeros([num_anchors, 4]) + for index, size in enumerate(anchor_size): + anchor_w, anchor_h = size + anchor_boxes[index] = np.array([0, 0, anchor_w, anchor_h]) + + return anchor_boxes + + +def label_assignment_with_anchorbox(anchor_size, target_boxes, num_anchors, strides, multi_anchor=False): + # prepare + anchor_boxes = set_anchors(anchor_size) + gt_box = np.array([[0, 0, target_boxes[2], target_boxes[3]]]) + + # compute IoU + iou = compute_iou(anchor_boxes, gt_box) + + label_assignment_results = [] + if multi_anchor: + # We consider those anchor boxes whose IoU is more than 0.5, + iou_mask = (iou > 0.5) + if iou_mask.sum() == 0: + # We assign the anchor box with highest IoU score. + iou_ind = np.argmax(iou) + + # scale_ind, anchor_ind = index // num_scale, index % num_scale + scale_ind = iou_ind // num_anchors + anchor_ind = iou_ind - scale_ind * num_anchors + + # get the corresponding stride + stride = strides[scale_ind] + + # compute the grid cell + xc_s = target_boxes[0] / stride + yc_s = target_boxes[1] / stride + grid_x = int(xc_s) + grid_y = int(yc_s) + + label_assignment_results.append([grid_x, grid_y, scale_ind, anchor_ind]) + else: + for iou_ind, iou_m in enumerate(iou_mask): + if iou_m: + # scale_ind, anchor_ind = index // num_scale, index % num_scale + scale_ind = iou_ind // num_anchors + anchor_ind = iou_ind - scale_ind * num_anchors + + # get the corresponding stride + stride = strides[scale_ind] + + # compute the gride cell + xc_s = target_boxes[0] / stride + yc_s = target_boxes[1] / stride + grid_x = int(xc_s) + grid_y = int(yc_s) + + label_assignment_results.append([grid_x, grid_y, scale_ind, anchor_ind]) + + else: + # We assign the anchor box with highest IoU score. + iou_ind = np.argmax(iou) + + # scale_ind, anchor_ind = index // num_scale, index % num_scale + scale_ind = iou_ind // num_anchors + anchor_ind = iou_ind - scale_ind * num_anchors + + # get the corresponding stride + stride = strides[scale_ind] + + # compute the grid cell + xc_s = target_boxes[0] / stride + yc_s = target_boxes[1] / stride + grid_x = int(xc_s) + grid_y = int(yc_s) + + label_assignment_results.append([grid_x, grid_y, scale_ind, anchor_ind]) + + return label_assignment_results + + +def label_assignment_without_anchorbox(target_boxes, strides): + # no anchor box + scale_ind = 0 + anchor_ind = 0 + + label_assignment_results = [] + # get the corresponding stride + stride = strides[scale_ind] + + # compute the grid cell + xc_s = target_boxes[0] / stride + yc_s = target_boxes[1] / stride + grid_x = int(xc_s) + grid_y = int(yc_s) + + label_assignment_results.append([grid_x, grid_y, scale_ind, anchor_ind]) + + return label_assignment_results + + +def gt_creator(img_size, strides, label_lists, anchor_size=None, multi_anchor=False, center_sample=False): + """creator gt""" + # prepare + batch_size = len(label_lists) + img_h = img_w = img_size + num_scale = len(strides) + gt_tensor = [] + KA = len(anchor_size) // num_scale if anchor_size is not None else 1 + + for s in strides: + fmp_h, fmp_w = img_h // s, img_w // s + # [B, H, W, KA, obj+cls+box+scale] + gt_tensor.append(np.zeros([batch_size, fmp_h, fmp_w, KA, 1+1+4+1])) + + # generate gt datas + for bi in range(batch_size): + label = label_lists[bi] + for box_cls in label: + # get a bbox coords + cls_id = int(box_cls[-1]) + x1, y1, x2, y2 = box_cls[:-1] + # [x1, y1, x2, y2] -> [xc, yc, bw, bh] + xc = (x2 + x1) / 2 * img_w + yc = (y2 + y1) / 2 * img_h + bw = (x2 - x1) * img_w + bh = (y2 - y1) * 
img_h + target_boxes = [xc, yc, bw, bh] + box_scale = 2.0 - (bw / img_w) * (bh / img_h) + + # check label + if bw < 1. or bh < 1.: + # print('A dirty data !!!') + continue + + # label assignment + if anchor_size is not None: + # use anchor box + label_assignment_results = label_assignment_with_anchorbox( + anchor_size=anchor_size, + target_boxes=target_boxes, + num_anchors=KA, + strides=strides, + multi_anchor=multi_anchor) + else: + # no anchor box + label_assignment_results = label_assignment_without_anchorbox( + target_boxes=target_boxes, + strides=strides) + + # make labels + for result in label_assignment_results: + grid_x, grid_y, scale_ind, anchor_ind = result + + if center_sample: + # We consider four grid points near the center point + for j in range(grid_y, grid_y+2): + for i in range(grid_x, grid_x+2): + if (j >= 0 and j < gt_tensor[scale_ind].shape[1]) and (i >= 0 and i < gt_tensor[scale_ind].shape[2]): + gt_tensor[scale_ind][bi, j, i, anchor_ind, 0] = 1.0 + gt_tensor[scale_ind][bi, j, i, anchor_ind, 1] = cls_id + gt_tensor[scale_ind][bi, j, i, anchor_ind, 2:6] = np.array([x1, y1, x2, y2]) + gt_tensor[scale_ind][bi, j, i, anchor_ind, 6] = box_scale + else: + # We ongly consider top-left grid point near the center point + if (grid_y >= 0 and grid_y < gt_tensor[scale_ind].shape[1]) and (grid_x >= 0 and grid_x < gt_tensor[scale_ind].shape[2]): + gt_tensor[scale_ind][bi, grid_y, grid_x, anchor_ind, 0] = 1.0 + gt_tensor[scale_ind][bi, grid_y, grid_x, anchor_ind, 1] = cls_id + gt_tensor[scale_ind][bi, grid_y, grid_x, anchor_ind, 2:6] = np.array([x1, y1, x2, y2]) + gt_tensor[scale_ind][bi, grid_y, grid_x, anchor_ind, 6] = box_scale + + gt_tensor = [gt.reshape(batch_size, -1, 1+1+4+1) for gt in gt_tensor] + gt_tensor = np.concatenate(gt_tensor, axis=1) + + return torch.from_numpy(gt_tensor).float() + + +if __name__ == "__main__": + gt_box = np.array([[0.0, 0.0, 10, 10]]) + anchor_boxes = np.array([[0.0, 0.0, 10, 10], + [0.0, 0.0, 4, 4], + [0.0, 0.0, 8, 8], + [0.0, 0.0, 16, 16] + ]) + iou = compute_iou(anchor_boxes, gt_box) + print(iou) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/criterion.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/criterion.py new file mode 100644 index 0000000000..fd07debdb9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/criterion.py @@ -0,0 +1,192 @@ +import torch.nn as nn +import torch.nn.functional as F +import torch_npu + + +class MSEWithLogitsLoss(nn.Module): + def __init__(self, reduction='mean'): + super().__init__() + self.reduction = reduction + + def forward(self, logits, targets, target_pos): + inputs = logits.sigmoid() + # mse loss + loss = F.mse_loss(input=inputs, + target=targets, + reduction="none") + pos_loss = loss * target_pos * 5.0 + neg_loss = loss * (1.0 - target_pos) * 1.0 + loss = pos_loss + neg_loss + + if self.reduction == 'mean': + loss = loss.mean() + + elif self.reduction == 'sum': + loss = loss.sum() + + return loss + + +class BCEWithLogitsLoss(nn.Module): + def __init__(self, pos_weight=1.0, neg_weight=0.25, reduction='mean'): + super().__init__() + self.pos_weight = pos_weight + self.neg_weight = neg_weight + self.reduction = reduction + + def forward(self, logits, targets, target_pos): + # bce loss + loss = F.binary_cross_entropy_with_logits(input=logits, target=targets, reduction="none") + pos_loss = loss * target_pos * self.pos_weight + neg_loss = loss * (1.0 - target_pos) * self.neg_weight + loss = pos_loss + neg_loss + + if self.reduction == 'mean': + loss = loss.mean() + + elif 
self.reduction == 'sum': + loss = loss.sum() + + return loss + + +class Criterion(nn.Module): + def __init__(self, + args, + cfg, + loss_obj_weight=1.0, + loss_cls_weight=1.0, + loss_reg_weight=1.0, + num_classes=80): + super().__init__() + self.args = args + self.num_classes = num_classes + self.loss_obj_weight = loss_obj_weight + self.loss_cls_weight = loss_cls_weight + self.loss_reg_weight = loss_reg_weight + + # objectness loss + try: + if cfg['loss_obj'] == 'mse': + self.obj_loss_f = MSEWithLogitsLoss(reduction='none') + elif cfg['loss_obj'] == 'bce': + self.obj_loss_f = BCEWithLogitsLoss(reduction='none') + except: + self.obj_loss_f = MSEWithLogitsLoss(reduction='none') + # class loss + self.cls_loss_f = nn.CrossEntropyLoss(reduction='none') + + + def loss_objectness(self, pred_obj, target_obj, target_pos): + """ + pred_obj: (FloatTensor) [B, HW, 1] + target_obj: (FloatTensor) [B, HW,] + target_pos: (FloatTensor) [B, HW,] + """ + # obj loss: [B, HW,] + loss_obj = self.obj_loss_f(pred_obj[..., 0], target_obj, target_pos) + + if self.args.scale_loss == 'batch': + # scale loss by batch size + batch_size = pred_obj.size(0) + loss_obj = loss_obj.sum() / batch_size + elif self.args.scale_loss == 'positive': + # scale loss by number of positive samples + num_pos = target_pos.sum().clamp(1.0) + loss_obj = loss_obj.sum() / num_pos + + return loss_obj + + + def loss_class(self, pred_cls, target_cls, target_pos): + """ + pred_cls: (FloatTensor) [B, HW, C] + target_cls: (LongTensor) [B, HW,] + target_pos: (FloatTensor) [B, HW,] + """ + # [B, HW, C] -> [B, C, HW] + pred_cls = pred_cls.permute(0, 2, 1) + # reg loss: [B, HW, ] + loss_cls = self.cls_loss_f(pred_cls, target_cls) + # valid loss. Here we only compute the loss of positive samples + loss_cls = loss_cls * target_pos + + if self.args.scale_loss == 'batch': + # scale loss by batch size + batch_size = pred_cls.size(0) + loss_cls = loss_cls.sum() / batch_size + elif self.args.scale_loss == 'positive': + # scale loss by number of positive samples + num_pos = target_pos.sum().clamp(1.0) + loss_cls = loss_cls.sum() / num_pos + + return loss_cls + + + def loss_bbox(self, pred_iou, target_pos, target_scale): + """ + pred_iou: (FloatTensor) [B, HW, ] + target_pos: (FloatTensor) [B, HW,] + target_scale: (FloatTensor) [B, HW,] + """ + + # bbox loss: [B, HW,] + loss_reg = 1. - pred_iou + loss_reg = loss_reg * target_scale + # valid loss. 
Here we only compute the loss of positive samples + loss_reg = loss_reg * target_pos + + if self.args.scale_loss == 'batch': + # scale loss by batch size + batch_size = pred_iou.size(0) + loss_reg = loss_reg.sum() / batch_size + elif self.args.scale_loss == 'positive': + # scale loss by number of positive samples + num_pos = target_pos.sum().clamp(1.0) + loss_reg = loss_reg.sum() / num_pos + + return loss_reg + + + def forward(self, pred_obj, pred_cls, pred_iou, targets): + """ + pred_obj: (Tensor) [B, HW, 1] + pred_cls: (Tensor) [B, HW, C] + pred_iou: (Tensor) [B, HW,] + targets: (Tensor) [B, HW, 1+1+1+4] + """ + # groundtruth + target_obj = targets[..., 0].float() # [B, HW,] + target_pos = targets[..., 1].float() # [B, HW,] + target_cls = targets[..., 2].long() # [B, HW,] + target_scale = targets[..., -1].float() # [B, HW,] + + # objectness loss + loss_obj = self.loss_objectness(pred_obj, target_obj, target_pos) + + # class loss + loss_cls = self.loss_class(pred_cls, target_cls, target_pos) + + # regression loss + loss_reg = self.loss_bbox(pred_iou, target_pos, target_scale) + + # total loss + losses = self.loss_obj_weight * loss_obj + \ + self.loss_cls_weight * loss_cls + \ + self.loss_reg_weight * loss_reg + + return loss_obj, loss_cls, loss_reg, losses + + +def build_criterion(args, cfg, num_classes=80): + criterion = Criterion(args=args, + cfg=cfg, + loss_obj_weight=args.loss_obj_weight, + loss_cls_weight=args.loss_cls_weight, + loss_reg_weight=args.loss_reg_weight, + num_classes=num_classes) + return criterion + + +if __name__ == "__main__": + pass diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/distributed_utils.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/distributed_utils.py new file mode 100644 index 0000000000..33333f4d2c --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/distributed_utils.py @@ -0,0 +1,77 @@ +# from github: https://github.com/ruinmessi/ASFF/blob/master/utils/distributed_util.py + +import torch +import time +import torch_npu + +def get_world_size(): + if not torch.distributed.is_initialized(): + return 1 + return torch.distributed.get_world_size() + + +def get_rank(): + if not torch.distributed.is_initialized(): + return 0 + return torch.distributed.get_rank() + + +def is_main_process(): + if not torch.distributed.is_initialized(): + return True + return torch.distributed.get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize between multiple processes when + using distributed training + """ + if not torch.distributed.is_initialized(): + return + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + if world_size == 1: + return + + def _send_and_wait(r): + if rank == r: + tensor = torch.tensor(0, device="npu") + else: + tensor = torch.tensor(1, device="npu") + torch.distributed.broadcast(tensor, r) + while tensor.item() == 1: + time.sleep(1) + + _send_and_wait(0) + # now sync on the main process + _send_and_wait(1) + + +def reduce_loss_dict(loss_dict): + """ + Reduce the loss dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + loss_dict, after reduction. 
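Every term in the `Criterion` class above follows the same pattern: compute a per-location loss, zero it out everywhere except positive samples via `target_pos`, then normalize either by batch size or by the clamped positive count. Below is a minimal standalone sketch of that pattern for the classification term; the tensor shapes and mask values are made up purely for illustration and do not come from the repo.

```python
import torch
import torch.nn.functional as F

# Toy shapes: batch B=2, HW=4 locations, C=3 classes (illustrative only).
pred_cls = torch.randn(2, 4, 3)                 # raw logits [B, HW, C]
target_cls = torch.randint(0, 3, (2, 4))        # class index per location [B, HW]
target_pos = torch.tensor([[1., 0., 0., 1.],
                           [0., 1., 0., 0.]])   # positive-sample mask [B, HW]

# Same pattern as Criterion.loss_class: per-location cross-entropy, masked to
# positives, then normalized by the clamped number of positive samples.
loss = F.cross_entropy(pred_cls.permute(0, 2, 1), target_cls, reduction='none')
loss = loss * target_pos
loss_cls = loss.sum() / target_pos.sum().clamp(min=1.0)
print(loss_cls)
```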
+ """ + world_size = get_world_size() + if world_size < 2: + return loss_dict + with torch.no_grad(): + loss_names = [] + all_losses = [] + for k in sorted(loss_dict.keys()): + loss_names.append(k) + if len(loss_dict[k].size()) == 0: + all_losses.append(loss_dict[k].unsqueeze(0)) + else: + all_losses.append(loss_dict[k]) + + all_losses = torch.stack(all_losses, dim=0) + torch.distributed.reduce(all_losses, dst=0) + if torch.distributed.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + all_losses /= world_size + reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} + return reduced_losses \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/fuse_conv_bn.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/fuse_conv_bn.py new file mode 100644 index 0000000000..97794e3b0e --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/fuse_conv_bn.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch_npu + + +def _fuse_conv_bn(conv, bn): + """Fuse conv and bn into one module. + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + Returns: + nn.Module: Fused module. + """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module): + """Recursively fuse conv and bn in a module. + During inference, the functionary of batch norm layers is turned off + but only the mean and var alone channels are used, which exposes the + chance to fuse it with the preceding conv layers to save computations and + simplify network structures. + Args: + module (nn.Module): Module to be fused. + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. 
+ module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/kmeans_anchor.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/kmeans_anchor.py new file mode 100644 index 0000000000..2c8a0c10eb --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/kmeans_anchor.py @@ -0,0 +1,230 @@ +import numpy as np +import random +import argparse +import os +import sys +sys.path.append('..') + +from data.voc import VOCDetection +from data.coco import COCODataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='kmeans for anchor box') + parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') + parser.add_argument('-d', '--dataset', default='coco', + help='coco, widerface, crowdhuman') + parser.add_argument('-na', '--num_anchorbox', default=9, type=int, + help='number of anchor box.') + parser.add_argument('-size', '--img_size', default=512, type=int, + help='input size.') + return parser.parse_args() + +args = parse_args() + + +class Box(): + def __init__(self, x, y, w, h): + self.x = x + self.y = y + self.w = w + self.h = h + + +def iou(box1, box2): + x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h + x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h + + S_1 = w1 * h1 + S_2 = w2 * h2 + + xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2 + xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2 + xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2 + xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2 + + I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2) + I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2) + if I_w < 0 or I_h < 0: + return 0 + I = I_w * I_h + + IoU = I / (S_1 + S_2 - I) + + return IoU + + +def init_centroids(boxes, n_anchors): + """ + We use kmeans++ to initialize centroids. 
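The folding in `_fuse_conv_bn` above rewrites the conv weight as `W * gamma / sqrt(var + eps)` and absorbs the BN shift into the conv bias, so in eval mode the fused conv should reproduce the original conv-then-BN output. Below is a small standalone check of that identity with a hand-built fused layer; the layer sizes and randomized statistics are illustrative assumptions, not values from the repo.

```python
import torch
import torch.nn as nn

# Standalone check of the conv+BN folding used by _fuse_conv_bn above: in eval
# mode the fused conv should match conv -> BN up to float error.
conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(8)
bn.weight.data.uniform_(0.5, 1.5)        # pretend these were learned / accumulated
bn.bias.data.uniform_(-0.5, 0.5)
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)
conv.eval()
bn.eval()

factor = (bn.weight / torch.sqrt(bn.running_var + bn.eps)).detach()
fused = nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=True)
fused.weight.data = conv.weight.data * factor.reshape(8, 1, 1, 1)
fused.bias.data = (torch.zeros(8) - bn.running_mean) * factor + bn.bias.data
fused.eval()

x = torch.randn(1, 3, 16, 16)
with torch.no_grad():
    print(torch.allclose(bn(conv(x)), fused(x), atol=1e-5))   # expected: True
```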
+ """ + centroids = [] + boxes_num = len(boxes) + + centroid_index = int(np.random.choice(boxes_num, 1)[0]) + centroids.append(boxes[centroid_index]) + print(centroids[0].w,centroids[0].h) + + for centroid_index in range(0, n_anchors-1): + sum_distance = 0 + distance_thresh = 0 + distance_list = [] + cur_sum = 0 + + for box in boxes: + min_distance = 1 + for centroid_i, centroid in enumerate(centroids): + distance = (1 - iou(box, centroid)) + if distance < min_distance: + min_distance = distance + sum_distance += min_distance + distance_list.append(min_distance) + + distance_thresh = sum_distance * np.random.random() + + for i in range(0, boxes_num): + cur_sum += distance_list[i] + if cur_sum > distance_thresh: + centroids.append(boxes[i]) + print(boxes[i].w, boxes[i].h) + break + return centroids + + +def do_kmeans(n_anchors, boxes, centroids): + loss = 0 + groups = [] + new_centroids = [] + # for box in centroids: + # print('box: ', box.x, box.y, box.w, box.h) + # exit() + for i in range(n_anchors): + groups.append([]) + new_centroids.append(Box(0, 0, 0, 0)) + + for box in boxes: + min_distance = 1 + group_index = 0 + for centroid_index, centroid in enumerate(centroids): + distance = (1 - iou(box, centroid)) + if distance < min_distance: + min_distance = distance + group_index = centroid_index + groups[group_index].append(box) + loss += min_distance + new_centroids[group_index].w += box.w + new_centroids[group_index].h += box.h + + for i in range(n_anchors): + new_centroids[i].w /= max(len(groups[i]), 1) + new_centroids[i].h /= max(len(groups[i]), 1) + + return new_centroids, groups, loss# / len(boxes) + + +def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True): + """ + This function will use k-means to get appropriate anchor boxes for train dataset. + Input: + total_gt_boxes: + n_anchor : int -> the number of anchor boxes. + loss_convergence : float -> threshold of iterating convergence. + iters: int -> the number of iterations for training kmeans. + Output: anchor_boxes : list -> [[w1, h1], [w2, h2], ..., [wn, hn]]. 
+ """ + boxes = total_gt_boxes + centroids = [] + if plus: + centroids = init_centroids(boxes, n_anchors) + else: + total_indexs = range(len(boxes)) + sample_indexs = random.sample(total_indexs, n_anchors) + for i in sample_indexs: + centroids.append(boxes[i]) + + # iterate k-means + centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids) + iterations = 1 + while(True): + centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids) + iterations += 1 + print("Loss = %f" % loss) + if abs(old_loss - loss) < loss_convergence or iterations > iters: + break + old_loss = loss + + for centroid in centroids: + print(centroid.w, centroid.h) + + print("k-means result : ") + for centroid in centroids: + print("w, h: ", round(centroid.w, 2), round(centroid.h, 2), + "area: ", round(centroid.w, 2) * round(centroid.h, 2)) + + return centroids + + +if __name__ == "__main__": + + n_anchors = args.num_anchorbox + img_size = args.img_size + dataset = args.dataset + + loss_convergence = 1e-6 + iters_n = 1000 + + dataset_voc = VOCDetection(data_dir=os.path.join(args.root, 'VOCdevkit'), + img_size=img_size) + + dataset_coco = COCODataset(data_dir=os.path.join(args.root, 'COCO'), + img_size=img_size) + + boxes = [] + print("The dataset size: ", len(dataset)) + print("Loading the dataset ...") + # VOC + for i in range(len(dataset_voc)): + if i % 5000 == 0: + print('Loading voc data [%d / %d]' % (i+1, len(dataset_voc))) + + # For VOC + img, _ = dataset_voc.pull_image(i) + w, h = img.shape[1], img.shape[0] + _, annotation = dataset_voc.pull_anno(i) + + # prepare bbox datas + for box_and_label in annotation: + box = box_and_label[:-1] + xmin, ymin, xmax, ymax = box + bw = (xmax - xmin) / max(w, h) * img_size + bh = (ymax - ymin) / max(w, h) * img_size + # check bbox + if bw < 1.0 or bh < 1.0: + continue + boxes.append(Box(0, 0, bw, bh)) + + # COCO + for i in range(len(dataset_coco)): + if i % 5000 == 0: + print('Loading coco datat [%d / %d]' % (i+1, len(dataset_coco))) + + # For COCO + img, _ = dataset_coco.pull_image(i) + w, h = img.shape[1], img.shape[0] + annotation = dataset_coco.pull_anno(i) + + # prepare bbox datas + for box_and_label in annotation: + box = box_and_label[:-1] + xmin, ymin, xmax, ymax = box + bw = (xmax - xmin) / max(w, h) * img_size + bh = (ymax - ymin) / max(w, h) * img_size + # check bbox + if bw < 1.0 or bh < 1.0: + continue + boxes.append(Box(0, 0, bw, bh)) + + print("Number of all bboxes: ", len(boxes)) + print("Start k-means !") + centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/misc.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/misc.py new file mode 100644 index 0000000000..583cefd71f --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/misc.py @@ -0,0 +1,149 @@ +import torch +import torch.nn as nn +import numpy as np +import math +from copy import deepcopy +import torch_npu + + +def nms(dets, scores, nms_thresh=0.4): + """"Pure Python NMS baseline.""" + x1 = dets[:, 0] #xmin + y1 = dets[:, 1] #ymin + x2 = dets[:, 2] #xmax + y2 = dets[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) # the size of bbox + order = scores.argsort()[::-1] # sort bounding boxes by decreasing order + + keep = [] # store the final bounding boxes + while order.size > 0: + i = order[0] #the index of the bbox with highest confidence + keep.append(i) #save it to keep + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + 
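The anchor script above clusters box shapes with a 1 - IoU distance, where every box is treated as centered at the origin so the intersection reduces to `min(w) * min(h)`. As a compact illustration of that idea (this is not the repo's kmeans++ implementation: it uses a plain mean update, a fixed iteration count, and synthetic (w, h) pairs), the sketch below clusters made-up box shapes into three anchors.

```python
import numpy as np

def wh_iou(wh, centroids):
    """IoU between (w, h) shapes and centroids when all boxes share one center."""
    inter = np.minimum(wh[:, None, 0], centroids[None, :, 0]) * \
            np.minimum(wh[:, None, 1], centroids[None, :, 1])
    union = wh[:, 0:1] * wh[:, 1:2] + centroids[:, 0] * centroids[:, 1] - inter
    return inter / (union + 1e-20)

def kmeans_anchors(wh, k=3, iters=50, seed=0):
    rng = np.random.default_rng(seed)
    centroids = wh[rng.choice(len(wh), k, replace=False)].astype(float)
    for _ in range(iters):
        assign = np.argmax(wh_iou(wh, centroids), axis=1)   # nearest = highest IoU
        for j in range(k):
            members = wh[assign == j]
            if len(members):
                centroids[j] = members.mean(axis=0)
    return centroids[np.argsort(centroids[:, 0] * centroids[:, 1])]

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    wh = np.concatenate([rng.normal(32, 4, (200, 2)),
                         rng.normal(96, 8, (200, 2)),
                         rng.normal(256, 16, (200, 2))]).clip(min=1)
    print(kmeans_anchors(wh, k=3))   # roughly recovers the three underlying scales
```

The real `kmeans_anchor.py` additionally keeps the kmeans++ initialization and a loss-convergence threshold, and draws its (w, h) pairs from the VOC and COCO annotations instead of synthetic data.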
yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-28, xx2 - xx1) + h = np.maximum(1e-28, yy2 - yy1) + inter = w * h + + # Cross Area / (bbox + particular area - Cross Area) + ovr = inter / (areas[i] + areas[order[1:]] - inter) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(ovr <= nms_thresh)[0] + order = order[inds + 1] + + return keep + + +def is_parallel(model): + # Returns True if model is of type DP or DDP + return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) + + +def detection_collate(batch): + """Custom collate fn for dealing with batches of images that have a different + number of associated object annotations (bounding boxes). + + Arguments: + batch: (tuple) A tuple of tensor images and lists of annotations + + Return: + A tuple containing: + 1) (tensor) batch of images stacked on their 0 dim + 2) (list of tensors) annotations for a given image are stacked on + 0 dim + """ + targets = [] + imgs = [] + for sample in batch: + imgs.append(sample[0]) + targets.append(torch.FloatTensor(sample[1])) + return torch.stack(imgs, 0), targets + + +# Model EMA +class ModelEMA(object): + def __init__(self, model, decay=0.9999, updates=0): + # create EMA + self.ema = deepcopy(model.module if is_parallel(model) else model).eval() # FP32 EMA + self.updates = updates + self.decay = lambda x: decay * (1 - math.exp(-x / 2000.)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = model.module.state_dict() if is_parallel(model) else model.state_dict() # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1. 
- d) * msd[k].detach() + + +# test time augmentation(TTA) +class TestTimeAugmentation(object): + def __init__(self, num_classes=80, nms_thresh=0.4, scale_range=[320, 640, 32]): + self.nms = nms + self.num_classes = num_classes + self.nms_thresh = nms_thresh + self.scales = np.arange(scale_range[0], scale_range[1]+1, scale_range[2]) + + def __call__(self, x, model): + # x: Tensor -> [B, C, H, W] + bboxes_list = [] + scores_list = [] + labels_list = [] + + # multi scale + for s in self.scales: + if x.size(-1) == s and x.size(-2) == s: + x_scale = x + else: + x_scale =torch.nn.functional.interpolate( + input=x, + size=(s, s), + mode='bilinear', + align_corners=False) + model.set_grid(s) + bboxes, scores, labels = model(x_scale) + bboxes_list.append(bboxes) + scores_list.append(scores) + labels_list.append(labels) + + # Flip + x_flip = torch.flip(x_scale, [-1]) + bboxes, scores, labels = model(x_flip) + bboxes = bboxes.copy() + bboxes[:, 0::2] = 1.0 - bboxes[:, 2::-2] + bboxes_list.append(bboxes) + scores_list.append(scores) + labels_list.append(labels) + + bboxes = np.concatenate(bboxes_list) + scores = np.concatenate(scores_list) + labels = np.concatenate(labels_list) + + # nms + keep = np.zeros(len(bboxes), dtype=np.int) + for i in range(self.num_classes): + inds = np.where(labels == i)[0] + if len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = self.nms(c_bboxes, c_scores, self.nms_thresh) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + bboxes = bboxes[keep] + scores = scores[keep] + labels = labels[keep] + + return bboxes, scores, labels diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/utils/vis.py b/PyTorch/contrib/cv/detection/YoloV2-640/utils/vis.py new file mode 100644 index 0000000000..19bc181d65 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/utils/vis.py @@ -0,0 +1,106 @@ +import numpy as np +import cv2 + + +def vis_data(images, targets): + """ + images: (tensor) [B, 3, H, W] + targets: (list) a list of targets + """ + batch_size = images.size(0) + # vis data + rgb_mean=np.array((0.406, 0.456, 0.485), dtype=np.float32) + rgb_std=np.array((0.225, 0.224, 0.229), dtype=np.float32) + + for bi in range(batch_size): + # to numpy + image = images[bi].permute(1, 2, 0).cpu().numpy() + # to BGR + image = image[..., (2, 1, 0)] + # denormalize + image = ((image * rgb_std + rgb_mean)*255).astype(np.uint8) + image = image.copy() + img_h, img_w = image.shape[:2] + + targets_i = targets[bi] + for target in targets_i: + x1, y1, x2, y2 = target[:-1] + x1 = int(x1 * img_w) + y1 = int(y1 * img_h) + x2 = int(x2 * img_w) + y2 = int(y2 * img_h) + cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2) + + cv2.imshow('groundtruth', image) + cv2.waitKey(0) + + +def vis_targets(images, targets, anchor_sizes=None, strides=[8, 16, 32]): + """ + images: (tensor) [B, 3, H, W] + targets: (tensor) [B, HW*KA, 1+1+4+1] + anchor_sizes: (List) + strides: (List[Int]) output stride of network + """ + batch_size = images.size(0) + KA = len(anchor_sizes) // len(strides) if anchor_sizes is not None else 1 + # vis data + rgb_mean=np.array((0.485, 0.456, 0.406), dtype=np.float32) + rgb_std=np.array((0.229, 0.224, 0.225), dtype=np.float32) + + for bi in range(batch_size): + # to numpy + image = images[bi].permute(1, 2, 0).cpu().numpy() + # denormalize + image = ((image * rgb_std + rgb_mean)*255).astype(np.uint8) + # to BGR + image = image[..., (2, 1, 0)] + image = image.copy() + img_h, img_w = image.shape[:2] + + target_i = targets[bi] # [HW*KA, 1+1+4+1] + N = 0 
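`ModelEMA` above does not apply a fixed decay: it ramps it in with `d(x) = decay * (1 - exp(-x / 2000))`, so the first few thousand updates track the live weights almost exactly and the average only becomes slow later in training. The tiny sketch below just evaluates that schedule and applies it to one scalar weight; the step values are arbitrary.

```python
import math

decay = lambda x: 0.9999 * (1. - math.exp(-x / 2000.))   # same ramp as ModelEMA

for step in (1, 100, 1000, 5000, 20000):
    print(f"update {step:>6d}: d = {decay(step):.4f}")

# One EMA update of a single scalar weight: ema = d * ema + (1 - d) * w
w_ema, w = 1.0, 0.0
for step in range(1, 4):
    d = decay(step)
    w_ema = d * w_ema + (1. - d) * w
print(w_ema)   # still very close to w, because d is tiny for the first updates
```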
+ for si, s in enumerate(strides): + fmp_h, fmp_w = img_h // s, img_w // s + HWKA = fmp_h * fmp_w * KA + targets_i_s = target_i[N:N+HWKA] + N += HWKA + # [HW*KA, 1+1+4+1] -> [H, W, KA, 1+1+4+1] + targets_i_s = targets_i_s.reshape(fmp_h, fmp_w, KA, -1) + for j in range(fmp_h): + for i in range(fmp_w): + for k in range(KA): + target = targets_i_s[j, i, k] # [1+1+4+1,] + if target[0] > 0.: + # gt box + box = target[2:6] + x1, y1, x2, y2 = box + # denormalize bbox + x1 = int(x1 * img_w) + y1 = int(y1 * img_h) + x2 = int(x2 * img_w) + y2 = int(y2 * img_h) + cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2) + + if anchor_sizes is not None: + # anchor box + anchor_size = anchor_sizes[si*KA + k] + x_anchor = (i) * s + y_anchor = (j) * s + w_anchor, h_anchor = anchor_size + anchor_box = [x_anchor, y_anchor, w_anchor, h_anchor] + print('stride: {} - anchor box: ({}, {}, {}, {})'.format(s, *anchor_box)) + x1_a = int(x_anchor - w_anchor * 0.5) + y1_a = int(y_anchor - h_anchor * 0.5) + x2_a = int(x_anchor + w_anchor * 0.5) + y2_a = int(y_anchor + h_anchor * 0.5) + cv2.rectangle(image, (x1_a, y1_a), (x2_a, y2_a), (255, 0, 0), 2) + else: + x_anchor = (i) * s + y_anchor = (j) * s + anchor_point = (x_anchor, y_anchor) + print('stride: {} - anchor point: ({}, {})'.format(s, *anchor_point)) + cv2.circle(image, anchor_point, 10, (255, 0, 0), -1) + + cv2.imshow('assignment', image) + cv2.waitKey(0) diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/weights/README.md b/PyTorch/contrib/cv/detection/YoloV2-640/weights/README.md new file mode 100644 index 0000000000..6550070efb --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/weights/README.md @@ -0,0 +1,15 @@ +# yolo-v2-v3 and tiny model +Hi, guys ! + +For researchers in China, you can download them from BaiduYunDisk. +There are 5 models including yolo-v2, yolo-v3, yolo_v3_spp, slim-yolo-v2 and tiny-yolo-v3. + +The link is as following: + +link: https://pan.baidu.com/s/1rnmM8HGFzE2NTv6AkljJdg + +password: 5c8h + + + +I will upload all models to googledrive. \ No newline at end of file -- Gitee From ef2fab5ff3866f5c26da2608e5ad83c20265e4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:41:49 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/contrib/cv/detection/YoloV2-640/README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../contrib/cv/detection/YoloV2-640/README.md | 331 ------------------ 1 file changed, 331 deletions(-) delete mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/README.md diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/README.md b/PyTorch/contrib/cv/detection/YoloV2-640/README.md deleted file mode 100644 index 5cf9a9f4ad..0000000000 --- a/PyTorch/contrib/cv/detection/YoloV2-640/README.md +++ /dev/null @@ -1,331 +0,0 @@ -# Update: 2022-05-31 -Recently, I have released an anchor-free YOLO: - -https://github.com/yjh0410/FreeYOLO - -# A new and strong YOLO family -Recently, I rebuild my YOLO-Family project !! 
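To make the stride bookkeeping shared by `label_assignment_with_anchorbox`, `gt_creator` and `vis_targets` above concrete: a box center given in input-image pixels lands in grid cell `(int(xc / s), int(yc / s))` of the stride-`s` feature map. The numbers below are made up purely for illustration.

```python
strides = [8, 16, 32]
img_size = 640
xc, yc = 300.0, 180.0   # box center in input pixels (illustrative values)

for s in strides:
    fmp = img_size // s
    grid_x, grid_y = int(xc / s), int(yc / s)
    print(f"stride {s:>2d}: {fmp}x{fmp} feature map, center in cell ({grid_x}, {grid_y})")
```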
- -# Requirements -- We recommend you to use Anaconda to create a conda environment: -```Shell -conda create -n yolo python=3.6 -``` - -- Then, activate the environment: -```Shell -conda activate yolo -``` - -- Requirements: -```Shell -pip install -r requirements.txt -``` -PyTorch >= 1.1.0 and Torchvision >= 0.3.0 - -# Visualize positive samples -You can run following command to visualize positiva sample: -```Shell -python train.py \ - -d voc \ - --root path/to/your/dataset \ - -m yolov2 \ - --batch_size 2 \ - --vis_targets -``` - -# Come soon -My better YOLO family - - -# This project -In this project, you can enjoy: -- a new and stronger YOLOv1 -- a new and stronger YOLOv2 -- a stronger YOLOv3 -- a stronger YOLOv3 with SPP -- a stronger YOLOv3 with DilatedEncoder -- YOLOv4 (I'm trying to make it better) -- YOLO-Tiny -- YOLO-Nano - - -# Future work -- Try to make my YOLO-v4 better. -- Train my YOLOv1/YOLOv2 with ViT-Base (pretrained by MaskAutoencoder) - -# Weights -You can download all weights including my DarkNet-53, CSPDarkNet-53, MAE-ViT and YOLO weights from the following links. - -## Backbone -My Backbone: -- DarkNet53: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/darknet53.pth -- CSPDarkNet-53: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/cspdarknet53.pth -- CSPDarkNet-Tiny: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/cspdarknet_tiny.pth - -YOLOX-Backbone: -- CSPDarkNet-S: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_s.pth -- CSPDarkNet-M: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_m.pth -- CSPDarkNet-L: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_l.pth -- CSPDarkNet-X: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_x.pth -- CSPDarkNet-Tiny: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_tiny.pth -- CSPDarkNet-Nano: https://github.com/yjh0410/YOLOX-Backbone/releases/download/YOLOX-Backbone/yolox_cspdarknet_nano.pth - -## YOLO -- YOLOv1: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov1_35.22_54.7.pth -- YOLOv2: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov2_36.4_56.6.pth -- YOLOv3: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_36.9_59.0.pth -- YOLOv3-SPP: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_spp_38.2_60.1.pth -- YOLOv3-DE: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov3_de_38.7_60.2.pth -- YOLOv4: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolov4_exp_43.0_63.4.pth -- YOLO-Tiny: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolo_tiny_28.8_48.6.pth -- YOLO-Nano: https://github.com/yjh0410/PyTorch_YOLO-Family/releases/download/yolo-weight/yolo_nano_22.4_40.7.pth - - -# Experiments -## Tricks -Tricks in this project: -- [x] Augmentations: Flip + Color jitter + RandomCrop -- [x] Model EMA -- [x] Mosaic Augmentation -- [x] Multi Scale training -- [ ] Gradient accumulation -- [ ] MixUp Augmentation -- [ ] Cosine annealing learning schedule -- [ ] AdamW -- [ ] Scale loss by number of positive samples - - -# Experiments -All experiment results are evaluated on COCO val. 
All FPS results except YOLO-Nano's are measured on a 2080ti GPU. -We will measure the speed of YOLO-Nano on a CPU. - -## YOLOv1 - - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv1-320 151 25.4 41.5 26.0 4.2 25.0 49.8 10.49 44.54M
YOLOv1-416 128 30.1 47.8 30.9 7.8 31.9 53.3 17.73 44.54M
YOLOv1-512 114 33.1 52.2 34.0 10.8 35.9 54.9 26.85 44.54M
YOLOv1-640 75 35.2 54.7 37.1 14.3 39.5 53.4 41.96 44.54M
YOLOv1-800 65.56 44.54M
- -## YOLOv2 - - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv2-320 147 26.8 44.1 27.1 4.7 27.6 50.8 10.53 44.89M
YOLOv2-416 123 31.6 50.3 32.4 9.1 33.8 54.0 17.79 44.89M
YOLOv2-512 108 34.3 54.0 35.4 12.3 37.8 55.2 26.94 44.89M
YOLOv2-640 73 36.3 56.6 37.7 15.1 41.1 54.0 42.10 44.89M
YOLOv2-800 65.78 44.89M
- -## YOLOv3 - - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv3-320 111 30.8 50.3 31.8 10.0 33.1 50.0 19.57 61.97M
YOLOv3-416 89 34.8 55.8 36.1 14.6 37.5 52.9 33.08 61.97M
YOLOv3-512 77 36.9 58.1 39.3 18.0 40.3 52.2 50.11 61.97M
YOLOv3-608 51 37.0 58.9 39.3 20.5 41.2 49.0 70.66 61.97M
YOLOv3-640 49 36.9 59.0 39.7 21.6 41.6 47.7 78.30 61.97M
- -## YOLOv3 with SPP - - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv3-SPP-320 110 31.0 50.8 32.0 10.5 33.0 50.4 19.68 63.02M
YOLOv3-SPP-416 88 35.0 56.1 36.4 14.9 37.7 52.8 33.26 63.02M
YOLOv3-SPP-512 75 37.2 58.7 39.1 19.1 40.0 53.0 50.38 63.02M
YOLOv3-SPP-608 50 38.3 60.1 40.7 20.9 41.1 51.2 71.04 63.02M
YOLOv3-SPP-640 48 38.2 60.1 40.4 21.6 41.1 50.5 78.72 63.02M
- -## YOLOv3 with Dilated Encoder -The DilatedEncoder is proposed by YOLOF. - - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv3-DE-320 109 31.1 51.1 31.7 10.2 32.6 51.2 19.10 57.25M
YOLOv3-DE-416 88 35.0 56.1 36.3 14.6 37.4 53.7 32.28 57.25M
YOLOv3-DE-512 74 37.7 59.3 39.6 17.9 40.4 54.4 48.90 57.25M
YOLOv3-DE-608 50 38.7 60.5 40.8 20.6 41.7 53.1 68.96 57.25M
YOLOv3-DE-640 48 38.7 60.2 40.7 21.3 41.7 51.7 76.41 57.25M
- -## YOLOv4 -I'm still trying to make it better. - - - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLOv4-320 89 39.2 58.6 40.9 16.9 44.1 59.2 16.38 58.14M
YOLOv4-416 84 41.7 61.6 44.2 22.0 46.6 57.7 27.69 58.14M
YOLOv4-512 70 42.9 63.1 46.1 24.5 48.3 56.5 41.94 58.14M
YOLOv4-608 51 43.0 63.4 46.1 26.7 48.6 53.9 59.14 58.14M
- -## YOLO-Tiny - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLO-Tiny-320 143 26.4 44.5 26.8 8.8 28.2 42.4 2.17 7.66M
YOLO-Tiny-416 130 28.2 47.6 28.8 11.6 31.5 41.4 3.67 7.82M
YOLO-Tiny-512 118 28.8 48.6 29.4 13.3 33.4 38.3 5.57 7.82M
- -## YOLO-Nano -The FPS is measured on i5-1135G& CPU. Any accelerated deployments that would help speed up detection are not done. - - - - - - - - - - -
FPS AP AP50 AP75 APs APm APl GFLOPs Params
YOLO-Nano-320 25 18.4 33.7 17.8 3.9 17.5 33.1 0.64 1.86M
YOLO-Nano-416 15 21.4 38.5 20.9 6.5 21.4 34.8 0.99 1.86M
YOLO-Nano-512 10 22.4 40.7 22.1 8.0 24.0 33.2 1.65 1.86M
- - -# Dataset - -## VOC Dataset -### My BaiduYunDisk -- BaiduYunDisk: https://pan.baidu.com/s/1tYPGCYGyC0wjpC97H-zzMQ Password:4la9 - -### Download VOC2007 trainval & test - -```Shell -# specify a directory for dataset to be downloaded into, else default is ~/data/ -sh data/scripts/VOC2007.sh # -``` - -### Download VOC2012 trainval -```Shell -# specify a directory for dataset to be downloaded into, else default is ~/data/ -sh data/scripts/VOC2012.sh # -``` -### My BaiduYunDisk -- BaiduYunDisk: https://pan.baidu.com/s/1xAPk8fnaWMMov1VEjr8-zA Password:6vhp - -On Ubuntu system, you might use the command `jar xvf xxx.zip` to unzip the `train2017.zip` and `test2017.zip` files -since they are larger than 2G (As far as I know, `unzip` operation can't process the zip file which is larger than 2G.). - -## MSCOCO Dataset - -### Download MSCOCO 2017 dataset -Just run ```sh data/scripts/COCO2017.sh```. You will get COCO train2017, val2017, test2017. - - -# Train -For example: - -```Shell -python train.py --cuda \ - -d coco \ - -m yolov2 \ - -ms \ - --ema \ - --batch_size 16 \ - --root path/to/dataset/ -``` - -You can run ```python train.py -h``` to check all optional argument. Or you can just run the shell file, for example: -```Shell -sh train_yolov1.sh -``` - -If you have multi gpus like 8, and you put 4 images on each gpu: -```Shell -python -m torch.distributed.launch --nproc_per_node=8 train.py -d coco \ - --cuda \ - -m yolov1 \ - -ms \ - --ema \ - -dist \ - --sybn \ - --num_gpu 8 \ - --batch_size 4 \ - --root path/to/dataset/ -``` -Attention, `--batch_size` is the number of batchsize on per GPU, not all GPUs. - -I have upload all training log files. For example, `1-v1.txt` contains all the output information during the training YOLOv1. - -It is strongly recommended that you open the training shell file to check how I train each YOLO detector. - -# Test -For example: - -```Shell -python test.py -d coco \ - --cuda \ - -m yolov2 \ - --weight path/to/weight \ - --img_size 640 \ - --root path/to/dataset/ \ - --show -``` - -# Evaluation -For example - -```Shell -python eval.py -d coco-val \ - --cuda \ - -m yolov1 \ - --weight path/to/weight \ - --img_size 640 \ - --root path/to/dataset/ -``` - -# Evaluation on COCO-test-dev -To run on COCO_test-dev(You must be sure that you have downloaded test2017): -```Shell -python eval.py -d coco-test \ - --cuda \ - -m yolov1 \ - --weight path/to/weight \ - --img_size 640 \ - --root path/to/dataset/ -``` -You will get a `coco_test-dev.json` file. -Then you should follow the official requirements to compress it into zip format -and upload it the official evaluation server. 
-- Gitee From e3094e57375a2cddf82a11a4e8dfa00677120971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:42:23 +0000 Subject: [PATCH 3/8] my first commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张智斌123 --- .../contrib/cv/detection/YoloV2-640/README.md | 194 ++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/README.md diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/README.md b/PyTorch/contrib/cv/detection/YoloV2-640/README.md new file mode 100644 index 0000000000..677c207e4d --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/README.md @@ -0,0 +1,194 @@ +# YoloV2 for PyTorch + +- [概述](概述.md) +- [准备训练环境](准备训练环境.md) +- [开始训练](开始训练.md) +- [训练结果展示](训练结果展示.md) +- [版本说明](版本说明.md) + + + +# 概述 + +## 简述 + +为提高物体定位精准性和召回率,Yolo作者提出了YoloV2。相比V1提高了训练图像的分辨率;引入了Faster RCNN中anchor box的思想,对网络结构的设计进行了改进,输出层使用卷积层替代Yolo的全连接层,使用coco物体检测标注数据训练物体检测模型。相比YoloV1,YoloV2在识别种类、精度、速度、和定位准确性等方面都有大大提升。 +- 参考实现: + + ``` + url=https://github.com/yjh0410/PyTorch_YOLO-Family + commit_id=234fa7c53b1f0d2a8bec3a8cdb656f63b916c6ef + ``` + +- 适配昇腾 AI 处理器的实现: + + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/contrib/cv/detection + ``` + +- 通过Git获取代码方法如下: + + ``` + git clone {url} # 克隆仓库的代码 + cd {code_path} # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 + ``` + +- 通过单击“立即下载”,下载源码包。 + +# 准备训练环境 + +## 准备环境 + +- 当前模型支持的 PyTorch 版本和已知三方库依赖如下表所示。 + + **表 1** 版本支持表 + + | Torch_Version| 三方库依赖版本 | + |----------------------------------| ----------------------------------| + | PyTorch 1.5 | torchvision==0.6.0;pillow==8.4.0 | + | PyTorch 1.8 | torchvision==0.9.1;pillow==9.1.0 | + +- 环境准备指导。 + + 请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + +- 安装依赖。 + + 在模型源码包根目录下执行命令,安装模型对应PyTorch版本需要的依赖 + ``` + pip install -r 1.5_requirements.txt # PyTorch1.5版本 + + pip install -r 1.8_requirements.txt # PyTorch1.8版本 + ``` + + +## 准备数据集 + +1. 获取数据集。 + + 用户只需运行sh data/scripts/COCO2017.sh,用户即可获得COCO train2017,val2017,test2017。 + + COCO2017数据集目录结构参考如下所示。 + + ``` + ├── COCO: 数据集根目录 + ├──train2017: 所有训练图像文件夹(118287张) + │──000000000009.jpg + │──000000000025.jpg + │──000000000030.jpg + │ ... + ├──test2017 + │──000000000001.jpg + │──000000000016.jpg + │──000000000019.jpg + │ ... + ├──val2017: 所有验证图像文件夹(5000张) + ├──000000000139.jpg + ├──000000000285.jpg + ├──000000000632.jpg + │ ... + ├──annotations: 对应标注文件夹 + ├── instances_train2017.json: 对应目标检测、分割任务的训练集标注文件 + ├── instances_val2017.json: 对应目标检测、分割任务的验证集标注文件 + ├── captions_train2017.json: 对应图像描述的训练集标注文件 + ├── captions_val2017.json: 对应图像描述的验证集标注文件 + ├── person_keypoints_train2017.json: 对应人体关键点检测的训练集标注文件 + └── person_keypoints_val2017.json: 对应人体关键点检测的验证集标注文件夹 + + ``` + + > **说明:** + >该数据集的训练过程脚本只作为一种参考示例。 + +2. 数据预处理(按需处理所需要的数据集)。 + +# 开始训练 + +## 训练模型 + +1. 进入解压后的源码包根目录。 + + ``` + cd /${模型文件夹名称} + ``` + +2. 
运行训练脚本。 + + 该模型支持单机单卡训练和单机8卡训练。 + + - 单机单卡训练 + + 启动单卡训练。 + + ``` + bash train-1p.sh + ``` + + - 单机8卡训练 + + 启动8卡训练。 + + ``` + bash train-8p.sh + ``` + + 模型训练脚本参数说明如下。 + + ``` + 公共参数: + --npu //使用npu + -d //所用数据集,coco或者voc + -m //使用模型,yolov2 + --root //数据集路径 + --batch_size //训练批次大小 + --lr //初始学习率 + --img_size //指定图像尺寸 + --max_epoch //最大训练次数 + --lr_epoch //学习率调整 + --multi_scale //多尺度训练 + --multi_scale_range //多尺度训练分辨率范围 + --multi_anchor //使用multi anchor正样本策略 + 多卡训练参数: + --nproc_per_node //每个节点上有多少个进程 + --multiprocessing-distributed //使用多卡训练 + -dist //分布式训练 + --num_gpu //npu数量 + ``` + + 训练完成后,会在weights/yolov2目录下保存模型权重文件,并输出模型训练精度和性能信息。 + +# 训练结果展示 + +**表 2** 训练结果展示表 + +| NAME | AP50 | FPS | Epochs | AMP_Type | +|--------|-------|----:|--------|---------:| +| 1p-竞品V | - | 95 | 10 | O2 | +| 1p-NPU | - | 95 | 20 | O2 | +| 8p-竞品V | 53.40 | 259 | 200 | O2 | +| 8p-NPU | 52.50 | 216 | 200 | O2 | + + +# 版本说明 + +## 变更 + +2023.4.27:首次发布 + +## 已知问题 + +**_当前发行版本中存在的问题描述。_** + +无 + + + + + + + + + + + -- Gitee From 56bbb3b5749cb4dc9214e31d15890dea242a4d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:47:50 +0000 Subject: [PATCH 4/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/contrib/cv/detection/YoloV2-640/train-8p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/detection/YoloV2-640/train-8p.sh | 96 ------------------- 1 file changed, 96 deletions(-) delete mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh deleted file mode 100644 index 8c1bcdb666..0000000000 --- a/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -cur_path=`pwd` -cur_path_last_dirname=${cur_path##*/} -if [ x"${cur_path_last_dirname}" == x"test" ];then - test_path_dir=${cur_path} - cd .. 
- cur_path=`pwd` -else - test_path_dir=${cur_path}/test -fi -#集合通信参数,不需要修改 -export RANK_SIZE=8 -RANK_ID_START=0 -export WORLD_SIZE=8 -#训练开始时间,不需要修改 -start_time=$(date +%s) -#训练batch_size,,需要模型审视修改 -batch_size=32 -#设置环境变量,不需要修改 -RANK_ID=0 -echo "Decive ID: $RANK_ID" -export RANK_ID=$RANK_ID -export ASCEND_DEVICE_ID=$RANK_ID -ASCEND_DEVICE_ID=$RANK_ID -#创建DeviceID输出目录,不需要修改 -if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt -else - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt -fi -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -export RANK_SIZE=8 - -KERNEL_NUM=$(($(nproc)/8)) -for((RANK_ID=0;RANK_ID ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - else - python3.7 -m torch.distributed.launch --nproc_per_node=8 train8p.py \ - --npu \ - -d coco \ - -m yolov2 \ - --root /forDocker/dataset \ - --batch_size 32 \ - --lr 0.002 \ - --img_size 640 \ - --max_epoch 200 \ - --lr_epoch 100 150 \ - --multi_scale \ - --multi_scale_range 10 20 \ - --multi_anchor \ - -dist \ - --sybn \ - --num_gpu 8 \ - --local_rank 0 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & - fi -done - -#8p情况下仅0卡(主节点)有完整日志,因此后续日志提取仅涉及0卡 -ASCEND_DEVICE_ID=0 - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -time=`grep -a 'Epoch ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}'|awk -F "," '{print $1}'|awk 'END {print}'|sed 's/.$//'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${RANK_SIZE}'*'${batch_size}'/'${time}'}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -- Gitee From 0f4c9df09374a126057566e0941d78765037fba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:48:03 +0000 Subject: [PATCH 5/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/contrib/cv/detection/YoloV2-640/train1p.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/detection/YoloV2-640/train1p.py | 545 ------------------ 1 file changed, 545 deletions(-) delete mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train1p.py diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py deleted file mode 100644 index 4a50a26de8..0000000000 --- a/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py +++ /dev/null @@ -1,545 +0,0 @@ -from __future__ import division - -import os -import argparse -import time -import math -import random -from copy import deepcopy -import apex -from apex import amp -import torch -import torch_npu -import torch.optim as optim -import torch.backends.cudnn as cudnn -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -import sys -from config.yolo_config import yolo_config -from data.voc import VOCDetection -from data.coco import COCODataset -from data.transforms import TrainTransforms, ColorTransforms, ValTransforms - -from utils import distributed_utils -from utils import create_labels -from utils.vis import vis_data, vis_targets -from utils.com_flops_params import FLOPs_and_Params -from utils.criterion import build_criterion -from utils.misc import detection_collate -from utils.misc import ModelEMA -from utils.criterion import build_criterion - -from models.yolo import 
build_model - -from evaluator.cocoapi_evaluator import COCOAPIEvaluator -from evaluator.vocapi_evaluator import VOCAPIEvaluator - -def parse_args(): - parser = argparse.ArgumentParser(description='YOLO Detection') - # basic - parser.add_argument('--npu', action='store_true', default=False, - help='use npu.') - parser.add_argument('--batch_size', default=16, type=int, - help='Batch size for training') - parser.add_argument('--lr', default=1e-3, type=float, - help='initial learning rate') - parser.add_argument('--img_size', type=int, default=640, - help='The upper bound of warm-up') - parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, - help='lr epoch to decay') - parser.add_argument('--max_epoch', type=int, default=200, - help='The upper bound of warm-up') - parser.add_argument('--lr_epoch', nargs='+', default=[100, 150], type=int, - help='lr epoch to decay') - parser.add_argument('--wp_epoch', type=int, default=2, - help='The upper bound of warm-up') - parser.add_argument('--start_epoch', type=int, default=0, - help='start epoch to train') - parser.add_argument('-r', '--resume', default=None, type=str, - help='keep training') - parser.add_argument('--num_workers', default=8, type=int, - help='Number of workers used in dataloading') - parser.add_argument('--num_gpu', default=1, type=int, - help='Number of GPUs to train') - parser.add_argument('--eval_epoch', type=int, - default=10, help='interval between evaluations') - parser.add_argument('--tfboard', action='store_true', default=False, - help='use tensorboard') - parser.add_argument('--save_folder', default='weights/', type=str, - help='path to save weight') - parser.add_argument('--vis_data', action='store_true', default=False, - help='visualize images and labels.') - parser.add_argument('--vis_targets', action='store_true', default=False, - help='visualize assignment.') - - # Optimizer & Schedule - parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, - help='sgd, adamw') - parser.add_argument('--lr_schedule', default='step', type=str, - help='step, cos') - parser.add_argument('--grad_clip', default=None, type=float, - help='clip gradient') - - # model - parser.add_argument('-m', '--model', default='yolov1', - help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' - 'yolov4, yolo_tiny, yolo_nano') - parser.add_argument('--conf_thresh', default=0.001, type=float, - help='NMS threshold') - parser.add_argument('--nms_thresh', default=0.5, type=float, - help='NMS threshold') - - # dataset - parser.add_argument('--root', default='/mnt/share/ssd2/dataset', - help='data root') - parser.add_argument('-d', '--dataset', default='coco', - help='coco, widerface, crowdhuman') - - # Loss - parser.add_argument('--loss_obj_weight', default=1.0, type=float, - help='weight of obj loss') - parser.add_argument('--loss_cls_weight', default=1.0, type=float, - help='weight of cls loss') - parser.add_argument('--loss_reg_weight', default=1.0, type=float, - help='weight of reg loss') - parser.add_argument('--scale_loss', default='batch', type=str, - help='scale loss: batch or positive samples') - - # train trick - parser.add_argument('--no_warmup', action='store_true', default=False, - help='do not use warmup') - parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, - help='use multi-scale trick') - parser.add_argument('--ema', action='store_true', default=False, - help='use ema training trick') - parser.add_argument('--mosaic', action='store_true', default=False, - help='use Mosaic 
Augmentation trick') - parser.add_argument('--mixup', action='store_true', default=False, - help='use MixUp Augmentation trick') - parser.add_argument('--multi_anchor', action='store_true', default=False, - help='use multiple anchor boxes as the positive samples') - parser.add_argument('--center_sample', action='store_true', default=False, - help='use center sample for labels') - parser.add_argument('--accumulate', type=int, default=1, - help='accumulate gradient') - # DDP train - parser.add_argument('-dist', '--distributed', action='store_true', default=False, - help='distributed training') - parser.add_argument('--local_rank', type=int, default=0, - help='local_rank') - parser.add_argument('--sybn', action='store_true', default=False, - help='use sybn.') - parser.add_argument('--opt-level', default='O2', type=str, - help='loss scale using in amp, default O1') - - return parser.parse_args() - - -def train(): - args = parse_args() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12345' - - # torch.npu.set_compile_mode(jit_compile=False) - option = {} - option["ACL_OP_COMPILER_CACHE_MODE"]="enable" - option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" - option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" - print("option:",option) - # torch.npu.set_option(option) - print("Setting Arguments.. : ", args) - print("----------------------------------------------------------") - - # path to save model - path_to_save = os.path.join(args.save_folder, args.dataset, args.model) - os.makedirs(path_to_save, exist_ok=True) - - # set distributed - local_rank = 0 - if args.distributed: - dist.init_process_group(backend="hccl", #init_method="env://" - ) - local_rank = torch.distributed.get_rank() - print(local_rank) - torch_npu.npu.set_device(local_rank) - - # cuda - if args.npu: - print('use npu') - cudnn.benchmark = True - device = torch.device("npu") - else: - device = torch.device("cpu") - - # YOLO config - cfg = yolo_config[args.model] - train_size = val_size = args.img_size - - # dataset and evaluator - dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) - # dataloader - dataloader = build_dataloader(args, dataset, detection_collate) - # criterioin - criterion = build_criterion(args, cfg, num_classes) - - print('Training model on:', args.dataset) - print('The dataset size:', len(dataset)) - print("----------------------------------------------------------") - - # build model - net = build_model(args=args, - cfg=cfg, - device=device, - num_classes=num_classes, - trainable=True) - model = net - - # SyncBatchNorm - # if args.sybn and args.npu and args.num_gpu > 1: - # print('use SyncBatchNorm ...') - # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) - - model = model.to(device).train() - # compute FLOPs and Params - # if local_rank == 0: - # model_copy = deepcopy(model) - # model_copy.trainable = False - # model_copy.eval() - # FLOPs_and_Params(model=model_copy, size=train_size) - # model_copy.trainable = True - # model_copy.train() - # keep training - if args.resume is not None: - print('keep training model: %s' % (args.resume)) - model.load_state_dict(torch.load(args.resume, map_location=device)) - - # EMA - ema = ModelEMA(model) if args.ema else None - # use tfboard - tblogger = None - if args.tfboard: - print('use tensorboard') - from torch.utils.tensorboard import SummaryWriter - c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) - log_path = os.path.join('log/', 
args.dataset, c_time) - os.makedirs(log_path, exist_ok=True) - - tblogger = SummaryWriter(log_path) - # optimizer setup - base_lr = args.lr - tmp_lr = args.lr - if args.optimizer == 'NpuFusedSGD': - print('use SGD with momentum ...') - optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) - # optimizer = optim.SGD(model.parameters(), - # lr=tmp_lr, - # momentum=0.9, - # weight_decay=5e-4) - elif args.optimizer == 'adamw': - print('use AdamW ...') - optimizer = optim.AdamW(model.parameters(), - lr=tmp_lr, - weight_decay=5e-4) - - model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) - - # DDP - if args.distributed and args.num_gpu > 1: - print('using DDP ...') - model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) - - - - - batch_size = args.batch_size - epoch_size = len(dataset) // (batch_size * args.num_gpu) - best_map = -100. - warmup = not args.no_warmup - - t0 = time.time() - # start training loop - for epoch in range(args.start_epoch, args.max_epoch): - if args.distributed: - dataloader.sampler.set_epoch(epoch) - - # use step lr decay - if args.lr_schedule == 'step': - if epoch in args.lr_epoch: - tmp_lr = tmp_lr * 0.1 - set_lr(optimizer, tmp_lr) - # use cos lr decay - elif args.lr_schedule == 'cos' and not warmup: - T_max = args.max_epoch - 15 - lr_min = base_lr * 0.1 * 0.1 - if epoch > T_max: - # Cos decay is done - print('Cosine annealing is over !!') - args.lr_schedule == None - tmp_lr = lr_min - set_lr(optimizer, tmp_lr) - else: - tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) - set_lr(optimizer, tmp_lr) - fps_sum=0 - # train one epoch - # pre_flag = False - # start_time = time.time() - for iter_i, (images, targets) in enumerate(dataloader): - # if iter_i == 5: - # start_time = time.time() - # with torch.autograd.profiler.profile(use_npu=True) as prof: - ni = iter_i + epoch * epoch_size - # warmup - if epoch < args.wp_epoch and warmup: - nw = args.wp_epoch * epoch_size - tmp_lr = base_lr * pow(ni / nw, 4) - set_lr(optimizer, tmp_lr) - - elif epoch == args.wp_epoch and iter_i == 0 and warmup: - # warmup is over - print('Warmup is over !!') - warmup = False - tmp_lr = base_lr - set_lr(optimizer, tmp_lr) - - # multi-scale trick - if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: - # randomly choose a new size - r = args.multi_scale_range - train_size = random.randint(r[0], r[1]) * 32 - model.set_grid(train_size) - if args.multi_scale: - # interpolate - images = torch.nn.functional.interpolate( - input=images, - size=train_size, - mode='bilinear', - align_corners=False) - - targets = [label.tolist() for label in targets] - # visualize target - if args.vis_data: - vis_data(images, targets) - continue - # make labels - targets = create_labels.gt_creator( - img_size=train_size, - strides=net.stride, - label_lists=targets, - anchor_size=cfg["anchor_size"], - multi_anchor=args.multi_anchor, - center_sample=args.center_sample) - # visualize assignment - if args.vis_targets: - vis_targets(images, targets, cfg["anchor_size"], net.stride) - continue - - # to device - images = images.to(device) - targets = targets.to(device) - - # inference - pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) - - # compute loss - loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) - - # check loss - if torch.isnan(total_loss): - continue - - loss_dict = dict( - loss_obj=loss_obj, - 
loss_cls=loss_cls, - loss_reg=loss_reg, - total_loss=total_loss - ) - loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) - - total_loss = total_loss / args.accumulate - # Backward and Optimize - with amp.scale_loss(total_loss , optimizer) as scaled_loss: - scaled_loss.backward() - if ni % args.accumulate == 0: - if args.grad_clip is not None: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) - optimizer.step() - optimizer.zero_grad() - - if args.ema: - ema.update(model) - - # display - # if iter_i % 10 == 0: - if args.tfboard: - # viz loss - tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) - tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) - tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) - - t1 = time.time() - print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]' - % (epoch+1, - args.max_epoch, - iter_i, - epoch_size, - tmp_lr, - loss_dict['loss_obj'].item(), - loss_dict['loss_cls'].item(), - loss_dict['loss_reg'].item(), - train_size, - t1-t0), - flush=True) - fps_sum = fps_sum + (batch_size*8 / (t1 - t0)) - t0 = time.time() - # if local_rank in [-1, 0]: - # epoch_time = time.time() - start_time - # if iter_i >= 5: - # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time))) - # else: - # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time))) - if iter_i > 0 and iter_i == 461: - fps_avg = fps_sum / 461 - print("fps:",fps_avg) - fps_sum = 0 - - # evaluation - if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch: - if evaluator is None: - print('No evaluator ...') - print('Saving state, epoch:', epoch + 1) - torch.save(model_eval.state_dict(), os.path.join(path_to_save, - args.model + '_' + repr(epoch + 1) + '.pth')) - print('Keep training ...') - else: - print('eval ...') - # check ema - if args.ema: - model_eval = ema.ema - else: - model_eval = model.module if args.distributed else model - - # set eval mode - model_eval.trainable = False - model_eval.set_grid(val_size) - model_eval.eval() - - if local_rank == 0: - # evaluate - evaluator.evaluate(model_eval) - - cur_map = evaluator.map - if cur_map > best_map: - # update best-map - best_map = cur_map - # save model - print('Saving state, epoch:', epoch + 1) - torch.save(model_eval.state_dict(), os.path.join(path_to_save, - args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth')) - if args.tfboard: - if args.dataset == 'voc': - tblogger.add_scalar('07test/mAP', evaluator.map, epoch) - elif args.dataset == 'coco': - tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) - tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) - - if args.distributed: - # wait for all processes to synchronize - dist.barrier() - - # set train mode. 
- model_eval.trainable = True - model_eval.set_grid(train_size) - model_eval.train() - - # close mosaic augmentation - if args.mosaic and args.max_epoch - epoch == 15: - print('close Mosaic Augmentation ...') - dataloader.dataset.mosaic = False - # close mixup augmentation - if args.mixup and args.max_epoch - epoch == 15: - print('close Mixup Augmentation ...') - dataloader.dataset.mixup = False - - if args.tfboard: - tblogger.close() - - -def build_dataset(args, train_size, val_size, device): - if args.dataset == 'voc': - data_dir = os.path.join(args.root, 'VOCdevkit') - num_classes = 20 - dataset = VOCDetection( - data_dir=data_dir, - img_size=train_size, - transform=TrainTransforms(train_size), - color_augment=ColorTransforms(train_size), - mosaic=args.mosaic, - mixup=args.mixup) - - evaluator = VOCAPIEvaluator( - data_dir=data_dir, - img_size=val_size, - device=device, - transform=ValTransforms(val_size)) - - elif args.dataset == 'coco': - data_dir = os.path.join(args.root, 'COCO') - num_classes = 80 - dataset = COCODataset( - data_dir=data_dir, - img_size=train_size, - image_set='train2017', - transform=TrainTransforms(train_size), - color_augment=ColorTransforms(train_size), - mosaic=args.mosaic, - mixup=args.mixup) - - evaluator = COCOAPIEvaluator( - data_dir=data_dir, - img_size=val_size, - device=device, - transform=ValTransforms(val_size) - ) - - else: - print('unknow dataset !! Only support voc and coco !!') - exit(0) - - return dataset, evaluator, num_classes - - -def build_dataloader(args, dataset, collate_fn=None): - # distributed - if args.distributed and args.num_gpu > 1: - # dataloader - dataloader = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=args.batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers, - pin_memory=True, - sampler=torch.utils.data.distributed.DistributedSampler(dataset) - ) - - else: - # dataloader - dataloader = torch.utils.data.DataLoader( - dataset=dataset, - shuffle=True, - batch_size=args.batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers, - pin_memory=True - ) - return dataloader - - -def set_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - -if __name__ == '__main__': - train() - -- Gitee From c60141e8229575609cad7ccd124c83b0323ac428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:48:06 +0000 Subject: [PATCH 6/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/contrib/cv/detection/YoloV2-640/train8p.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/detection/YoloV2-640/train8p.py | 545 ------------------ 1 file changed, 545 deletions(-) delete mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train8p.py diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py deleted file mode 100644 index b34ec2f26f..0000000000 --- a/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py +++ /dev/null @@ -1,545 +0,0 @@ -from __future__ import division - -import os -import argparse -import time -import math -import random -from copy import deepcopy -import apex -from apex import amp -import torch -import torch_npu -import torch.optim as optim -import torch.backends.cudnn as cudnn -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -import sys -from config.yolo_config import yolo_config -from data.voc import VOCDetection -from data.coco import 
COCODataset -from data.transforms import TrainTransforms, ColorTransforms, ValTransforms - -from utils import distributed_utils -from utils import create_labels -from utils.vis import vis_data, vis_targets -from utils.com_flops_params import FLOPs_and_Params -from utils.criterion import build_criterion -from utils.misc import detection_collate -from utils.misc import ModelEMA -from utils.criterion import build_criterion - -from models.yolo import build_model - -from evaluator.cocoapi_evaluator import COCOAPIEvaluator -from evaluator.vocapi_evaluator import VOCAPIEvaluator - -def parse_args(): - parser = argparse.ArgumentParser(description='YOLO Detection') - # basic - parser.add_argument('--npu', action='store_true', default=False, - help='use npu.') - parser.add_argument('--batch_size', default=16, type=int, - help='Batch size for training') - parser.add_argument('--lr', default=1e-3, type=float, - help='initial learning rate') - parser.add_argument('--img_size', type=int, default=640, - help='The upper bound of warm-up') - parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, - help='lr epoch to decay') - parser.add_argument('--max_epoch', type=int, default=200, - help='The upper bound of warm-up') - parser.add_argument('--lr_epoch', nargs='+', default=[100, 150], type=int, - help='lr epoch to decay') - parser.add_argument('--wp_epoch', type=int, default=2, - help='The upper bound of warm-up') - parser.add_argument('--start_epoch', type=int, default=0, - help='start epoch to train') - parser.add_argument('-r', '--resume', default=None, type=str, - help='keep training') - parser.add_argument('--num_workers', default=8, type=int, - help='Number of workers used in dataloading') - parser.add_argument('--num_gpu', default=1, type=int, - help='Number of GPUs to train') - parser.add_argument('--eval_epoch', type=int, - default=10, help='interval between evaluations') - parser.add_argument('--tfboard', action='store_true', default=False, - help='use tensorboard') - parser.add_argument('--save_folder', default='weights/', type=str, - help='path to save weight') - parser.add_argument('--vis_data', action='store_true', default=False, - help='visualize images and labels.') - parser.add_argument('--vis_targets', action='store_true', default=False, - help='visualize assignment.') - - # Optimizer & Schedule - parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, - help='sgd, adamw') - parser.add_argument('--lr_schedule', default='step', type=str, - help='step, cos') - parser.add_argument('--grad_clip', default=None, type=float, - help='clip gradient') - - # model - parser.add_argument('-m', '--model', default='yolov1', - help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' - 'yolov4, yolo_tiny, yolo_nano') - parser.add_argument('--conf_thresh', default=0.001, type=float, - help='NMS threshold') - parser.add_argument('--nms_thresh', default=0.5, type=float, - help='NMS threshold') - - # dataset - parser.add_argument('--root', default='/mnt/share/ssd2/dataset', - help='data root') - parser.add_argument('-d', '--dataset', default='coco', - help='coco, widerface, crowdhuman') - - # Loss - parser.add_argument('--loss_obj_weight', default=1.0, type=float, - help='weight of obj loss') - parser.add_argument('--loss_cls_weight', default=1.0, type=float, - help='weight of cls loss') - parser.add_argument('--loss_reg_weight', default=1.0, type=float, - help='weight of reg loss') - parser.add_argument('--scale_loss', default='batch', type=str, - help='scale loss: batch 
or positive samples') - - # train trick - parser.add_argument('--no_warmup', action='store_true', default=False, - help='do not use warmup') - parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, - help='use multi-scale trick') - parser.add_argument('--ema', action='store_true', default=False, - help='use ema training trick') - parser.add_argument('--mosaic', action='store_true', default=False, - help='use Mosaic Augmentation trick') - parser.add_argument('--mixup', action='store_true', default=False, - help='use MixUp Augmentation trick') - parser.add_argument('--multi_anchor', action='store_true', default=False, - help='use multiple anchor boxes as the positive samples') - parser.add_argument('--center_sample', action='store_true', default=False, - help='use center sample for labels') - parser.add_argument('--accumulate', type=int, default=1, - help='accumulate gradient') - # DDP train - parser.add_argument('-dist', '--distributed', action='store_true', default=False, - help='distributed training') - parser.add_argument('--local_rank', type=int, default=0, - help='local_rank') - parser.add_argument('--sybn', action='store_true', default=False, - help='use sybn.') - parser.add_argument('--opt-level', default='O2', type=str, - help='loss scale using in amp, default O1') - - return parser.parse_args() - - -def train(): - args = parse_args() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '12345' - - # torch.npu.set_compile_mode(jit_compile=False) - option = {} - option["ACL_OP_COMPILER_CACHE_MODE"]="enable" - option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" - option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" - print("option:",option) - # torch.npu.set_option(option) - print("Setting Arguments.. 
: ", args) - print("----------------------------------------------------------") - - # path to save model - path_to_save = os.path.join(args.save_folder, args.dataset, args.model) - os.makedirs(path_to_save, exist_ok=True) - - # set distributed - local_rank = 0 - if args.distributed: - dist.init_process_group(backend="hccl", #init_method="env://" - ) - local_rank = torch.distributed.get_rank() - print(local_rank) - torch_npu.npu.set_device(local_rank) - - # cuda - if args.npu: - print('use npu') - cudnn.benchmark = True - device = torch.device("npu") - else: - device = torch.device("cpu") - - # YOLO config - cfg = yolo_config[args.model] - train_size = val_size = args.img_size - - # dataset and evaluator - dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) - # dataloader - dataloader = build_dataloader(args, dataset, detection_collate) - # criterioin - criterion = build_criterion(args, cfg, num_classes) - - print('Training model on:', args.dataset) - print('The dataset size:', len(dataset)) - print("----------------------------------------------------------") - - # build model - net = build_model(args=args, - cfg=cfg, - device=device, - num_classes=num_classes, - trainable=True) - model = net - - # SyncBatchNorm - # if args.sybn and args.npu and args.num_gpu > 1: - # print('use SyncBatchNorm ...') - # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) - - model = model.to(device).train() - # compute FLOPs and Params - # if local_rank == 0: - # model_copy = deepcopy(model) - # model_copy.trainable = False - # model_copy.eval() - # FLOPs_and_Params(model=model_copy, size=train_size) - # model_copy.trainable = True - # model_copy.train() - # keep training - if args.resume is not None: - print('keep training model: %s' % (args.resume)) - model.load_state_dict(torch.load(args.resume, map_location=device)) - - # EMA - ema = ModelEMA(model) if args.ema else None - # use tfboard - tblogger = None - if args.tfboard: - print('use tensorboard') - from torch.utils.tensorboard import SummaryWriter - c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) - log_path = os.path.join('log/', args.dataset, c_time) - os.makedirs(log_path, exist_ok=True) - - tblogger = SummaryWriter(log_path) - # optimizer setup - base_lr = args.lr - tmp_lr = args.lr - if args.optimizer == 'NpuFusedSGD': - print('use SGD with momentum ...') - optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) - # optimizer = optim.SGD(model.parameters(), - # lr=tmp_lr, - # momentum=0.9, - # weight_decay=5e-4) - elif args.optimizer == 'adamw': - print('use AdamW ...') - optimizer = optim.AdamW(model.parameters(), - lr=tmp_lr, - weight_decay=5e-4) - - model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) - - # DDP - if args.distributed and args.num_gpu > 1: - print('using DDP ...') - model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) - - - - - batch_size = args.batch_size - epoch_size = len(dataset) // (batch_size * args.num_gpu) - best_map = -100. 
- warmup = not args.no_warmup - - t0 = time.time() - # start training loop - for epoch in range(args.start_epoch, args.max_epoch): - if args.distributed: - dataloader.sampler.set_epoch(epoch) - - # use step lr decay - if args.lr_schedule == 'step': - if epoch in args.lr_epoch: - tmp_lr = tmp_lr * 0.1 - set_lr(optimizer, tmp_lr) - # use cos lr decay - elif args.lr_schedule == 'cos' and not warmup: - T_max = args.max_epoch - 15 - lr_min = base_lr * 0.1 * 0.1 - if epoch > T_max: - # Cos decay is done - print('Cosine annealing is over !!') - args.lr_schedule == None - tmp_lr = lr_min - set_lr(optimizer, tmp_lr) - else: - tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) - set_lr(optimizer, tmp_lr) - fps_sum=0 - # train one epoch - # pre_flag = False - # start_time = time.time() - for iter_i, (images, targets) in enumerate(dataloader): - # if iter_i == 5: - # start_time = time.time() - # with torch.autograd.profiler.profile(use_npu=True) as prof: - ni = iter_i + epoch * epoch_size - # warmup - if epoch < args.wp_epoch and warmup: - nw = args.wp_epoch * epoch_size - tmp_lr = base_lr * pow(ni / nw, 4) - set_lr(optimizer, tmp_lr) - - elif epoch == args.wp_epoch and iter_i == 0 and warmup: - # warmup is over - print('Warmup is over !!') - warmup = False - tmp_lr = base_lr - set_lr(optimizer, tmp_lr) - - # multi-scale trick - if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: - # randomly choose a new size - r = args.multi_scale_range - train_size = random.randint(r[0], r[1]) * 32 - model.module.set_grid(train_size) - if args.multi_scale: - # interpolate - images = torch.nn.functional.interpolate( - input=images, - size=train_size, - mode='bilinear', - align_corners=False) - - targets = [label.tolist() for label in targets] - # visualize target - if args.vis_data: - vis_data(images, targets) - continue - # make labels - targets = create_labels.gt_creator( - img_size=train_size, - strides=net.stride, - label_lists=targets, - anchor_size=cfg["anchor_size"], - multi_anchor=args.multi_anchor, - center_sample=args.center_sample) - # visualize assignment - if args.vis_targets: - vis_targets(images, targets, cfg["anchor_size"], net.stride) - continue - - # to device - images = images.to(device) - targets = targets.to(device) - - # inference - pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) - - # compute loss - loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) - - # check loss - if torch.isnan(total_loss): - continue - - loss_dict = dict( - loss_obj=loss_obj, - loss_cls=loss_cls, - loss_reg=loss_reg, - total_loss=total_loss - ) - loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) - - total_loss = total_loss / args.accumulate - # Backward and Optimize - with amp.scale_loss(total_loss , optimizer) as scaled_loss: - scaled_loss.backward() - if ni % args.accumulate == 0: - if args.grad_clip is not None: - torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) - optimizer.step() - optimizer.zero_grad() - - if args.ema: - ema.update(model) - - # display - # if iter_i % 10 == 0: - if args.tfboard: - # viz loss - tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) - tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) - tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) - - t1 = time.time() - print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]' - % (epoch+1, - 
args.max_epoch, - iter_i, - epoch_size, - tmp_lr, - loss_dict['loss_obj'].item(), - loss_dict['loss_cls'].item(), - loss_dict['loss_reg'].item(), - train_size, - t1-t0), - flush=True) - fps_sum = fps_sum + (batch_size*8 / (t1 - t0)) - t0 = time.time() - # if local_rank in [-1, 0]: - # epoch_time = time.time() - start_time - # if iter_i >= 5: - # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time))) - # else: - # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time))) - if iter_i > 0 and iter_i == 461: - fps_avg = fps_sum / 461 - print("fps:",fps_avg) - fps_sum = 0 - - # evaluation - if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch: - if evaluator is None: - print('No evaluator ...') - print('Saving state, epoch:', epoch + 1) - torch.save(model_eval.state_dict(), os.path.join(path_to_save, - args.model + '_' + repr(epoch + 1) + '.pth')) - print('Keep training ...') - else: - print('eval ...') - # check ema - if args.ema: - model_eval = ema.ema - else: - model_eval = model.module if args.distributed else model - - # set eval mode - model_eval.trainable = False - model_eval.set_grid(val_size) - model_eval.eval() - - if local_rank == 0: - # evaluate - evaluator.evaluate(model_eval) - - cur_map = evaluator.map - if cur_map > best_map: - # update best-map - best_map = cur_map - # save model - print('Saving state, epoch:', epoch + 1) - torch.save(model_eval.state_dict(), os.path.join(path_to_save, - args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth')) - if args.tfboard: - if args.dataset == 'voc': - tblogger.add_scalar('07test/mAP', evaluator.map, epoch) - elif args.dataset == 'coco': - tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) - tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) - - if args.distributed: - # wait for all processes to synchronize - dist.barrier() - - # set train mode. - model_eval.trainable = True - model_eval.set_grid(train_size) - model_eval.train() - - # close mosaic augmentation - if args.mosaic and args.max_epoch - epoch == 15: - print('close Mosaic Augmentation ...') - dataloader.dataset.mosaic = False - # close mixup augmentation - if args.mixup and args.max_epoch - epoch == 15: - print('close Mixup Augmentation ...') - dataloader.dataset.mixup = False - - if args.tfboard: - tblogger.close() - - -def build_dataset(args, train_size, val_size, device): - if args.dataset == 'voc': - data_dir = os.path.join(args.root, 'VOCdevkit') - num_classes = 20 - dataset = VOCDetection( - data_dir=data_dir, - img_size=train_size, - transform=TrainTransforms(train_size), - color_augment=ColorTransforms(train_size), - mosaic=args.mosaic, - mixup=args.mixup) - - evaluator = VOCAPIEvaluator( - data_dir=data_dir, - img_size=val_size, - device=device, - transform=ValTransforms(val_size)) - - elif args.dataset == 'coco': - data_dir = os.path.join(args.root, 'COCO') - num_classes = 80 - dataset = COCODataset( - data_dir=data_dir, - img_size=train_size, - image_set='train2017', - transform=TrainTransforms(train_size), - color_augment=ColorTransforms(train_size), - mosaic=args.mosaic, - mixup=args.mixup) - - evaluator = COCOAPIEvaluator( - data_dir=data_dir, - img_size=val_size, - device=device, - transform=ValTransforms(val_size) - ) - - else: - print('unknow dataset !! 
Only support voc and coco !!') - exit(0) - - return dataset, evaluator, num_classes - - -def build_dataloader(args, dataset, collate_fn=None): - # distributed - if args.distributed and args.num_gpu > 1: - # dataloader - dataloader = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=args.batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers, - pin_memory=True, - sampler=torch.utils.data.distributed.DistributedSampler(dataset) - ) - - else: - # dataloader - dataloader = torch.utils.data.DataLoader( - dataset=dataset, - shuffle=True, - batch_size=args.batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers, - pin_memory=True - ) - return dataloader - - -def set_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - -if __name__ == '__main__': - train() - -- Gitee From f9207a4be3168f6d309510e1a077d9f6c78dc98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:48:32 +0000 Subject: [PATCH 7/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20PyTo?= =?UTF-8?q?rch/contrib/cv/detection/YoloV2-640/train-1p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh deleted file mode 100644 index b0d155ece7..0000000000 --- a/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh +++ /dev/null @@ -1,13 +0,0 @@ -python3 train3.py \ - --npu \ - -d coco \ - -m yolov2 \ - --root /home/normal58/zhang/zzb_msft \ - --batch_size 16 \ - --lr 0.001 \ - --img_size 640 \ - --max_epoch 200 \ - --lr_epoch 100 150 \ - --multi_scale \ - --multi_scale_range 10 20 \ - --multi_anchor \ -- Gitee From a745b8e64c4190d108f3bce4377e9050616f70dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=99=BA=E6=96=8C123?= Date: Thu, 27 Apr 2023 06:49:01 +0000 Subject: [PATCH 8/8] my first commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张智斌123 --- .../cv/detection/YoloV2-640/train-1p.sh | 13 + .../cv/detection/YoloV2-640/train-8p.sh | 82 +++ .../cv/detection/YoloV2-640/train1p.py | 559 ++++++++++++++++++ .../cv/detection/YoloV2-640/train8p.py | 559 ++++++++++++++++++ 4 files changed, 1213 insertions(+) create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train1p.py create mode 100644 PyTorch/contrib/cv/detection/YoloV2-640/train8p.py diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh new file mode 100644 index 0000000000..d844daf25b --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train-1p.sh @@ -0,0 +1,13 @@ +python3 train1p.py \ + --npu \ + -d coco \ + -m yolov2 \ + --root /forDocker/dataset \ + --batch_size 16 \ + --lr 0.001 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh b/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh new file mode 100644 index 0000000000..707a4dc136 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train-8p.sh @@ -0,0 
+1,82 @@ +#!/bin/bash +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +#集合通信参数,不需要修改 +export RANK_SIZE=8 +RANK_ID_START=0 +export WORLD_SIZE=8 +#训练开始时间,不需要修改 +start_time=$(date +%s) +#训练batch_size,,需要模型审视修改 +batch_size=32 +#设置环境变量,不需要修改 +RANK_ID=0 +echo "Decive ID: $RANK_ID" +export RANK_ID=$RANK_ID +export ASCEND_DEVICE_ID=$RANK_ID +ASCEND_DEVICE_ID=$RANK_ID +#创建DeviceID输出目录,不需要修改 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +fi +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +export RANK_SIZE=8 + +KERNEL_NUM=$(($(nproc)/8)) +for((RANK_ID=0;RANK_ID ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + else + python3.7 -m torch.distributed.launch --nproc_per_node=8 train8p.py \ + --npu \ + -d coco \ + -m yolov2 \ + --root /forDocker/dataset \ + --batch_size 32 \ + --lr 0.002 \ + --img_size 640 \ + --max_epoch 200 \ + --lr_epoch 100 150 \ + --multi_scale \ + --multi_scale_range 10 20 \ + --multi_anchor \ + -dist \ + --sybn \ + --num_gpu 8 \ + --local_rank 0 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + fi +done + + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py new file mode 100644 index 0000000000..54429fa1ec --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train1p.py @@ -0,0 +1,559 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +from __future__ import division + +import os +import argparse +import time +import math +import random +from copy import deepcopy +import apex +from apex import amp +import torch +import torch_npu +import torch.optim as optim +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import sys +from config.yolo_config import yolo_config +from data.voc import VOCDetection +from data.coco import COCODataset +from data.transforms import TrainTransforms, ColorTransforms, ValTransforms + +from utils import distributed_utils +from utils import create_labels +from utils.vis import vis_data, vis_targets +from utils.com_flops_params import FLOPs_and_Params +from utils.criterion import build_criterion +from utils.misc import detection_collate +from utils.misc import ModelEMA +from utils.criterion import build_criterion + +from models.yolo import build_model + +from evaluator.cocoapi_evaluator import COCOAPIEvaluator +from evaluator.vocapi_evaluator import VOCAPIEvaluator + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO Detection') + # basic + parser.add_argument('--npu', action='store_true', default=False, + help='use npu.') + parser.add_argument('--batch_size', default=16, type=int, + help='Batch size for training') + parser.add_argument('--lr', default=1e-3, type=float, + help='initial learning rate') + parser.add_argument('--img_size', type=int, default=640, + help='The upper bound of warm-up') + parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, + help='lr epoch to decay') + parser.add_argument('--max_epoch', type=int, default=200, + help='The upper bound of warm-up') + parser.add_argument('--lr_epoch', nargs='+', default=[100, 150], type=int, + help='lr epoch to decay') + parser.add_argument('--wp_epoch', type=int, default=2, + help='The upper bound of warm-up') + parser.add_argument('--start_epoch', type=int, default=0, + help='start epoch to train') + parser.add_argument('-r', '--resume', default=None, type=str, + help='keep training') + parser.add_argument('--num_workers', default=8, type=int, + help='Number of workers used in dataloading') + parser.add_argument('--num_gpu', default=1, type=int, + help='Number of GPUs to train') + parser.add_argument('--eval_epoch', type=int, + default=10, help='interval between evaluations') + parser.add_argument('--tfboard', action='store_true', default=False, + help='use tensorboard') + parser.add_argument('--save_folder', default='weights/', type=str, + help='path to save weight') + parser.add_argument('--vis_data', action='store_true', default=False, + help='visualize images and labels.') + parser.add_argument('--vis_targets', action='store_true', default=False, + help='visualize assignment.') + + # Optimizer & Schedule + parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, + help='sgd, adamw') + parser.add_argument('--lr_schedule', default='step', type=str, + help='step, cos') + parser.add_argument('--grad_clip', default=None, type=float, + help='clip gradient') + + # model + parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') + parser.add_argument('--conf_thresh', default=0.001, type=float, + help='NMS threshold') + parser.add_argument('--nms_thresh', default=0.5, type=float, + help='NMS threshold') + + # dataset + 
parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') + parser.add_argument('-d', '--dataset', default='coco', + help='coco, widerface, crowdhuman') + + # Loss + parser.add_argument('--loss_obj_weight', default=1.0, type=float, + help='weight of obj loss') + parser.add_argument('--loss_cls_weight', default=1.0, type=float, + help='weight of cls loss') + parser.add_argument('--loss_reg_weight', default=1.0, type=float, + help='weight of reg loss') + parser.add_argument('--scale_loss', default='batch', type=str, + help='scale loss: batch or positive samples') + + # train trick + parser.add_argument('--no_warmup', action='store_true', default=False, + help='do not use warmup') + parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, + help='use multi-scale trick') + parser.add_argument('--ema', action='store_true', default=False, + help='use ema training trick') + parser.add_argument('--mosaic', action='store_true', default=False, + help='use Mosaic Augmentation trick') + parser.add_argument('--mixup', action='store_true', default=False, + help='use MixUp Augmentation trick') + parser.add_argument('--multi_anchor', action='store_true', default=False, + help='use multiple anchor boxes as the positive samples') + parser.add_argument('--center_sample', action='store_true', default=False, + help='use center sample for labels') + parser.add_argument('--accumulate', type=int, default=1, + help='accumulate gradient') + # DDP train + parser.add_argument('-dist', '--distributed', action='store_true', default=False, + help='distributed training') + parser.add_argument('--local_rank', type=int, default=0, + help='local_rank') + parser.add_argument('--sybn', action='store_true', default=False, + help='use sybn.') + parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using in amp, default O1') + + return parser.parse_args() + + +def train(): + args = parse_args() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # torch.npu.set_compile_mode(jit_compile=False) + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"]="enable" + option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" + option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" + print("option:",option) + # torch.npu.set_option(option) + print("Setting Arguments.. 
: ", args) + print("----------------------------------------------------------") + + # path to save model + path_to_save = os.path.join(args.save_folder, args.dataset, args.model) + os.makedirs(path_to_save, exist_ok=True) + + # set distributed + local_rank = 0 + if args.distributed: + dist.init_process_group(backend="hccl", #init_method="env://" + ) + local_rank = torch.distributed.get_rank() + print(local_rank) + torch_npu.npu.set_device(local_rank) + + # cuda + if args.npu: + print('use npu') + cudnn.benchmark = True + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO config + cfg = yolo_config[args.model] + train_size = val_size = args.img_size + + # dataset and evaluator + dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) + # dataloader + dataloader = build_dataloader(args, dataset, detection_collate) + # criterioin + criterion = build_criterion(args, cfg, num_classes) + + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + print("----------------------------------------------------------") + + # build model + net = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=True) + model = net + + # SyncBatchNorm + # if args.sybn and args.npu and args.num_gpu > 1: + # print('use SyncBatchNorm ...') + # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + model = model.to(device).train() + # compute FLOPs and Params + # if local_rank == 0: + # model_copy = deepcopy(model) + # model_copy.trainable = False + # model_copy.eval() + # FLOPs_and_Params(model=model_copy, size=train_size) + # model_copy.trainable = True + # model_copy.train() + # keep training + if args.resume is not None: + print('keep training model: %s' % (args.resume)) + model.load_state_dict(torch.load(args.resume, map_location=device)) + + # EMA + ema = ModelEMA(model) if args.ema else None + # use tfboard + tblogger = None + if args.tfboard: + print('use tensorboard') + from torch.utils.tensorboard import SummaryWriter + c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + log_path = os.path.join('log/', args.dataset, c_time) + os.makedirs(log_path, exist_ok=True) + + tblogger = SummaryWriter(log_path) + # optimizer setup + base_lr = args.lr + tmp_lr = args.lr + if args.optimizer == 'NpuFusedSGD': + print('use SGD with momentum ...') + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) + # optimizer = optim.SGD(model.parameters(), + # lr=tmp_lr, + # momentum=0.9, + # weight_decay=5e-4) + elif args.optimizer == 'adamw': + print('use AdamW ...') + optimizer = optim.AdamW(model.parameters(), + lr=tmp_lr, + weight_decay=5e-4) + + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) + + # DDP + if args.distributed and args.num_gpu > 1: + print('using DDP ...') + model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) + + + + + batch_size = args.batch_size + epoch_size = len(dataset) // (batch_size * args.num_gpu) + best_map = -100. 
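+    # Training loop below: the lr is warmed up as base_lr * (ni / nw)**4 for the
+    # first wp_epoch epochs and then follows the step or cosine schedule; when
+    # --multi_scale is set the input size is re-sampled every 10 iterations
+    # (multiples of 32); losses are scaled through apex AMP, the optimizer steps
+    # every `accumulate` iterations with optional grad clipping, and the EMA
+    # weights are refreshed after each step when --ema is enabled.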
+ warmup = not args.no_warmup + + t0 = time.time() + # start training loop + for epoch in range(args.start_epoch, args.max_epoch): + if args.distributed: + dataloader.sampler.set_epoch(epoch) + + # use step lr decay + if args.lr_schedule == 'step': + if epoch in args.lr_epoch: + tmp_lr = tmp_lr * 0.1 + set_lr(optimizer, tmp_lr) + # use cos lr decay + elif args.lr_schedule == 'cos' and not warmup: + T_max = args.max_epoch - 15 + lr_min = base_lr * 0.1 * 0.1 + if epoch > T_max: + # Cos decay is done + print('Cosine annealing is over !!') + args.lr_schedule == None + tmp_lr = lr_min + set_lr(optimizer, tmp_lr) + else: + tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) + set_lr(optimizer, tmp_lr) + fps_sum=0 + # train one epoch + # pre_flag = False + # start_time = time.time() + for iter_i, (images, targets) in enumerate(dataloader): + # if iter_i == 5: + # start_time = time.time() + # with torch.autograd.profiler.profile(use_npu=True) as prof: + ni = iter_i + epoch * epoch_size + # warmup + if epoch < args.wp_epoch and warmup: + nw = args.wp_epoch * epoch_size + tmp_lr = base_lr * pow(ni / nw, 4) + set_lr(optimizer, tmp_lr) + + elif epoch == args.wp_epoch and iter_i == 0 and warmup: + # warmup is over + print('Warmup is over !!') + warmup = False + tmp_lr = base_lr + set_lr(optimizer, tmp_lr) + + # multi-scale trick + if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: + # randomly choose a new size + r = args.multi_scale_range + train_size = random.randint(r[0], r[1]) * 32 + model.set_grid(train_size) + if args.multi_scale: + # interpolate + images = torch.nn.functional.interpolate( + input=images, + size=train_size, + mode='bilinear', + align_corners=False) + + targets = [label.tolist() for label in targets] + # visualize target + if args.vis_data: + vis_data(images, targets) + continue + # make labels + targets = create_labels.gt_creator( + img_size=train_size, + strides=net.stride, + label_lists=targets, + anchor_size=cfg["anchor_size"], + multi_anchor=args.multi_anchor, + center_sample=args.center_sample) + # visualize assignment + if args.vis_targets: + vis_targets(images, targets, cfg["anchor_size"], net.stride) + continue + + # to device + images = images.to(device) + targets = targets.to(device) + + # inference + pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) + + # compute loss + loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) + + # check loss + if torch.isnan(total_loss): + continue + + loss_dict = dict( + loss_obj=loss_obj, + loss_cls=loss_cls, + loss_reg=loss_reg, + total_loss=total_loss + ) + loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) + + total_loss = total_loss / args.accumulate + # Backward and Optimize + with amp.scale_loss(total_loss , optimizer) as scaled_loss: + scaled_loss.backward() + if ni % args.accumulate == 0: + if args.grad_clip is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + optimizer.zero_grad() + + if args.ema: + ema.update(model) + + # display + # if iter_i % 10 == 0: + if args.tfboard: + # viz loss + tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) + tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) + tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) + + t1 = time.time() + print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]' + % (epoch+1, + args.max_epoch, + 
iter_i, + epoch_size, + tmp_lr, + loss_dict['loss_obj'].item(), + loss_dict['loss_cls'].item(), + loss_dict['loss_reg'].item(), + train_size, + t1-t0), + flush=True) + fps_sum = fps_sum + (batch_size*8 / (t1 - t0)) + t0 = time.time() + # if local_rank in [-1, 0]: + # epoch_time = time.time() - start_time + # if iter_i >= 5: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time))) + # else: + # print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time))) + if iter_i > 0 and iter_i == 461: + fps_avg = fps_sum / 461 + print("fps:",fps_avg) + fps_sum = 0 + + # evaluation + if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch: + if evaluator is None: + print('No evaluator ...') + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '.pth')) + print('Keep training ...') + else: + print('eval ...') + # check ema + if args.ema: + model_eval = ema.ema + else: + model_eval = model.module if args.distributed else model + + # set eval mode + model_eval.trainable = False + model_eval.set_grid(val_size) + model_eval.eval() + + if local_rank == 0: + # evaluate + evaluator.evaluate(model_eval) + + cur_map = evaluator.map + if cur_map > best_map: + # update best-map + best_map = cur_map + # save model + print('Saving state, epoch:', epoch + 1) + torch.save(model_eval.state_dict(), os.path.join(path_to_save, + args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth')) + if args.tfboard: + if args.dataset == 'voc': + tblogger.add_scalar('07test/mAP', evaluator.map, epoch) + elif args.dataset == 'coco': + tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) + tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) + + if args.distributed: + # wait for all processes to synchronize + dist.barrier() + + # set train mode. + model_eval.trainable = True + model_eval.set_grid(train_size) + model_eval.train() + + # close mosaic augmentation + if args.mosaic and args.max_epoch - epoch == 15: + print('close Mosaic Augmentation ...') + dataloader.dataset.mosaic = False + # close mixup augmentation + if args.mixup and args.max_epoch - epoch == 15: + print('close Mixup Augmentation ...') + dataloader.dataset.mixup = False + + if args.tfboard: + tblogger.close() + + +def build_dataset(args, train_size, val_size, device): + if args.dataset == 'voc': + data_dir = os.path.join(args.root, 'VOCdevkit') + num_classes = 20 + dataset = VOCDetection( + data_dir=data_dir, + img_size=train_size, + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = VOCAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size)) + + elif args.dataset == 'coco': + data_dir = os.path.join(args.root, 'COCO') + num_classes = 80 + dataset = COCODataset( + data_dir=data_dir, + img_size=train_size, + image_set='train2017', + transform=TrainTransforms(train_size), + color_augment=ColorTransforms(train_size), + mosaic=args.mosaic, + mixup=args.mixup) + + evaluator = COCOAPIEvaluator( + data_dir=data_dir, + img_size=val_size, + device=device, + transform=ValTransforms(val_size) + ) + + else: + print('unknow dataset !! 
Only support voc and coco !!') + exit(0) + + return dataset, evaluator, num_classes + + +def build_dataloader(args, dataset, collate_fn=None): + # distributed + if args.distributed and args.num_gpu > 1: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True, + sampler=torch.utils.data.distributed.DistributedSampler(dataset) + ) + + else: + # dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=True, + batch_size=args.batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True + ) + return dataloader + + +def set_lr(optimizer, lr): + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +if __name__ == '__main__': + train() + diff --git a/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py b/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py new file mode 100644 index 0000000000..572b4aced1 --- /dev/null +++ b/PyTorch/contrib/cv/detection/YoloV2-640/train8p.py @@ -0,0 +1,559 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +from __future__ import division + +import os +import argparse +import time +import math +import random +from copy import deepcopy +import apex +from apex import amp +import torch +import torch_npu +import torch.optim as optim +import torch.backends.cudnn as cudnn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +import sys +from config.yolo_config import yolo_config +from data.voc import VOCDetection +from data.coco import COCODataset +from data.transforms import TrainTransforms, ColorTransforms, ValTransforms + +from utils import distributed_utils +from utils import create_labels +from utils.vis import vis_data, vis_targets +from utils.com_flops_params import FLOPs_and_Params +from utils.criterion import build_criterion +from utils.misc import detection_collate +from utils.misc import ModelEMA +from utils.criterion import build_criterion + +from models.yolo import build_model + +from evaluator.cocoapi_evaluator import COCOAPIEvaluator +from evaluator.vocapi_evaluator import VOCAPIEvaluator + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO Detection') + # basic + parser.add_argument('--npu', action='store_true', default=False, + help='use npu.') + parser.add_argument('--batch_size', default=16, type=int, + help='Batch size for training') + parser.add_argument('--lr', default=1e-3, type=float, + help='initial learning rate') + parser.add_argument('--img_size', type=int, default=640, + help='The upper bound of warm-up') + parser.add_argument('--multi_scale_range', nargs='+', default=[10, 20], type=int, + help='lr epoch to decay') + parser.add_argument('--max_epoch', type=int, default=200, + help='The upper bound of warm-up') + parser.add_argument('--lr_epoch', nargs='+', 
default=[100, 150], type=int, + help='lr epoch to decay') + parser.add_argument('--wp_epoch', type=int, default=2, + help='The upper bound of warm-up') + parser.add_argument('--start_epoch', type=int, default=0, + help='start epoch to train') + parser.add_argument('-r', '--resume', default=None, type=str, + help='keep training') + parser.add_argument('--num_workers', default=8, type=int, + help='Number of workers used in dataloading') + parser.add_argument('--num_gpu', default=1, type=int, + help='Number of GPUs to train') + parser.add_argument('--eval_epoch', type=int, + default=10, help='interval between evaluations') + parser.add_argument('--tfboard', action='store_true', default=False, + help='use tensorboard') + parser.add_argument('--save_folder', default='weights/', type=str, + help='path to save weight') + parser.add_argument('--vis_data', action='store_true', default=False, + help='visualize images and labels.') + parser.add_argument('--vis_targets', action='store_true', default=False, + help='visualize assignment.') + + # Optimizer & Schedule + parser.add_argument('--optimizer', default='NpuFusedSGD', type=str, + help='sgd, adamw') + parser.add_argument('--lr_schedule', default='step', type=str, + help='step, cos') + parser.add_argument('--grad_clip', default=None, type=float, + help='clip gradient') + + # model + parser.add_argument('-m', '--model', default='yolov1', + help='yolov1, yolov2, yolov3, yolov3_spp, yolov3_de, ' + 'yolov4, yolo_tiny, yolo_nano') + parser.add_argument('--conf_thresh', default=0.001, type=float, + help='NMS threshold') + parser.add_argument('--nms_thresh', default=0.5, type=float, + help='NMS threshold') + + # dataset + parser.add_argument('--root', default='/mnt/share/ssd2/dataset', + help='data root') + parser.add_argument('-d', '--dataset', default='coco', + help='coco, widerface, crowdhuman') + + # Loss + parser.add_argument('--loss_obj_weight', default=1.0, type=float, + help='weight of obj loss') + parser.add_argument('--loss_cls_weight', default=1.0, type=float, + help='weight of cls loss') + parser.add_argument('--loss_reg_weight', default=1.0, type=float, + help='weight of reg loss') + parser.add_argument('--scale_loss', default='batch', type=str, + help='scale loss: batch or positive samples') + + # train trick + parser.add_argument('--no_warmup', action='store_true', default=False, + help='do not use warmup') + parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, + help='use multi-scale trick') + parser.add_argument('--ema', action='store_true', default=False, + help='use ema training trick') + parser.add_argument('--mosaic', action='store_true', default=False, + help='use Mosaic Augmentation trick') + parser.add_argument('--mixup', action='store_true', default=False, + help='use MixUp Augmentation trick') + parser.add_argument('--multi_anchor', action='store_true', default=False, + help='use multiple anchor boxes as the positive samples') + parser.add_argument('--center_sample', action='store_true', default=False, + help='use center sample for labels') + parser.add_argument('--accumulate', type=int, default=1, + help='accumulate gradient') + # DDP train + parser.add_argument('-dist', '--distributed', action='store_true', default=False, + help='distributed training') + parser.add_argument('--local_rank', type=int, default=0, + help='local_rank') + parser.add_argument('--sybn', action='store_true', default=False, + help='use sybn.') + parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using 
in amp, default O1') + + return parser.parse_args() + + +def train(): + args = parse_args() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # torch.npu.set_compile_mode(jit_compile=False) + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"]="enable" + option["ACL_OP_COMPILER_CACHE_DIR"]="./kernel_meta" + option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Maximum,Conv2D,BNInfer,BNTrainingReduceGrad,Cast" + print("option:",option) + # torch.npu.set_option(option) + print("Setting Arguments.. : ", args) + print("----------------------------------------------------------") + + # path to save model + path_to_save = os.path.join(args.save_folder, args.dataset, args.model) + os.makedirs(path_to_save, exist_ok=True) + + # set distributed + local_rank = 0 + if args.distributed: + dist.init_process_group(backend="hccl", #init_method="env://" + ) + local_rank = torch.distributed.get_rank() + print(local_rank) + torch_npu.npu.set_device(local_rank) + + # cuda + if args.npu: + print('use npu') + cudnn.benchmark = True + device = torch.device("npu") + else: + device = torch.device("cpu") + + # YOLO config + cfg = yolo_config[args.model] + train_size = val_size = args.img_size + + # dataset and evaluator + dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) + # dataloader + dataloader = build_dataloader(args, dataset, detection_collate) + # criterioin + criterion = build_criterion(args, cfg, num_classes) + + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + print("----------------------------------------------------------") + + # build model + net = build_model(args=args, + cfg=cfg, + device=device, + num_classes=num_classes, + trainable=True) + model = net + + # SyncBatchNorm + # if args.sybn and args.npu and args.num_gpu > 1: + # print('use SyncBatchNorm ...') + # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + model = model.to(device).train() + # compute FLOPs and Params + # if local_rank == 0: + # model_copy = deepcopy(model) + # model_copy.trainable = False + # model_copy.eval() + # FLOPs_and_Params(model=model_copy, size=train_size) + # model_copy.trainable = True + # model_copy.train() + # keep training + if args.resume is not None: + print('keep training model: %s' % (args.resume)) + model.load_state_dict(torch.load(args.resume, map_location=device)) + + # EMA + ema = ModelEMA(model) if args.ema else None + # use tfboard + tblogger = None + if args.tfboard: + print('use tensorboard') + from torch.utils.tensorboard import SummaryWriter + c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + log_path = os.path.join('log/', args.dataset, c_time) + os.makedirs(log_path, exist_ok=True) + + tblogger = SummaryWriter(log_path) + # optimizer setup + base_lr = args.lr + tmp_lr = args.lr + if args.optimizer == 'NpuFusedSGD': + print('use SGD with momentum ...') + optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=args.lr, momentum=0.9) + # optimizer = optim.SGD(model.parameters(), + # lr=tmp_lr, + # momentum=0.9, + # weight_decay=5e-4) + elif args.optimizer == 'adamw': + print('use AdamW ...') + optimizer = optim.AdamW(model.parameters(), + lr=tmp_lr, + weight_decay=5e-4) + + model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale=128.0,combine_grad=True) + + # DDP + if args.distributed and args.num_gpu > 1: + print('using DDP ...') + model = DDP(model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False) + + + + + 
batch_size = args.batch_size + epoch_size = len(dataset) // (batch_size * args.num_gpu) + best_map = -100. + warmup = not args.no_warmup + + t0 = time.time() + # start training loop + for epoch in range(args.start_epoch, args.max_epoch): + if args.distributed: + dataloader.sampler.set_epoch(epoch) + + # use step lr decay + if args.lr_schedule == 'step': + if epoch in args.lr_epoch: + tmp_lr = tmp_lr * 0.1 + set_lr(optimizer, tmp_lr) + # use cos lr decay + elif args.lr_schedule == 'cos' and not warmup: + T_max = args.max_epoch - 15 + lr_min = base_lr * 0.1 * 0.1 + if epoch > T_max: + # Cos decay is done + print('Cosine annealing is over !!') + args.lr_schedule == None + tmp_lr = lr_min + set_lr(optimizer, tmp_lr) + else: + tmp_lr = lr_min + 0.5*(base_lr - lr_min)*(1 + math.cos(math.pi*epoch / T_max)) + set_lr(optimizer, tmp_lr) + fps_sum=0 + # train one epoch + # pre_flag = False + # start_time = time.time() + for iter_i, (images, targets) in enumerate(dataloader): + # if iter_i == 5: + # start_time = time.time() + # with torch.autograd.profiler.profile(use_npu=True) as prof: + ni = iter_i + epoch * epoch_size + # warmup + if epoch < args.wp_epoch and warmup: + nw = args.wp_epoch * epoch_size + tmp_lr = base_lr * pow(ni / nw, 4) + set_lr(optimizer, tmp_lr) + + elif epoch == args.wp_epoch and iter_i == 0 and warmup: + # warmup is over + print('Warmup is over !!') + warmup = False + tmp_lr = base_lr + set_lr(optimizer, tmp_lr) + + # multi-scale trick + if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: + # randomly choose a new size + r = args.multi_scale_range + train_size = random.randint(r[0], r[1]) * 32 + model.module.set_grid(train_size) + if args.multi_scale: + # interpolate + images = torch.nn.functional.interpolate( + input=images, + size=train_size, + mode='bilinear', + align_corners=False) + + targets = [label.tolist() for label in targets] + # visualize target + if args.vis_data: + vis_data(images, targets) + continue + # make labels + targets = create_labels.gt_creator( + img_size=train_size, + strides=net.stride, + label_lists=targets, + anchor_size=cfg["anchor_size"], + multi_anchor=args.multi_anchor, + center_sample=args.center_sample) + # visualize assignment + if args.vis_targets: + vis_targets(images, targets, cfg["anchor_size"], net.stride) + continue + + # to device + images = images.to(device) + targets = targets.to(device) + + # inference + pred_obj, pred_cls, pred_iou, targets = model(images, targets=targets) + + # compute loss + loss_obj, loss_cls, loss_reg, total_loss = criterion(pred_obj, pred_cls, pred_iou, targets) + + # check loss + if torch.isnan(total_loss): + continue + + loss_dict = dict( + loss_obj=loss_obj, + loss_cls=loss_cls, + loss_reg=loss_reg, + total_loss=total_loss + ) + loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) + + total_loss = total_loss / args.accumulate + # Backward and Optimize + with amp.scale_loss(total_loss , optimizer) as scaled_loss: + scaled_loss.backward() + if ni % args.accumulate == 0: + if args.grad_clip is not None: + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + optimizer.step() + optimizer.zero_grad() + + if args.ema: + ema.update(model) + + # display + # if iter_i % 10 == 0: + if args.tfboard: + # viz loss + tblogger.add_scalar('loss obj', loss_dict_reduced['loss_obj'].item(), ni) + tblogger.add_scalar('loss cls', loss_dict_reduced['loss_cls'].item(), ni) + tblogger.add_scalar('loss reg', loss_dict_reduced['loss_reg'].item(), ni) + + t1 = time.time() + print('[Epoch %d/%d][Iter 
+            print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: obj %.2f || cls %.2f || reg %.2f || size %d || time: %.2f]'
+                    % (epoch+1,
+                       args.max_epoch,
+                       iter_i,
+                       epoch_size,
+                       tmp_lr,
+                       loss_dict['loss_obj'].item(),
+                       loss_dict['loss_cls'].item(),
+                       loss_dict['loss_reg'].item(),
+                       train_size,
+                       t1-t0),
+                    flush=True)
+            # throughput in images/s, assuming 8 devices (hence the hard-coded factor of 8)
+            fps_sum = fps_sum + (batch_size*8 / (t1 - t0))
+            t0 = time.time()
+            # if local_rank in [-1, 0]:
+            #     epoch_time = time.time() - start_time
+            #     if iter_i >= 5:
+            #         print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1 - 5) / (epoch_time)))
+            #     else:
+            #         print('Training speed is {} FPS'.format(batch_size * 8 * (iter_i + 1) / (epoch_time)))
+            if iter_i == 461:
+                # report the averaged FPS once per epoch
+                fps_avg = fps_sum / 461
+                print("fps:", fps_avg)
+                fps_sum = 0
+
+        # evaluation
+        if (epoch + 1) % args.eval_epoch == 0 or (epoch + 1) == args.max_epoch:
+            # check ema: select the model to evaluate/save before branching on the evaluator
+            if args.ema:
+                model_eval = ema.ema
+            else:
+                model_eval = model.module if args.distributed and args.num_gpu > 1 else model
+
+            if evaluator is None:
+                print('No evaluator ...')
+                print('Saving state, epoch:', epoch + 1)
+                torch.save(model_eval.state_dict(), os.path.join(path_to_save,
+                            args.model + '_' + repr(epoch + 1) + '.pth'))
+                print('Keep training ...')
+            else:
+                print('eval ...')
+                # set eval mode
+                model_eval.trainable = False
+                model_eval.set_grid(val_size)
+                model_eval.eval()
+
+                if local_rank == 0:
+                    # evaluate
+                    evaluator.evaluate(model_eval)
+
+                    cur_map = evaluator.map
+                    if cur_map > best_map:
+                        # update best-map
+                        best_map = cur_map
+                        # save model
+                        print('Saving state, epoch:', epoch + 1)
+                        torch.save(model_eval.state_dict(), os.path.join(path_to_save,
+                                    args.model + '_' + repr(epoch + 1) + '_' + str(round(best_map*100, 2)) + '.pth'))
+                    if args.tfboard:
+                        if args.dataset == 'voc':
+                            tblogger.add_scalar('07test/mAP', evaluator.map, epoch)
+                        elif args.dataset == 'coco':
+                            tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch)
+                            tblogger.add_scalar('val/AP50', evaluator.ap50, epoch)
+
+                if args.distributed:
+                    # wait for all processes to synchronize
+                    dist.barrier()
+
+                # set train mode.
+                model_eval.trainable = True
+                model_eval.set_grid(train_size)
+                model_eval.train()
+
+        # close mosaic augmentation
+        if args.mosaic and args.max_epoch - epoch == 15:
+            print('close Mosaic Augmentation ...')
+            dataloader.dataset.mosaic = False
+        # close mixup augmentation
+        if args.mixup and args.max_epoch - epoch == 15:
+            print('close Mixup Augmentation ...')
+            dataloader.dataset.mixup = False
+
+    if args.tfboard:
+        tblogger.close()
+
+
+def build_dataset(args, train_size, val_size, device):
+    if args.dataset == 'voc':
+        data_dir = os.path.join(args.root, 'VOCdevkit')
+        num_classes = 20
+        dataset = VOCDetection(
+                        data_dir=data_dir,
+                        img_size=train_size,
+                        transform=TrainTransforms(train_size),
+                        color_augment=ColorTransforms(train_size),
+                        mosaic=args.mosaic,
+                        mixup=args.mixup)
+
+        evaluator = VOCAPIEvaluator(
+                        data_dir=data_dir,
+                        img_size=val_size,
+                        device=device,
+                        transform=ValTransforms(val_size))
+
+    elif args.dataset == 'coco':
+        data_dir = os.path.join(args.root, 'COCO')
+        num_classes = 80
+        dataset = COCODataset(
+                        data_dir=data_dir,
+                        img_size=train_size,
+                        image_set='train2017',
+                        transform=TrainTransforms(train_size),
+                        color_augment=ColorTransforms(train_size),
+                        mosaic=args.mosaic,
+                        mixup=args.mixup)
+
+        evaluator = COCOAPIEvaluator(
+                        data_dir=data_dir,
+                        img_size=val_size,
+                        device=device,
+                        transform=ValTransforms(val_size)
+                        )
+
+    else:
+        print('unknown dataset !! Only support voc and coco !!')
+        exit(0)
+
+    return dataset, evaluator, num_classes
+
+
+def build_dataloader(args, dataset, collate_fn=None):
+    # distributed
+    if args.distributed and args.num_gpu > 1:
+        # dataloader
+        dataloader = torch.utils.data.DataLoader(
+                        dataset=dataset,
+                        batch_size=args.batch_size,
+                        collate_fn=collate_fn,
+                        num_workers=args.num_workers,
+                        pin_memory=True,
+                        sampler=torch.utils.data.distributed.DistributedSampler(dataset)
+                        )
+
+    else:
+        # dataloader
+        dataloader = torch.utils.data.DataLoader(
+                        dataset=dataset,
+                        shuffle=True,
+                        batch_size=args.batch_size,
+                        collate_fn=collate_fn,
+                        num_workers=args.num_workers,
+                        pin_memory=True
+                        )
+    return dataloader
+
+
+def set_lr(optimizer, lr):
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+
+
+if __name__ == '__main__':
+    train()
+
--
Gitee
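For reference, the learning-rate policy driven by set_lr() in the training loop above (a quartic warmup over the first wp_epoch epochs, then, with --lr_schedule 'cos', cosine annealing from base_lr down to 1% of base_lr over max_epoch - 15 epochs, held at that minimum afterwards) can be written as a small standalone helper. The sketch below is illustrative only; the helper name lr_at and the example values are not part of the patch.

    import math

    def lr_at(epoch, iter_i, epoch_size, base_lr, wp_epoch, max_epoch):
        """Approximate lr produced by the warmup + cosine branches of the loop above."""
        ni = iter_i + epoch * epoch_size
        if epoch < wp_epoch:
            # quartic warmup: ramp from 0 to base_lr over the warmup epochs
            nw = wp_epoch * epoch_size
            return base_lr * pow(ni / nw, 4)
        T_max = max_epoch - 15            # cosine period; the last 15 epochs stay at lr_min
        lr_min = base_lr * 0.1 * 0.1      # final lr is 1% of base_lr
        if epoch > T_max:
            return lr_min
        return lr_min + 0.5 * (base_lr - lr_min) * (1 + math.cos(math.pi * epoch / T_max))

    # example (arbitrary values): base_lr=1e-3, 2 warmup epochs, 160 epochs, 462 iters/epoch
    print(lr_at(epoch=10, iter_i=0, epoch_size=462, base_lr=1e-3, wp_epoch=2, max_epoch=160))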