From 74670bcd04cbc063a0835e3cae61eac1c2d0d148 Mon Sep 17 00:00:00 2001 From: koervcor <1015296415@qq.com> Date: Fri, 17 Feb 2023 14:57:59 +0800 Subject: [PATCH 1/2] Update ConvNeXt --- .../ConvNeXt/1.5_requirements.txt | 8 + .../ConvNeXt/1.8_requirements.txt | 8 + .../cv/classification/ConvNeXt/README.md | 249 +++++++++++++----- .../cv/classification/ConvNeXt/engine.py | 17 +- .../cv/classification/ConvNeXt/main.py | 3 +- 5 files changed, 201 insertions(+), 84 deletions(-) create mode 100644 PyTorch/contrib/cv/classification/ConvNeXt/1.5_requirements.txt create mode 100644 PyTorch/contrib/cv/classification/ConvNeXt/1.8_requirements.txt diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/1.5_requirements.txt b/PyTorch/contrib/cv/classification/ConvNeXt/1.5_requirements.txt new file mode 100644 index 0000000000..da9be54422 --- /dev/null +++ b/PyTorch/contrib/cv/classification/ConvNeXt/1.5_requirements.txt @@ -0,0 +1,8 @@ +six==1.15.0 +timm==0.3.2 +torchvision==0.2.2.post3 +pillow==8.4.0 +tensorboardX==2.5 +decorator +protobuf==3.20.1 +sympy diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/1.8_requirements.txt b/PyTorch/contrib/cv/classification/ConvNeXt/1.8_requirements.txt new file mode 100644 index 0000000000..f30b2205ef --- /dev/null +++ b/PyTorch/contrib/cv/classification/ConvNeXt/1.8_requirements.txt @@ -0,0 +1,8 @@ +six==1.15.0 +timm==0.3.2 +torchvision==0.9.1 +pillow==9.1.0 +tensorboardX==2.5 +decorator +protobuf==3.20.1 +sympy diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/README.md b/PyTorch/contrib/cv/classification/ConvNeXt/README.md index 9764a62ba4..adf519c6a7 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/README.md +++ b/PyTorch/contrib/cv/classification/ConvNeXt/README.md @@ -1,69 +1,180 @@ -# ConvNext_for_PyTorch - -This implements training ConvNext of on the ImageNet dataset, mainly modified from https://github.com/facebookresearch/ConvNeXt.git - -## ConvNext_for_PyTorch Detail - -As of the current date, Ascend-Pytorch is 
still inefficient for contiguous operations. - - - -## Requirements -- pip install -r requirements.txt -- pip install torch==1.8.1+ascend.rc2.20220505;torchvision==0.9.1;torch-npu 1.8.1rc2.post20220505; -- Download the ImageNet dataset from http://www.image-net.org/ - - Then, and move validation images to labeled subfolders, using [the following shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh) -## timm -将timm_need目录下的文件替换到timm的安装目录下 -```bash - -cd ../ConvNeXt -/bin/cp -f timm_need/mixup.py ../timm/data/mixup.py -/bin/cp -f timm_need/model_ema.py ../timm/utils/model_ema.py - -``` -## 软件包 -- 910版本 -- CANN toolkit_5.1.RC1 -- torch 1.8.1+ascend.rc2.20220505 -- 固件驱动 22.0.0 - -## Training - -To train a model, run `main.py` with the desired model architecture and the path to the ImageNet dataset: - -```bash -# training 1p accuracy -bash ./test/train_full_1p.sh --data_path=real_data_path - -# training 1p performance -bash ./test/train_performance_1p.sh --data_path=real_data_path - -# training 8p accuracy -bash ./test/train_full_8p.sh --data_path=real_data_path - -# training 8p performance -bash ./test/train_performance_8p.sh --data_path=real_data_path - -# eval -bash test/train_eval_8p.sh --data_path=real_data_path - -# finetuning -bash test/train_finetune_1p.sh --data_path=real_data_path - - -``` - -## ConvNext_for_PyTorch training result - -| Acc@1 | FPS | Npu_nums | Epochs | AMP_Type | -| :-----: | :------: | :------: | :----: | :------: | -| - | 115.10 | 1 | 300 | O1 | -| 82.049 | 259.85 | 8 | 300 | O1 | - - - - - - - +# ConvNeXt for PyTorch + +- [概述](概述.md) +- [准备训练环境](准备训练环境.md) +- [开始训练](开始训练.md) +- [训练结果展示](训练结果展示.md) +- [版本说明](版本说明.md) + + + +# 概述 + +## 简述 + +ConvNeXt是基于ResNet-50网络结构,并参考Transformer网络的设计思想和技术引入到CNN网络模块中以结合这两种网络的优势,从而提高CNN网络的性能表现。其进行的优化设计主要有以下几点: 1.Macro design 2.ResNeXt 3.Inverted bottleneck 4. 
Large kernel size 5.Various layer-wise Micro designs + +- 参考实现: + + ``` + url=https://github.com/facebookresearch/ConvNeXt.git + commit_id=b9dd6b5a0885c381e7e7cd99ba64d1b61041bdfc + ``` + +- 适配昇腾 AI 处理器的实现: + + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/contrib/cv/classification + ``` + + +# 准备训练环境 + +## 准备环境 + +- 当前模型支持的 PyTorch 版本和已知三方库依赖如下表所示。 + + **表 1** 版本支持表 + + | Torch_Version | 三方库依赖版本 | + | :--------: | :----------------------------------------------------------: | + | PyTorch 1.5 | torchvision==0.2.2.post3;pillow==8.4.0 | + | PyTorch 1.8 | torchvision==0.9.1;pillow==9.1.0 | + +- 环境准备指导。 + + 请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + +- 安装依赖。 + + 在模型源码包根目录下执行命令,安装模型对应PyTorch版本需要的依赖。 + ``` + pip install -r 1.5_requirements.txt # PyTorch1.5版本 + + pip install -r 1.8_requirements.txt # PyTorch1.8版本 + ``` + > **说明:** + >只需执行一条对应的PyTorch版本依赖安装命令。 + +- timm 配置。 + + 将 `timm_need` 目录下的文件替换到 `timm` 的安装目录下。 + ```bash + cd ../ConvNeXt + /bin/cp -f timm_need/mixup.py ../timm/data/mixup.py + ``` + +## 准备数据集 + +1. 获取数据集。 + + 用户自行获取 `ImageNet` 数据集,将数据集上传到服务器任意路径下并解压。 + + 数据集目录结构参考如下所示。 + + ``` + ├── ImageNet + ├──train + ├──类别1 + │──图片1 + │──图片2 + │ ... + ├──类别2 + │──图片1 + │──图片2 + │ ... + ├──... + ├──val + ├──类别1 + │──图片1 + │──图片2 + │ ... + ├──类别2 + │──图片1 + │──图片2 + │ ... + ``` + > **说明:** + >该数据集的训练过程脚本只作为一种参考示例。 + +# 开始训练 + +## 训练模型 + +1. 进入解压后的源码包根目录。 + + ``` + cd /${模型文件夹名称} + ``` + +2. 
运行训练脚本。 + + 该模型支持单机单卡训练和单机8卡训练。 + + - 单机单卡训练 + + 启动单卡训练。 + + ``` + bash ./test/train_full_1p.sh --data_path=/data/xxx/ # 单卡精度 + bash ./test/train_performance_1p.sh --data_path=/data/xxx/ # 单卡性能 + ``` + + - 单机8卡训练 + + 启动8卡训练。 + + ``` + bash ./test/train_full_8p.sh --data_path=/data/xxx/ # 8卡精度 + bash ./test/train_performance_8p.sh --data_path=/data/xxx/ # 8卡性能 + ``` + + - 单机8卡评测 + + 启动8卡评测。 + + ``` + bash ./test/train_eval_8p.sh --data_path=/data/xxx/ # 8卡评测 + ``` + + --data_path参数填写数据集路径,需写到数据集的一级目录。 + + 模型训练脚本参数说明如下。 + + ``` + 公共参数: + --data //数据集路径 + --model //模型训练名称,默认:convnext_tiny + --drop_path //防过拟合的比例,默认:0.1 + --batch_size //训练批次大小,默认:128 + --lr //学习率,默认:0.004 + --update_freq //梯度累积更新,默认:4 + --use_amp //是否使用自动混合精度,默认:True + --model_ema //是否开启权重滑动更新,默认:True + --model_ema_eval //是否在训练期间使用ema模型进行评估,默认:True + ``` + + 训练完成后,权重文件保存在当前路径下,并输出模型训练精度和性能信息。 + +# 训练结果展示 + +**表 2** 训练结果展示表 + +| NAME | Acc@1 | FPS | Epochs | AMP_Type | Torch_Version | +| :-----: | :---: | :--: | :----: | :------: | :-----------: | +| 1p-竞品V| - | - | 1 | - | 1.5 | +| 8p-竞品V| - | - | 300 | - | 1.5 | +| 1p-NPU | - | 119 | 1 | O2 | 1.8 | +| 8p-NPU | 82.05 | 261 | 300 | O2 | 1.8 | + + +# 版本说明 + +## 变更 + +2023.02.07:首次发布。 + +## FAQ + +无。 \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/engine.py b/PyTorch/contrib/cv/classification/ConvNeXt/engine.py index eb815e594a..3377fb7fb0 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/engine.py +++ b/PyTorch/contrib/cv/classification/ConvNeXt/engine.py @@ -80,14 +80,9 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, if (data_iter_step + 1) % update_freq == 0: optimizer.step() optimizer.zero_grad() - # if model_ema is not None: - # model_ema.update(model) if model_ema is not None: - if device.type == 'npu': - params_fp32_fused = optimizer.get_model_combined_params() - model_ema.update(model, params_fp32_fused[0],device) - else: - model_ema.update(model) + model_ema.update(model) + else: 
# full precision loss /= update_freq @@ -95,14 +90,8 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, if (data_iter_step + 1) % update_freq == 0: optimizer.step() optimizer.zero_grad() - # if model_ema is not None: - # model_ema.update(model) if model_ema is not None: - if device.type == 'npu': - params_fp32_fused = optimizer.get_model_combined_params() - model_ema.update(model, params_fp32_fused[0]) - else: - model_ema.update(model) + model_ema.update(model) torch.npu.synchronize() diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/main.py b/PyTorch/contrib/cv/classification/ConvNeXt/main.py index 919d2d666e..1f3cd9d32e 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/main.py +++ b/PyTorch/contrib/cv/classification/ConvNeXt/main.py @@ -11,7 +11,8 @@ import datetime import numpy as np import time import torch -import torch_npu +if torch.__version__ >= '1.8': + import torch_npu import torch.nn as nn import torch.backends.cudnn as cudnn import json -- Gitee From 166704b2fedfce2dab9422bb4a981ed544a78556 Mon Sep 17 00:00:00 2001 From: koervcor <1015296415@qq.com> Date: Sun, 23 Apr 2023 15:47:15 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9batchsize=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/contrib/cv/classification/ConvNeXt/README.md | 9 +++++---- PyTorch/contrib/cv/classification/ConvNeXt/main.py | 4 ++-- .../cv/classification/ConvNeXt/test/train_finetune_1p.sh | 2 +- .../cv/classification/ConvNeXt/test/train_full_1p.sh | 2 +- .../cv/classification/ConvNeXt/test/train_full_8p.sh | 2 +- .../classification/ConvNeXt/test/train_performance_1p.sh | 2 +- .../classification/ConvNeXt/test/train_performance_8p.sh | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/README.md b/PyTorch/contrib/cv/classification/ConvNeXt/README.md index adf519c6a7..5bf0bb3689 
100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/README.md +++ b/PyTorch/contrib/cv/classification/ConvNeXt/README.md @@ -165,16 +165,17 @@ ConvNeXt是基于ResNet-50网络结构,并参考Transformer网络的设计思 | :-----: | :---: | :--: | :----: | :------: | :-----------: | | 1p-竞品V| - | - | 1 | - | 1.5 | | 8p-竞品V| - | - | 300 | - | 1.5 | -| 1p-NPU | - | 119 | 1 | O2 | 1.8 | -| 8p-NPU | 82.05 | 261 | 300 | O2 | 1.8 | +| 1p-NPU | - | 370.60 | 1 | O2 | 1.8 | +| 8p-NPU | 82.05 | 2968.12 | 300 | O2 | 1.8 | # 版本说明 ## 变更 -2023.02.07:首次发布。 +2023.04.23: 更新内容重新发布。 +2022.08.09: 首次发布。 ## FAQ -无。 \ No newline at end of file +无。 diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/main.py b/PyTorch/contrib/cv/classification/ConvNeXt/main.py index 1f3cd9d32e..fdcf153c9b 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/main.py +++ b/PyTorch/contrib/cv/classification/ConvNeXt/main.py @@ -8,6 +8,8 @@ import argparse import datetime +import json +import os import numpy as np import time import torch @@ -15,8 +17,6 @@ if torch.__version__ >= '1.8': import torch_npu import torch.nn as nn import torch.backends.cudnn as cudnn -import json -import os from pathlib import Path diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_finetune_1p.sh b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_finetune_1p.sh index 61dc55f6cc..28396e76f7 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_finetune_1p.sh +++ b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_finetune_1p.sh @@ -14,7 +14,7 @@ data_path="" Network="ConvNext" #训练batch_size,,需要模型审视修改 -batch_size=16 +batch_size=128 diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_1p.sh index d638c75e5b..a64769abf6 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_1p.sh @@ -14,7 +14,7 @@ data_path="" Network="ConvNext" 
#训练batch_size,,需要模型审视修改 -batch_size=16 +batch_size=128 diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_8p.sh index 76a658166c..0d1e79974e 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_full_8p.sh @@ -94,7 +94,7 @@ e2e_time=$(( $end_time - $start_time )) #最后一个迭代FPS值 step_time=`grep -a 'time:' ${cur_path}/test/output/$ASCEND_DEVICE_ID/train_full_8p.log|awk -F "time: " '{print $NF}'|awk 'END {print}'| awk -F " data:" '{print $1}'` Acc=`grep -a 'Acc@1' ${cur_path}/test/output/$ASCEND_DEVICE_ID/train_full_8p.log | awk 'END {print}'| awk -F " " '{print $3}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${step_time}'}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * 8/'${step_time}'}'` #最后一个迭代loss值 loss=`grep -a 'loss:' ${cur_path}/test/output/$ASCEND_DEVICE_ID/train_full_8p.log | awk -F "loss:" '{print $NF}'| awk 'END {print}' | awk -F "(" '{print $1}'` diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_1p.sh index 27bdf5e293..3dcd1507da 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_1p.sh @@ -14,7 +14,7 @@ data_path="" Network="ConvNext" #训练batch_size,,需要模型审视修改 -batch_size=16 +batch_size=128 diff --git a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_8p.sh index 2a38738b3a..6c8c9c13f8 100644 --- a/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/classification/ConvNeXt/test/train_performance_8p.sh @@ -98,7 +98,7 @@ e2e_time=$(( $end_time - $start_time )) #最后一个迭代FPS值 step_time=`grep -a 'time:' 
${cur_path}/test/output/$ASCEND_DEVICE_ID/train_perf_8p.log|awk -F "time: " '{print $NF}'|awk 'END {print}'| awk -F " data:" '{print $1}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${step_time}'}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}' * 8/'${step_time}'}'` #最后一个迭代loss值 loss=`grep -a 'loss:' ${cur_path}/test/output/$ASCEND_DEVICE_ID/train_perf_8p.log | awk -F "loss:" '{print $NF}'| awk 'END {print}' | awk -F "(" '{print $1}'` -- Gitee