From 2d02f946509f4a5825d5ae780382b4a4f37f3f08 Mon Sep 17 00:00:00 2001 From: xuhan-cn Date: Thu, 25 Aug 2022 18:39:56 +0800 Subject: [PATCH 1/2] =?UTF-8?q?LetNet=E5=88=9D=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../contrib/cv/classification/LeNet/README.md | 157 ++++ .../contrib/cv/classification/LeNet/main.py | 680 ++++++++++++++++++ .../classification/LeNet/models/__init__.py | 0 .../cv/classification/LeNet/models/lenet_5.py | 36 + .../classification/LeNet/modelzoo_level.txt | 3 + .../cv/classification/LeNet/requirements.txt | 4 + .../cv/classification/LeNet/test/env_npu.sh | 76 ++ .../LeNet/test/train_eval_8p.sh | 132 ++++ .../LeNet/test/train_finetune_1p.sh | 155 ++++ .../LeNet/test/train_full_1p.sh | 143 ++++ .../LeNet/test/train_full_8p.sh | 134 ++++ .../LeNet/test/train_performance_1p.sh | 143 ++++ .../LeNet/test/train_performance_8p.sh | 136 ++++ 13 files changed, 1799 insertions(+) create mode 100644 PyTorch/contrib/cv/classification/LeNet/README.md create mode 100644 PyTorch/contrib/cv/classification/LeNet/main.py create mode 100644 PyTorch/contrib/cv/classification/LeNet/models/__init__.py create mode 100644 PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py create mode 100644 PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt create mode 100644 PyTorch/contrib/cv/classification/LeNet/requirements.txt create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh create mode 100644 PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh diff --git a/PyTorch/contrib/cv/classification/LeNet/README.md b/PyTorch/contrib/cv/classification/LeNet/README.md new file mode 100644 index 0000000000..fa2ee7871e --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/README.md @@ -0,0 +1,157 @@ +# LeNet for PyTorch + +- [概述](概述.md) +- [准备训练环境](准备训练环境.md) +- [开始训练](开始训练.md) +- [训练结果展示](训练结果展示.md) +- [版本说明](版本说明.md) + + + +# 概述 + +## 简述 + +Lenet是一个7层的神经网络,包含3个卷积层,2个池化层,1个全连接层。其中所有卷积层的所有卷积核都为5x5,步长strid=1,池化方法都为全局池化,激活函数为Sigmoid。 + + +- 参考实现: + + ``` + url=https://github.com/allegrofb/LeNet.git + ``` + +- 适配昇腾 AI 处理器的实现: + + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/contrib/cv/classification + ``` + +- 通过Git获取代码方法如下: + + ``` + git clone https://gitee.com/xuhan-cn/ModelZoo-PyTorch.git # 克隆仓库的代码 + cd ./ModelZoo-PyTorch/PyTorch/contrib/cv/classification # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 + ``` + +- 通过单击“立即下载”,下载源码包。 + +# 准备训练环境 + +## 准备环境 + +- 当前模型支持的固件与驱动、 CANN 以及 PyTorch 如下表所示。 + + **表 1** 版本配套表 + + | 配套 | 版本 | + | ---------- | ------------------------------------------------------------ | + | 固件与驱动 | [1.0.15](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) | + | CANN | [5.1.RC1](https://www.hiascend.com/software/cann/commercial?version=5.1.RC1) | + | PyTorch | [1.5.0](https://gitee.com/ascend/pytorch/tree/v1.5.0/) + +- 环境准备指导。 + + 请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + +- 安装依赖(根据模型需求,按需添加所需依赖)。 + + ``` + pip install -r requirements.txt + ``` + + +## 数据集 + +LeNet_5模型选用的数据集为MNIST手写数据集,数据集将在模型训练过程中自动下载。 + + + +## 获取预训练模型 + +预训练模型存放在'./checkpoint'文件夹下,训练过程中每5个epoch保存一次模型。 + +# 开始训练 + +## 训练模型 + +1. 进入解压后的源码包根目录。 + + ``` + cd ./LeNet + ``` + +2. 运行训练脚本。 + + 该模型支持单机单卡训练和单机8卡训练。 + + - 单机单卡训练 + + 启动单卡训练。 + + ``` + # training 1p accuracy + bash ./test/train_full_1p.sh --data_path + + # training 1p performance + bash ./test/train_performance_1p.sh --data_path + + # finetuning 1p + bash test/train_finetune_1p.sh --data_path --pth_path=real_pre_train_model_path + + ``` + + - 单机8卡训练 + + 启动8卡训练。 + + ``` + # training 8p accuracy + bash ./test/train_full_8p.sh --data_path + + # training 8p performance + bash ./test/train_performance_8p.sh --data_path + + #test 8p accuracy + bash test/train_eval_8p.sh --data_path --pth_path=real_pre_train_model_path + ``` + + --data\_path参数后无需填写数据集路径,因为MNIST数据集是自动下载的 + --pth\_path参数填写预训练模型路径。 + + 模型训练脚本参数说明如下。 + + ``` + 公共参数: + --data_path //数据集路径 + --addr //主机地址 + --Epoch //重复训练次数 + --batchSize //训练批次大小 + --lr //初始学习率,默认:0.012 + --momentum //动量,默认:0.9 + --weight_decay //权重衰减,默认:0.0001 + --resume //中断重新开始模型参数路径 + --start-epoch //开始训练epoch 默认1 + --pretrained //预训练模型路径 + --amp //是否使用混合精度 + --loss-scale //混合精度lossscale大小 + --opt-level //混合精度类型 + 多卡训练参数: + --multiprocessing-distributed //是否使用多卡训练 + --device-list '0,1,2,3,4,5,6,7' //多卡训练指定训练用卡 + ``` + + 训练完成后,权重文件保存在'./checkpoint'路径下,并输出模型训练精度和性能信息。 + +# 训练结果展示 + +**表 2** 训练结果展示表 + + +| NAME | Acc@1 | FPS | Epochs | AMP_Type | +| ------- | ----- | ----: | ------ | -------: | +| 1p-竞品 | 99.36 |38000 | 80 | - | +| 1p-NPU | 99.44 |21000 | 80 | O2 | +| 8p-竞品 | 99.07 |82000 | 80 | - | +| 8p-NPU | 99.10 |41000 | 80 | O2 | diff --git a/PyTorch/contrib/cv/classification/LeNet/main.py b/PyTorch/contrib/cv/classification/LeNet/main.py new file mode 100644 index 0000000000..22ef71ec76 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/main.py @@ -0,0 +1,680 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random +import shutil +import time +import warnings +import torch +import numpy as np +import apex +from apex import amp +import torch.nn as nn +import torch.nn.parallel +import torch.npu +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torch.nn.functional as F +from models.lenet_5 import LeNet5 + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--workers', default=8, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=80, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=2048, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.012, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=5, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--pth_path', default='', type=str, metavar='PATH', + help='path to pretrained checkpoint (default: none)') +parser.add_argument('--world-size', default=1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=0, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://127.0.0.1:30000', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='hccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--num_classes', default=10, type=int, + help='The number of classes.') +## for ascend 910 +parser.add_argument('--device', default='npu', type=str, help='npu or gpu') +parser.add_argument('--addr', default='127.0.0.1', + type=str, help='master addr') +parser.add_argument('--device_list', default='0,1,2,3,4,5,6,7', + type=str, help='device id list') +parser.add_argument('--warm_up_epochs', default=0, type=int, + help='warm up') +parser.add_argument('--amp', default=False, action='store_true', + help='use amp to train the model') +parser.add_argument('--loss-scale', default=128., type=float, + help='loss scale using in amp, default -1 means dynamic') +parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using in amp, default -1 means dynamic') +parser.add_argument('--prof', default=False, action='store_true', + help='use profiling to evaluate the performance of model') +args = parser.parse_args() +best_acc1 = 0 + +def device_id_to_process_device_map(device_list): + devices = device_list.split(",") + devices = [int(x) for x in devices] + devices.sort() + + process_device_map = dict() + for process_id, device_id in enumerate(devices): + process_device_map[process_id] = device_id + + return process_device_map + + +def main(): + print(args.device_list) + + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29688' + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + args.process_device_map = device_id_to_process_device_map(args.device_list) + + if args.device == 'npu': + ngpus_per_node = len(args.process_device_map) + else: + if args.distributed: + ngpus_per_node = torch.cuda.device_count() + else: + ngpus_per_node = 1 + print('ngpus_per_node:', ngpus_per_node) + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, + args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + +transforms = transforms.Compose([transforms.Resize((32, 32)), + transforms.ToTensor()]) +# Data loading code +if args.data: + train_dataset = datasets.MNIST(root=args.data, + train=True, + transform=transforms, + download=False) + val_dataset = datasets.MNIST(root=args.data, + train=False, + transform=transforms) +else: + train_dataset = datasets.MNIST(root='mnist_data', + train=True, + transform=transforms, + download=True) + val_dataset = datasets.MNIST(root='mnist_data', + train=False, + transform=transforms) + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = args.process_device_map[gpu] + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model LenNet_5") + model = LeNet5(args.num_classes) + print("loading model of yours...") + if args.pth_path: + print("load pth you give") + pretrained_dict = torch.load(args.pth_path, map_location="cpu")["state_dict"] + else: + pretrained_dict = torch.load("./model_best.pth.tar", map_location="cpu")["state_dict"] + + if "fc.weight" in pretrained_dict: + print("pop fc layer weight") + pretrained_dict.pop('fc.weight') + pretrained_dict.pop('fc.bias') + model.load_state_dict(pretrained_dict, strict=False) + else: + print("=> creating model LeNet_5") + model = LeNet5(args.num_classes) + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + model = model.to(loc) + else: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / args.world_size) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + else: + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + model = model.to(loc) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + print("[gpu id:", args.gpu, "]", + "============================test args.gpu is not None else==========================") + elif args.gpu is not None: + print("[gpu id:", args.gpu, "]", + "============================test elif args.gpu is not None:==========================") + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(args.gpu) + model = model.to(loc) + else: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + + else: + # DataParallel will divide and allocate batch_size to all available GPUs + print("[gpu id:", args.gpu, "]", "============================test 1==========================") + print("[gpu id:", args.gpu, "]", "============================test 3==========================") + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + else: + print("before : model = torch.nn.DataParallel(model).cuda()") + + # define loss function (criterion) and optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + #optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), args.lr, + # momentum=args.momentum, + # weight_decay=args.weight_decay) + + if args.amp: + model, optimizer = amp.initialize( + model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale, combine_grad=True) + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + if args.pretrained: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False, + find_unused_parameters=True) + else: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + else: + print("[gpu id:", args.gpu, "]", + "============================test args.gpu is not None else==========================") + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None: + print("[gpu id:", args.gpu, "]", + "============================test elif args.gpu is not None:==========================") + else: + # DataParallel will divide and allocate batch_size to all available GPUs + print("[gpu id:", args.gpu, "]", "============================test 1==========================") + print("[gpu id:", args.gpu, "]", "============================test 3==========================") + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + model = torch.nn.DataParallel(model).to(loc) + else: + model = torch.nn.DataParallel(model).cuda() + + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + criterion = nn.CrossEntropyLoss().to(loc) + else: + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + else: + # Map model to be loaded to specified single gpu. + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + else: + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader(dataset=train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=False, + sampler=train_sampler, + drop_last=True) + + + val_loader = torch.utils.data.DataLoader(dataset=val_dataset, + batch_size=args.batch_size, + shuffle=False) + + if args.evaluate: + validate(val_loader, model, criterion, args, ngpus_per_node) + return + + if args.prof: + profiling(train_loader, model, criterion, optimizer, args) + return + + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + adjust_learning_rate(optimizer, epoch, args) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, args, ngpus_per_node) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + if args.device == 'npu' and args.gpu == 0 and epoch == 19: + print("Complete 90 epoch training, take time:{}h".format(round((time.time() - start_time) / 3600.0, 2))) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + + ############## npu modify begin ############# + if args.amp: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': 'LeNet5', + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + 'amp': amp.state_dict(), + }, is_best) + else: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': 'LeNet', + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer': optimizer.state_dict(), + }, is_best) + ############## npu modify end ############# + + +def profiling(data_loader, model, criterion, optimizer, args): + # switch to train mode + model.train() + + def update(model, images, target, optimizer): + output, _ = model(images) + loss = criterion(output, target) + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.zero_grad() + optimizer.step() + + for step, (images, target) in enumerate(data_loader): + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + images = images.to(loc, non_blocking=True).to(torch.float) + target = target.to(torch.int32).to(loc, non_blocking=True) + else: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + if step < 5: + update(model, images, target, optimizer) + else: + if args.device == 'npu': + with torch.autograd.profiler.profile(use_npu=True) as prof: + update(model, images, target, optimizer) + else: + with torch.autograd.profiler.profile(use_cuda=True) as prof: + update(model, images, target, optimizer) + break + + prof.export_chrome_trace("output.prof") + + +def train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + images = images.to(loc, non_blocking=True).to(torch.float) + target = target.to(torch.int32).to(loc, non_blocking=True) + else: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output, _ = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + optimizer.step() + if args.device == 'npu': + torch.npu.synchronize() + + # measure elapsed time + cost_time = time.time() - end + batch_time.update(cost_time) + end = time.time() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + if batch_time.avg: + print("[npu id:", args.gpu, "]", "batch_size:", args.world_size * args.batch_size, + 'Time: {:.3f}'.format(batch_time.avg), '* FPS@all {:.3f}'.format( + args.batch_size * args.world_size / batch_time.avg)) + + +def validate(val_loader, model, criterion, args, ngpus_per_node): + batch_time = AverageMeter('Time', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + i=0 + for X, y_true in val_loader: + if args.gpu is not None: + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + X = X.to(loc).to(torch.float) + else: + X = X.cuda(args.gpu, non_blocking=True) + if args.device == 'npu': + loc = 'npu:{}'.format(args.gpu) + y_true = y_true.to(torch.int32).to(loc, non_blocking=True) + else: + y_true = y_true.cuda(args.gpu, non_blocking=True) + + # compute output + output, _ = model(X) + loss = criterion(output, y_true) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, y_true, topk=(1, 5)) + losses.update(loss.item(), X.size(0)) + top1.update(acc1[0], X.size(0)) + top5.update(acc5[0], X.size(0)) + + # measure elapsed time + cost_time = time.time() - end + batch_time.update(cost_time) + end = time.time() + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + progress.display(i) + i=i+1 + + if i % args.print_freq == 0: + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + print("[gpu id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint/checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'checkpoint/model_best.pth.tar') + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self, name, fmt=':f', start_count_index=2): + self.name = name + self.fmt = fmt + self.reset() + self.start_count_index = start_count_index + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + if self.count == 0: + self.N = n + + self.val = val + self.count += n + if self.count > (self.start_count_index * self.N): + self.sum += val * n + self.avg = self.sum / (self.count - self.start_count_index * self.N) + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + + +def adjust_learning_rate(optimizer, epoch, args): + """Sets the learning rate to the initial LR decayed by cosine method""" + + if args.warm_up_epochs > 0 and epoch < args.warm_up_epochs: + lr = args.lr * ((epoch + 1) / (args.warm_up_epochs + 1)) + else: + alpha = 0 + cosine_decay = 0.5 * ( + 1 + np.cos(np.pi * (epoch - args.warm_up_epochs) / (args.epochs - args.warm_up_epochs))) + decayed = (1 - alpha) * cosine_decay + alpha + lr = args.lr * decayed + #if (epoch != 0) and epoch % args.en == 0: + # lr = args.lr * args.dr + # args.lr = lr + #else: + # lr = args.lr + + print("=> Epoch[%d] Setting lr: %.4f" % (epoch, lr)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() diff --git a/PyTorch/contrib/cv/classification/LeNet/models/__init__.py b/PyTorch/contrib/cv/classification/LeNet/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py b/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py new file mode 100644 index 0000000000..df92fdc6ea --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/models/lenet_5.py @@ -0,0 +1,36 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class LeNet5(nn.Module): + + def __init__(self, n_classes): + super(LeNet5, self).__init__() + + self.feature_extractor = nn.Sequential( + nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1), + # nn.Tanh(), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2), + nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1), + # nn.Tanh(), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2), + nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1), + # nn.Tanh() + nn.ReLU(), + ) + + self.classifier = nn.Sequential( + nn.Linear(in_features=120, out_features=84), + # nn.Tanh(), + nn.ReLU(), + nn.Linear(in_features=84, out_features=n_classes), + ) + + def forward(self, x): + x = self.feature_extractor(x) + x = torch.flatten(x, 1) + logits = self.classifier(x) + probs = F.softmax(logits, dim=1) + return logits, probs diff --git a/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt b/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt new file mode 100644 index 0000000000..31529da2e6 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:OK +PrecisionStatus:OK \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/requirements.txt b/PyTorch/contrib/cv/classification/LeNet/requirements.txt new file mode 100644 index 0000000000..00352a8956 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/requirements.txt @@ -0,0 +1,4 @@ +torch==1.5.0 +apex +torchvision==0.5.0 +numpy \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh b/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh new file mode 100644 index 0000000000..4740fafdcc --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/env_npu.sh @@ -0,0 +1,76 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64/:/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=${install_path}/nnae/latest/fwkacllib/lib64/:/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + +${install_path}/driver/tools/msnpureport -g error -d 0 +${install_path}/driver/tools/msnpureport -g error -d 1 +${install_path}/driver/tools/msnpureport -g error -d 2 +${install_path}/driver/tools/msnpureport -g error -d 3 +${install_path}/driver/tools/msnpureport -g error -d 4 +${install_path}/driver/tools/msnpureport -g error -d 5 +${install_path}/driver/tools/msnpureport -g error -d 6 +${install_path}/driver/tools/msnpureport -g error -d 7 + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=0 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +export HCCL_IF_IP=$(hostname -I |awk '{print $1}') + +ulimit -SHn 512000 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh new file mode 100644 index 0000000000..78313a496f --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_eval_8p.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size resume RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=2048 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=80 +# 学习率 +learning_rate=0.012 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --pth_path* ]];then + pth_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否传入 pth_path , 验证脚本需要传入此参数 +if [[ $pth_path == "" ]];then + echo "[Error] para \"pth_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +echo ${data_path} +nohup python3.7 ${cur_path}/main.py \ + "" \ + --evaluate \ + --resume ${pth_path} \ + --seed=49 \ + --learning-rate=${learning_rate} \ + --mom=0.9 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --dist-url='tcp://127.0.0.1:50023' \ + --dist-backend 'hccl' \ + --multiprocessing-distributed \ + --world-size=1 \ + --rank=0 \ + --device='npu' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +# 训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + +# 输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +# 训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + + +# 最后一个迭代loss值,不需要修改 +ActualLoss=`grep Test ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}.log | awk '{print $8}' | awk 'END {print}'` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh new file mode 100644 index 0000000000..2668bfa127 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_finetune_1p.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=80 +# 指定训练所使用的npu device卡id +device_id=0 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --pth_path* ]];then + pth_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否传入 pth_path , 验证脚本需要传入此参数 +if [[ $pth_path == "" ]];then + echo "[Error] para \"pth_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +echo ${pth_path} +nohup python3.7 ${cur_path}/main.py \ + "" \ + --learning-rate=0.012 \ + --mom=0.9 \ + --world-size=1 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --device='npu' \ + --gpu=${ASCEND_DEVICE_ID} \ + --dist-backend 'hccl' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --pretrained \ + --pth_path=${pth_path} \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh new file mode 100644 index 0000000000..1a36df7d69 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_full_1p.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=80 +# 指定训练所使用的npu device卡id +device_id=0 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${cur_path}/main.py \ + "" \ + --learning-rate=0.012 \ + --momentum=0.9 \ + --world-size=1 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --device='npu' \ + --gpu=${ASCEND_DEVICE_ID} \ + --dist-backend 'hccl' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh new file mode 100644 index 0000000000..413ac99fe4 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_full_8p.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=2048 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=80 +# 学习率 +learning_rate=0.012 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${cur_path}/main.py \ + "" \ + --learning-rate=${learning_rate} \ + --mom=0.9 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --dist-backend 'hccl' \ + --multiprocessing-distributed \ + --world-size=1 \ + --device='npu' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh new file mode 100644 index 0000000000..17b80a5222 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_1p.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=256 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=2 +# 指定训练所使用的npu device卡id +device_id=0 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python3.7 ${cur_path}/main.py \ + "" \ + --learning-rate=0.012 \ + --momentum=0.9 \ + --world-size=1 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --device='npu' \ + --gpu=${ASCEND_DEVICE_ID} \ + --dist-backend 'hccl' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh new file mode 100644 index 0000000000..34f1f91c26 --- /dev/null +++ b/PyTorch/contrib/cv/classification/LeNet/test/train_performance_8p.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="LeNet_5_for_PyTorch" +# 训练batch_size +batch_size=2048 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=2 +# 学习率 +learning_rate=0.012 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3.7 ./main.py \ + "" \ + --seed=49 \ + --learning-rate=${learning_rate} \ + --momentum=0.9 \ + --weight-decay=0.0005 \ + --print-freq=1 \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend 'hccl' \ + --multiprocessing-distributed \ + --world-size=1 \ + --rank=0 \ + --device='npu' \ + --epochs=${train_epochs} \ + --loss-scale=32 \ + --amp \ + --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From a258bebac27786ca714ed0b60ea010ec89414e92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9F=A9=E6=97=AD?= Date: Thu, 27 Oct 2022 08:23:52 +0000 Subject: [PATCH 2/2] update PyTorch/contrib/cv/classification/LeNet/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 韩旭 --- PyTorch/contrib/cv/classification/LeNet/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/PyTorch/contrib/cv/classification/LeNet/README.md b/PyTorch/contrib/cv/classification/LeNet/README.md index fa2ee7871e..e9c64b307a 100644 --- a/PyTorch/contrib/cv/classification/LeNet/README.md +++ b/PyTorch/contrib/cv/classification/LeNet/README.md @@ -85,6 +85,12 @@ LeNet_5模型选用的数据集为MNIST手写数据集,数据集将在模型 2. 运行训练脚本。 该模型支持单机单卡训练和单机8卡训练。 + - 建立储存模型的文件夹 + + ``` + mkdir checkpoint + + ``` - 单机单卡训练 -- Gitee