#!/bin/bash
#
# 8-device (8p) performance-test launcher for SRCNN on Ascend NPU.
# Spawns one train_8p.py process per device (DDP over HCCL), waits for all,
# then parses the device logs for FPS and writes a performance summary.

cur_path=`pwd`/../
path=`pwd`
# Print failing cases to screen only (suppress verbose Ascend logging).
export ASCEND_GLOBAL_LOG_LEVEL=3

export HDF5_DISABLE_VERSION_CHECK=1

export PATH=/usr/local/hdf5/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/hdf5/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/hdf5/lib:$LIBRARY_PATH
export CPATH=/usr/local/hdf5/include:$CPATH

# ---- Base parameters (review/adjust per model) ----
# Batch size
batch_size=160
# Network name (same as the model directory name)
Network="SRCNN_ID1770_for_PyTorch"
# Number of devices (8p)
RankSize=8
RANK_SIZE=8
# Training epochs
train_epochs=10
# Training steps (unused; kept for template compatibility)
train_steps=
# Learning rate
learning_rate=1e-3

num_workers=192

# ---- Script parameters ----
data_path=""
PREC="--apex --apex_opt_level O2"

if [[ $1 == --help || $1 == --h ]];then
    # FIX: usage previously said train_performance_1p.sh — this is the 8p script.
    echo "usage:./train_performance_8p.sh "
    exit 1
fi

for para in $*
do
    if [[ $para == --precision_mode* ]];then
        apex_opt_level=`echo ${para#*=}`
        if [[ $apex_opt_level != "O1" ]] && [[ $apex_opt_level != "O2" ]] && [[ $apex_opt_level != "O3" ]]; then
            echo "[ERROR] para \"precision_mode\" must be config O1 or O2 or O3"
            exit 1
        fi
        # FIX: train_8p.py declares --apex_opt_level (underscores). The previous
        # "--apex-opt-level" spelling is not recognized by argparse (option
        # strings do not interchange '-' and '_'), so --precision_mode runs crashed.
        PREC="--apex --apex_opt_level "$apex_opt_level
    elif [[ $para == --data_path* ]];then
        data_path=`echo ${para#*=}`
    fi
done

if [[ $data_path == "" ]];then
    echo "[Error] para \"data_path\" must be config"
    exit 1
fi

############## run training ##########
cd $cur_path
if [ -d $cur_path/test/output ];then
    rm -rf $cur_path/test/output/*
    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
else
    mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
fi
wait

export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID

start=$(date +%s)
export MASTER_ADDR=localhost
export MASTER_PORT=29688
export HCCL_WHITELIST_DISABLE=1

NPUS=($(seq 0 7))
export NPU_WORLD_SIZE=${#NPUS[@]}
rank=0
for i in ${NPUS[@]}
do
    # FIX: was "mkdir -p $cur_path/output/${i}/", but the redirect below writes
    # to $cur_path/test/output/${i}/, which did not exist for any device other
    # than the initial $ASCEND_DEVICE_ID — those 7 redirects failed.
    mkdir -p $cur_path/test/output/${i}/
    export NPU_CALCULATE_DEVICE=${i}
    export RANK=${rank}
    export ASCEND_DEVICE_ID=${i}
    echo run process ${rank}
    python3 train_8p.py $PREC --ddp --train-file "$data_path/SRCNN/91-image_x2.h5" \
          --eval-file "$data_path/SRCNN/Set5_x2.h5" \
          --outputs-dir "outputs" \
          --scale 3 \
          --lr $learning_rate \
          --batch-size $batch_size \
          --num-epochs $train_epochs \
          --num-workers $num_workers \
          --seed 123 > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${i}.log 2>&1 &
    let rank++
done
wait
end=$(date +%s)
e2e_time=$(( $end - $start ))
# Normalize carriage returns before parsing.
# NOTE: ASCEND_DEVICE_ID is 7 here (its last loop value), so all
# post-processing below reads device 7's log.
sed -i "s|\r|\n|g" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log

# ---- Result reporting ----
echo "------------------ Final result ------------------"
# Average per-device FPS taken from the log, scaled by 8 devices.
FPS=`grep "FPS" $path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F ", loss" '{print$1}' | awk '{print$NF}' |awk '{sum+=$1} END {print"",sum*8/NR}'`
echo "Final Performance images/sec : $FPS"

# Training accuracy (not reported for the perf case).
#train_accuracy=`grep "Acc" $path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F ',' '{print $1}'|tail -1|awk '{print $NF}'`
#echo "Final Train Accuracy : ${train_accuracy}"
echo "E2E Training Duration sec : $e2e_time"

# ---- Performance watchdog summary ----
BatchSize=${batch_size}
DeviceType=`uname -m`
CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'

ActualFPS=${FPS}
#ActualAccuracy=${train_accuracy}
# Time per training iteration in milliseconds.
TrainingTime=`echo "${BatchSize} ${FPS}"|awk '{printf("%.4f\n", $1*1000/$2)}'`

# Extract the per-iteration loss column into train_${CaseName}_loss.txt.
grep "FPS" $path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$NF}' > $path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt

# Loss of the last iteration.
ActualLoss=`awk 'END {print}' $path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`

# Key results written to ${CaseName}.log.
echo "Network = ${Network}" > $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "RankSize = ${RankSize}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "BatchSize = ${BatchSize}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "DeviceType = ${DeviceType}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "CaseName = ${CaseName}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "ActualFPS = ${ActualFPS}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "TrainingTime = ${TrainingTime}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "ActualLoss = ${ActualLoss}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "E2ETrainingTime = ${e2e_time}" >> $path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ============================================================================
#
"""Train SRCNN for image super-resolution on Ascend NPU devices.

Supports single-device and 8-device distributed training (DDP over the HCCL
backend, with ``--ddp``) and optional Apex mixed precision (``--apex``).
Prints per-iteration FPS and running loss, evaluates PSNR after every epoch,
and keeps the weights of the best-PSNR epoch as ``best.pth``.
"""
import argparse
import os
import copy

import torch
from torch import nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
import time
from models import SRCNN
from datasets import TrainDataset, EvalDataset
from utils import AverageMeter, calc_psnr
import torch.npu
import os

# print("1111111111")
# import torch
# torch.npu.global_step_inc()
# print("2222222222")

# Enable mixed precision: Apex is optional; `amp` stays None when unavailable.
try:
    from apex import amp
except ImportError:
    amp = None
import apex
# Enable mixed precision

# Select the NPU device from the launcher-exported NPU_CALCULATE_DEVICE
# environment variable (defaults to device 0 when unset or non-numeric).
NPU_CALCULATE_DEVICE = 0
if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')


if __name__ == '__main__':

    # Enable fuzzy compilation (disabled / kept for reference)
    # print("ttttttttttttttttttt")
    # import torch
    # torch.npu.global_step_inc()
    # print("zzzzzzzzzzzzzzzz")
    # Enable fuzzy compilation

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', type=str, required=True)
    parser.add_argument('--eval-file', type=str, required=True)
    parser.add_argument('--outputs-dir', type=str, required=True)
    parser.add_argument('--scale', type=int, default=3)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--num-epochs', type=int, default=400)
    parser.add_argument('--num-workers', type=int, default=128)
    parser.add_argument('--seed', type=int, default=123)
    parser.add_argument('--ddp', action='store_true', help='default 1p')

    # Enable mixed precision
    parser.add_argument('--apex', action='store_true',
                        help='Use apex for mixed precision training')
    parser.add_argument('--apex_opt_level', default='O1', type=str,
                        help='For apex mixed precision training'
                             'O0 for FP32 training, O1 for mixed precision training.'
                             'For further detail, see https://github.com/NVIDIA/apex/tree/master/examples/imagenet')
    parser.add_argument('--loss_scale_value', default=1024., type=float,
                        help='loss scale using in amp, default -1 means dynamic')
    # Enable mixed precision
    args = parser.parse_args()

    if args.ddp:
        # Re-resolve the device and join the HCCL process group. RANK and
        # NPU_WORLD_SIZE are exported per process by the 8p launcher script.
        NPU_CALCULATE_DEVICE = 0
        if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
            NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
        if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
            torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
        NPU_WORLD_SIZE = int(os.getenv('NPU_WORLD_SIZE'))
        RANK = int(os.getenv('RANK'))
        torch.distributed.init_process_group('hccl', rank=RANK, world_size=NPU_WORLD_SIZE)

    # Checkpoints go under <outputs-dir>/x<scale>/.
    args.outputs_dir = os.path.join(args.outputs_dir, 'x{}'.format(args.scale))

    if not os.path.exists(args.outputs_dir):
        os.makedirs(args.outputs_dir)

    cudnn.benchmark = True
    device = torch.device(f'npu:{NPU_CALCULATE_DEVICE}')

    torch.manual_seed(args.seed)

    model = SRCNN().to(f'npu:{NPU_CALCULATE_DEVICE}')
    criterion = nn.MSELoss()
    # Native PyTorch optimizer interface (kept for reference):
    # optimizer = optim.Adam([
    #     {'params': model.conv1.parameters()},
    #     {'params': model.conv2.parameters()},
    #     {'params': model.conv3.parameters(), 'lr': args.lr * 0.1}
    # ], lr=args.lr)

    # NPU-affinity fused optimizer, without merging the param groups
    # (kept for reference):
    # optimizer = apex.optimizers.NpuFusedAdam([
    #     {'params': model.conv1.parameters()},
    #     {'params': model.conv2.parameters()},
    #     {'params': model.conv3.parameters(), 'lr': args.lr * 0.1}
    # ], lr=args.lr)

    # NPU-affinity fused optimizer; param groups 1 and 2 are merged into one
    # (conv3 keeps its reduced 0.1x learning rate in its own group).
    optimizer = apex.optimizers.NpuFusedAdam([
        {'params': list(model.conv1.parameters()) + list(model.conv2.parameters())},
        {'params': model.conv3.parameters(), 'lr': args.lr * 0.1}
    ], lr=args.lr)




    if args.apex:
        # Apex AMP must wrap model/optimizer BEFORE the DDP wrap below.
        # print("args.apex=============================", args.apex)
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.apex_opt_level,
                                          loss_scale=args.loss_scale_value,
                                          combine_grad=True)
    # Enable mixed precision
    if args.ddp:
        model = model.to(f'npu:{NPU_CALCULATE_DEVICE}')
        if not isinstance(model, torch.nn.parallel.DistributedDataParallel):
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[NPU_CALCULATE_DEVICE],
                                                              broadcast_buffers=False)

    train_dataset = TrainDataset(args.train_file)
    eval_dataset = EvalDataset(args.eval_file)
    if args.ddp:
        # DistributedSampler shards the dataset across ranks; shuffle must be
        # False because the sampler handles (epoch-seeded) shuffling itself.
        train_dataloader_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        train_dataloader_batch_size = args.batch_size
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=train_dataloader_batch_size,
                                      shuffle=False,
                                      num_workers=args.num_workers,
                                      pin_memory=True,
                                      drop_last=True, sampler=train_dataloader_sampler)
        eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=1)
    else:
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=args.num_workers,
                                      pin_memory=True,
                                      drop_last=True)
        eval_dataloader = DataLoader(dataset=eval_dataset, batch_size=1)

    best_weights = copy.deepcopy(model.state_dict())
    best_epoch = 0
    best_psnr = 0.0

    for epoch in range(args.num_epochs):
        if args.ddp:
            # Re-seed the sampler so each epoch uses a different shard order.
            train_dataloader.sampler.set_epoch(epoch)
        model.train()
        epoch_losses = AverageMeter()

        # with tqdm(total=(len(train_dataset) - len(train_dataset) % args.batch_size)) as t:
        #     t.set_description('epoch: {}/{}'.format(epoch, args.num_epochs - 1))

        # iter = 0
        start_time = time.time()
        # print("train_dataloader length============================", len(train_dataloader))


        for i, (inputs, labels) in enumerate(train_dataloader):

            inputs = inputs.to(f'npu:{NPU_CALCULATE_DEVICE}', non_blocking=True)
            labels = labels.to(f'npu:{NPU_CALCULATE_DEVICE}', non_blocking=True)

            preds = model(inputs)
            loss = criterion(preds, labels)
            epoch_losses.update(loss.item(), len(inputs))
            optimizer.zero_grad()

            # Enable mixed precision: scale the loss before backward when
            # apex is active, otherwise plain backward.
            # loss.backward()
            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # Enable mixed precision
            optimizer.step()
            # Per-iteration throughput for this device; the launcher script
            # averages these and scales by the device count.
            FPS = args.batch_size/(time.time()-start_time)
            start_time = time.time()
            print("Epoch: {}, iter: {}, FPS: {:.4f}, loss: {:.6f}".format(epoch, i, FPS, epoch_losses.avg))


        torch.save(model.state_dict(), os.path.join(args.outputs_dir, 'epoch_{}.pth'.format(epoch)))

        # Per-epoch evaluation: PSNR on the eval set, one image at a time.
        model.eval()
        epoch_psnr = AverageMeter()

        for data in eval_dataloader:
            inputs, labels = data

            inputs = inputs.to(f'npu:{NPU_CALCULATE_DEVICE}')
            labels = labels.to(f'npu:{NPU_CALCULATE_DEVICE}')

            with torch.no_grad():
                preds = model(inputs).clamp(0.0, 1.0)

            epoch_psnr.update(calc_psnr(preds, labels), len(inputs))

        print('eval psnr: {:.2f}'.format(epoch_psnr.avg))

        # Track the best-PSNR epoch's weights for the final best.pth dump.
        if epoch_psnr.avg > best_psnr:
            best_epoch = epoch
            best_psnr = epoch_psnr.avg
            best_weights = copy.deepcopy(model.state_dict())

    print('best epoch: {}, psnr: {:.2f}'.format(best_epoch, best_psnr))
    torch.save(best_weights, os.path.join(args.outputs_dir, 'best.pth'))