From eebc83fd5348f67c05bad9bca4367c5522411901 Mon Sep 17 00:00:00 2001 From: Ryan Date: Wed, 25 May 2022 17:48:03 +0800 Subject: [PATCH 1/2] =?UTF-8?q?GAN=5FPytorch=201.8=E8=BF=81=E7=A7=BB?= =?UTF-8?q?=E9=A6=96=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../contrib/cv/others/GAN_Pytorch/README.md | 19 +-- PyTorch/contrib/cv/others/GAN_Pytorch/main.py | 67 ++++----- .../cv/others/GAN_Pytorch/test/env_npu.sh | 25 ++-- .../others/GAN_Pytorch/test/train_eval_8p.sh | 84 +++++++++++- .../GAN_Pytorch/test/train_finetune_1p.sh | 86 +++++++++++- .../others/GAN_Pytorch/test/train_full_1p.sh | 128 +++++++++++++++++- .../others/GAN_Pytorch/test/train_full_8p.sh | 127 ++++++++++++++++- .../GAN_Pytorch/test/train_performance_1p.sh | 126 ++++++++++++++++- .../GAN_Pytorch/test/train_performance_8p.sh | 125 ++++++++++++++++- 9 files changed, 696 insertions(+), 91 deletions(-) diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/README.md b/PyTorch/contrib/cv/others/GAN_Pytorch/README.md index 96151c3e21..eb86e0de01 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/README.md +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/README.md @@ -11,27 +11,27 @@ url=https://github.com/eriklindernoren/PyTorch-GAN/blob/master/implementations/g - Install PyTorch ([pytorch.org](http://pytorch.org)) - `pip install -r requirements.txt` -- The MNIST Dataset can be downloaded from the links below.Move the datasets to directory ./data . +- The MNIST Dataset can be downloaded from the links below. 
- Train Set : [Download Mnist](https://wwr.lanzoui.com/iSBOeu43dkf) ## Training # -To train a model, change the working directory to `./test`,then run: +To train a model, run the following commands from the model root directory (the one containing `main.py`): ```bash # 1p train perf -bash train_performance_1p.sh +bash ./test/train_performance_1p.sh --data_path=data/mnist # 8p train perf -bash train_performance_8p.sh +bash ./test/train_performance_8p.sh --data_path=data/mnist # 8p train full -bash train_full_8p.sh +bash ./test/train_full_8p.sh --data_path=data/mnist # 8p eval -bash train_eval_8p.sh +bash ./test/train_eval_8p.sh --data_path=data/mnist # finetuning -bash train_finetune_1p.sh +bash ./test/train_finetune_1p.sh --data_path=data/mnist ``` After running,you can see the results in `./output` @@ -39,8 +39,9 @@ After running,you can see the results in `./output` | Acc@1 | FPS | Npu_nums | Epochs | AMP_Type | | :------: | :------: | :------: | :------: | :------: | -| - | 997 | 1 | 200 | O1 | -| - | 11795 | 8 | 200 | O1 | +| - | 515.439 | 1 | 200 | O1 | +| - | 15275.049 | 8 | 200 | O1 | + diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/main.py b/PyTorch/contrib/cv/others/GAN_Pytorch/main.py index 87b96cdccb..e39c6099d3 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/main.py +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/main.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ - +import torch_npu import argparse import os import sys @@ -62,7 +62,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar fake = Variable(Tensor(imgs.size(0), 1).fill_(0.0), requires_grad=False) # Configure input - real_imgs = Variable(imgs.type(Tensor)).to(device) + real_imgs = Variable(imgs.type(torch.Tensor)).to(device) # ----------------- # Train Generator @@ -106,7 +106,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar d_loss.backward() optimizer_D.step() batch_time.update(time.time() - start_time) - if args.n_epochs == 1: + if args.n_epochs == 1 and args.is_master_node: print( "[Epoch %d] [step %d] [D loss: %f] [G loss: %f]" % (epoch, i, D_loss.avg, G_loss.avg) @@ -117,7 +117,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar if args.is_master_node: print( "[Epoch %d] [D loss: %f] [G loss: %f] FPS:%.3f" - % (epoch, D_loss.avg,G_loss.avg,args.batch_size*args.gpus/batch_time.avg) + % (epoch, D_loss.avg, G_loss.avg, args.batch_size * args.gpus / batch_time.avg) ) LOSS_G.append(G_loss.avg) LOSS_D.append(D_loss.avg) @@ -135,43 +135,30 @@ def main(args): if amp is None: raise RuntimeError("Failed to import apex. 
Please install apex from https://www.github.com/nvidia/apex " "to enable mixed-precision training.") - # if args.output_dir: - # os.mkdir(args.output_dir) - + + device = torch.device(f'npu:{args.local_rank}') # npu + torch.npu.set_device(f'npu:{args.local_rank}') + print('device_id=', args.local_rank) if args.distributed: + torch.distributed.init_process_group(backend='hccl', world_size=args.gpus, rank=args.local_rank) - mp.spawn(main_worker, nprocs=args.gpus, - args=(args,)) - else: - main_worker(args.gpus, args) + args.is_master_node = not args.distributed or args.local_rank == 0 -def main_worker(nprocs, args): - local_rank = 0 - if args.distributed: - torch.distributed.init_process_group(backend="hccl", - init_method='env://', - world_size=args.nodes * args.gpus, - rank=nprocs) - local_rank = torch.distributed.get_rank() - args.is_master_node = not args.distributed or local_rank == 0 if args.is_master_node: print(args) - args.device_id = args.device_id + local_rank - print('device_id=', args.device_id) - device = torch.device(f'npu:{args.device_id}') # npu - torch.npu.set_device(device) # for npu - print("Downloading dataset...") + print("Preparing dataset...") + # Configure data loader - os.makedirs("../data/mnist", exist_ok=True) train_dataset = datasets.MNIST( - "../../data/mnist", + args.data_path, train=True, download=True, transform=transforms.Compose( [transforms.Resize(args.img_size), transforms.ToTensor(), transforms.Normalize([0.5], [0.5])] )) - - print("Creating dataloader") + + if args.is_master_node: + print("Creating dataloader") if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( @@ -185,12 +172,11 @@ def main_worker(nprocs, args): if args.is_master_node: print("Creating model") - # create model + Tensor = torch.npu.FloatTensor LOSS_G=[] LOSS_D=[] - os.makedirs("../output", exist_ok=True) - os.chdir("../output") + generator = Generator() discriminator = Discriminator() if args.pretrained: @@ -233,10 +219,11 @@ def 
main_worker(nprocs, args): opt_level='O1', loss_scale=128,combine_grad=True) if args.distributed: - generator = DDP(generator, device_ids=[local_rank], broadcast_buffers=False) - discriminator = DDP(discriminator, device_ids=[local_rank], broadcast_buffers=False) + generator = DDP(generator, device_ids=[args.local_rank], broadcast_buffers=False) + discriminator = DDP(discriminator, device_ids=[args.local_rank], broadcast_buffers=False) if args.test_only : + os.makedirs("test_images",exist_ok=True) Tensor = torch.npu.FloatTensor generator = Generator().npu() checkpoint = torch.load(r'./checkpoint.pth.tar', map_location='cpu') @@ -268,7 +255,7 @@ def main_worker(nprocs, args): # Generate a batch of images gen_imgs = generator(z) - save_image(gen_imgs.data[:25], "image/%d.png" % i, nrow=5, normalize=True) + save_image(gen_imgs.data[:25], "test_images/image/%d.png" % i, nrow=5, normalize=True) print("Generate done!") return @@ -357,9 +344,9 @@ def parse_args(): parser.add_argument("--latent_dim", type=int, default=100, help="dimensionality of the latent space") parser.add_argument("--img_size", type=int, default=28, help="size of each image dimension") parser.add_argument("--channels", type=int, default=1, help="number of image channels") - parser.add_argument("--gpus", type=int, default=8, help="num of gpus of per node") + parser.add_argument("--gpus", type=int, default=1, help="num of gpus of per node") parser.add_argument("--nodes", type=int, default=1) - parser.add_argument('--device_id', default=0, type=int, help='device id') + parser.add_argument('--local_rank', default=0, type=int, help='device id') parser.add_argument("--test_only", type=int, default=None, help="only generate images") parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)') @@ -368,6 +355,9 @@ def parse_args(): parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') + # 数据集path + 
parser.add_argument('--data_path', default='../data/mnist', + help='the path of the dataset') parser.add_argument('--distributed', action='store_true', help='Use multi-processing distributed training to launch ' 'N processes per node, which has N GPUs. This is the ' @@ -381,6 +371,9 @@ def parse_args(): parser.add_argument('--apex', default=False, action='store_true', help='use apex to train the model') args = parser.parse_args() + + args.gpus = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + return args if __name__ == '__main__': args = parse_args() diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh index 4740fafdcc..1e746bffeb 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh @@ -34,23 +34,24 @@ ${install_path}/driver/tools/msnpureport -g error -d 5 ${install_path}/driver/tools/msnpureport -g error -d 6 ${install_path}/driver/tools/msnpureport -g error -d 7 -#将Host日志输出到串口,0-关闭/1-开启 +#Print Host logs to stdout, 0-disable/1-enable export ASCEND_SLOG_PRINT_TO_STDOUT=0 -#设置默认日志级别,0-debug/1-info/2-warning/3-error -export ASCEND_GLOBAL_LOG_LEVEL=3 -#设置Event日志开启标志,0-关闭/1-开启 +#Set the default log level, 0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#Enable the Event log flag, 0-disable/1-enable export ASCEND_GLOBAL_EVENT_ENABLE=0 -#设置是否开启taskque,0-关闭/1-开启 -export TASK_QUEUE_ENABLE=1 -#设置是否开启PTCopy,0-关闭/1-开启 +#Enable taskque, 0-disable/1-enable +export TASK_QUEUE_ENABLE=0 +#Enable PTCopy, 0-disable/1-enable export PTCOPY_ENABLE=1 -#设置是否开启combined标志,0-关闭/1-开启 -export COMBINED_ENABLE=0 -#设置特殊场景是否需要重新编译,不需要修改 +#Enable the combined flag for 2 non-contiguous tensors, 0-disable/1-enable +export COMBINED_ENABLE=1 +#Whether special scenarios need recompilation; no need to modify export DYNAMIC_OP="ADD#MUL" -#HCCL白名单开关,1-关闭/0-开启 +# HCCL whitelist switch, 1-disable/0-enable export HCCL_WHITELIST_DISABLE=1 -export HCCL_IF_IP=$(hostname -I |awk '{print $1}') +# The HCCL default timeout of 120s is too short; raise it to 1800s to match the PyTorch default +export HCCL_CONNECT_TIMEOUT=1800 ulimit -SHn 512000 diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh 
b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh index 744956a7d1..3c9e578f43 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh @@ -1,14 +1,88 @@ #!/bin/bash -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. -nohup python3 ${currentDir}/main.py \ - --gpus 8\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +#export RANK_SIZE=8 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=64 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \ --distributed \ --lr 0.0008 \ --batch_size 128 \ --n_epochs 200 \ --workers 0 \ --apex \ - 
--device_id 0 \ - --test_only 1 & + --test_only 1 \ + --data_path ${data_path} > ${cur_path}/output/train_eval_8p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + + diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh index 58b97a99a6..c2e690aa7d 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh @@ -1,14 +1,88 @@ #!/bin/bash - -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. -nohup python3 ${currentDir}/main.py \ - --gpus 1\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +#export RANK_SIZE=8 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=64 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep 
etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3 -u ${currentDir}/main.py \ --lr 0.0002 \ --batch_size 64 \ --n_epochs 100 \ --workers 0 \ --apex \ - --device_id 0 \ - --pretrained & + --local_rank 0 \ + --pretrained \ + --data_path ${data_path} > ${cur_path}/output/train_tune_1p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + + diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh index eb288785d0..1fd471a4e0 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh @@ -1,12 +1,128 @@ #!/bin/bash -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. -nohup python3 ${currentDir}/main.py \ - --gpus 1\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d 
${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3 -u ${currentDir}/main.py \ --lr 0.0002 \ - --batch_size 64 \ + --batch_size ${batch_size} \ --n_epochs 200 \ - --workers 0 \ + --workers 16 \ --apex \ - --device_id 0 & + --local_rank 6 \ + --data_path ${data_path} > ${cur_path}/output/train_full_1p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + +#最后一个迭代FPS值 +FPS=`grep -a 'FPS:' ${cur_path}/output/train_full_1p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'` + +#最后一个迭代loss值 +loss=`grep -a 'D loss:' ${cur_path}/output/train_full_1p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'` + +#打印,不需要修改 +echo "ActualFPS : $FPS" +echo "ActualLoss : ${loss}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -a 'D loss:' ${cur_path}/output/train_full_1p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +grep -a 'FPS:' ${cur_path}/output/train_full_1p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = 
${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh index 8c42ea3fa3..ad9102476e 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh @@ -1,13 +1,128 @@ #!/bin/bash -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. 
-nohup python3 ${currentDir}/main.py \ - --gpus 8\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \ --distributed \ --lr 0.0008 \ - --batch_size 128 \ + --batch_size ${batch_size} \ --n_epochs 200 \ - --workers 0 \ + --workers 16 \ --apex \ - --device_id 0 & + --data_path ${data_path} > ${cur_path}/output/train_full_8p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + +#最后一个迭代FPS值 +FPS=`grep -a 'FPS:' 
${cur_path}/output/train_full_8p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'` + +#最后一个迭代loss值 +loss=`grep -a 'D loss:' ${cur_path}/output/train_full_8p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'` + +#打印,不需要修改 +echo "ActualFPS : $FPS" +echo "ActualLoss : ${loss}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -a 'D loss:' ${cur_path}/output/train_full_8p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +grep -a 'FPS:' ${cur_path}/output/train_full_8p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + diff --git 
a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh index 7794753d3d..da5a3e61a8 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh @@ -1,12 +1,128 @@ #!/bin/bash -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. -nohup python3 ${currentDir}/main.py \ - --gpus 1\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=1 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=64 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3 -u ${currentDir}/main.py \ --lr 0.0002 \ - --batch_size 64 \ + --batch_size 
${batch_size} \ --n_epochs 1 \ --workers 16 \ --apex \ - --device_id 0 & + --local_rank 0 \ + --data_path ${data_path} > ${cur_path}/output/train_perf_1p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + +#最后一个迭代FPS值 +FPS=`grep -a 'FPS:' ${cur_path}/output/train_perf_1p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'` + +#最后一个迭代loss值 +loss=`grep -a 'D loss:' ${cur_path}/output/train_perf_1p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'` + +#打印,不需要修改 +echo "ActualFPS : $FPS" +echo "ActualLoss : ${loss}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -a 'D loss:' ${cur_path}/output/train_perf_1p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +grep -a 'FPS:' ${cur_path}/output/train_perf_1p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = 
${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh index 092fdb26e7..eaca822eda 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh @@ -1,13 +1,128 @@ #!/bin/bash -source env_npu.sh currentDir=$(cd "$(dirname "$0")";pwd)/.. -nohup python3 ${currentDir}/main.py \ - --gpus 8\ +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=8 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#网络名称,同目录名称,需要模型审视修改 +Network="GAN" + +#训练batch_size,,需要模型审视修改 +batch_size=128 + + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7 + echo "device id is ${ASCEND_DEVICE_ID}" +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) +echo "start_time: ${start_time}" + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#创建DeviceID输出目录,不需要修改 +if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt +fi + 
+#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${cur_path}/test/env_npu.sh +fi + +#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 +python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \ --distributed \ --lr 0.0008 \ - --batch_size 128 \ + --batch_size ${batch_size} \ --n_epochs 1 \ --workers 16 \ --apex \ - --device_id 0 & + --data_path ${data_path} > ${cur_path}/output/train_perf_8p.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +echo "end_time: ${end_time}" +e2e_time=$(( $end_time - $start_time )) + +#最后一个迭代FPS值 +FPS=`grep -a 'FPS:' ${cur_path}/output/train_perf_8p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'` + +#最后一个迭代loss值 +loss=`grep -a 'D loss:' ${cur_path}/output/train_perf_8p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'` + +#打印,不需要修改 +echo "ActualFPS : $FPS" +echo "ActualLoss : ${loss}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 +grep -a 'D loss:' ${cur_path}/output/train_perf_8p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +grep -a 'FPS:' ${cur_path}/output/train_perf_8p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> 
$cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log + + + + + + -- Gitee From a576ce37eac7c7d769d6ee8736c4dc44bf017a73 Mon Sep 17 00:00:00 2001 From: Ryan Date: Wed, 25 May 2022 18:43:11 +0800 Subject: [PATCH 2/2] update --- .../contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh | 2 +- .../contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh index da5a3e61a8..d78646b7f5 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh @@ -71,7 +71,7 @@ fi python3 -u ${currentDir}/main.py \ --lr 0.0002 \ --batch_size ${batch_size} \ - --n_epochs 1 \ + --n_epochs 3 \ --workers 16 \ --apex \ --local_rank 0 \ diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh index eaca822eda..058db683ad 100644 --- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh @@ -72,7 +72,7 @@ python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.p --distributed \ --lr 0.0008 \ --batch_size 
${batch_size} \ - --n_epochs 1 \ + --n_epochs 3 \ --workers 16 \ --apex \ --data_path ${data_path} > ${cur_path}/output/train_perf_8p.log 2>&1 & -- Gitee