From eebc83fd5348f67c05bad9bca4367c5522411901 Mon Sep 17 00:00:00 2001
From: Ryan <wangzhiwei73@huawei.com>
Date: Wed, 25 May 2022 17:48:03 +0800
Subject: [PATCH 1/2] =?UTF-8?q?GAN=5FPytorch=201.8=E8=BF=81=E7=A7=BB?=
 =?UTF-8?q?=E9=A6=96=E6=AC=A1=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../contrib/cv/others/GAN_Pytorch/README.md   |  19 +--
 PyTorch/contrib/cv/others/GAN_Pytorch/main.py |  67 ++++-----
 .../cv/others/GAN_Pytorch/test/env_npu.sh     |  25 ++--
 .../others/GAN_Pytorch/test/train_eval_8p.sh  |  84 +++++++++++-
 .../GAN_Pytorch/test/train_finetune_1p.sh     |  86 +++++++++++-
 .../others/GAN_Pytorch/test/train_full_1p.sh  | 128 +++++++++++++++++-
 .../others/GAN_Pytorch/test/train_full_8p.sh  | 127 ++++++++++++++++-
 .../GAN_Pytorch/test/train_performance_1p.sh  | 126 ++++++++++++++++-
 .../GAN_Pytorch/test/train_performance_8p.sh  | 125 ++++++++++++++++-
 9 files changed, 696 insertions(+), 91 deletions(-)

diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/README.md b/PyTorch/contrib/cv/others/GAN_Pytorch/README.md
index 96151c3e21..eb86e0de01 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/README.md
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/README.md
@@ -11,27 +11,27 @@ url=https://github.com/eriklindernoren/PyTorch-GAN/blob/master/implementations/g
 
 - Install PyTorch ([pytorch.org](http://pytorch.org))
 - `pip install -r requirements.txt`
-- The MNIST Dataset can be downloaded from the links below.Move the datasets to directory ./data .
+- The MNIST Dataset can be downloaded from the links below.
     - Train Set : [Download Mnist](https://wwr.lanzoui.com/iSBOeu43dkf)
 
 ## Training # 
-To train a model, change the working directory to `./test`,then run: 
+To train a model, run the scripts from the model root directory (the one containing `main.py` and `test/`):
 
 ```bash
 # 1p train perf
-bash train_performance_1p.sh
+bash test/train_performance_1p.sh --data_path=data/mnist
 
 # 8p train perf
-bash train_performance_8p.sh
+bash test/train_performance_8p.sh --data_path=data/mnist
 
 # 8p train full
-bash train_full_8p.sh
+bash test/train_full_8p.sh --data_path=data/mnist
 
 # 8p eval
-bash train_eval_8p.sh
+bash test/train_eval_8p.sh --data_path=data/mnist
 
 # finetuning
-bash train_finetune_1p.sh
+bash test/train_finetune_1p.sh --data_path=data/mnist
 ```
 After running,you can see the results in `./output`
 
@@ -39,8 +39,9 @@ After running,you can see the results in `./output`
 
 | Acc@1    | FPS       | Npu_nums | Epochs   | AMP_Type |
 | :------: | :------:  | :------: | :------: | :------: |
-| -        | 997      | 1        | 200      | O1       |
-| -     | 11795     | 8        | 200      | O1       |
+| -        | 515.439      | 1        | 200      | O1       |
+| -     | 15275.049     | 8        | 200      | O1       |
+
 
 
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/main.py b/PyTorch/contrib/cv/others/GAN_Pytorch/main.py
index 87b96cdccb..e39c6099d3 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/main.py
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/main.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-
+import torch_npu
 import argparse
 import os
 import sys
@@ -62,7 +62,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar
         fake = Variable(Tensor(imgs.size(0), 1).fill_(0.0), requires_grad=False)
 
         # Configure input
-        real_imgs = Variable(imgs.type(Tensor)).to(device)
+        real_imgs = Variable(imgs.type(torch.Tensor)).to(device)
 
         # -----------------
         #  Train Generator
@@ -106,7 +106,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar
             d_loss.backward()
         optimizer_D.step()
         batch_time.update(time.time() - start_time)
-        if args.n_epochs == 1:
+        if args.n_epochs == 1 and args.is_master_node:
             print(
                 "[Epoch %d] [step %d] [D loss: %f] [G loss: %f]"
                 % (epoch, i, D_loss.avg, G_loss.avg)
@@ -117,7 +117,7 @@ def train_one_epoch(generator, discriminator, optimizer_G, optimizer_D, adversar
     if args.is_master_node:
         print(
             "[Epoch %d] [D loss: %f] [G loss: %f] FPS:%.3f"
-            % (epoch, D_loss.avg,G_loss.avg,args.batch_size*args.gpus/batch_time.avg)
+            % (epoch, D_loss.avg, G_loss.avg, args.batch_size * args.gpus / batch_time.avg)
         )
     LOSS_G.append(G_loss.avg)
     LOSS_D.append(D_loss.avg)
@@ -135,43 +135,30 @@ def main(args):
         if amp is None:
             raise RuntimeError("Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
                                "to enable mixed-precision training.")
-    # if args.output_dir:
-    #     os.mkdir(args.output_dir)
-
+    
+    device = torch.device(f'npu:{args.local_rank}')  # npu
+    torch.npu.set_device(f'npu:{args.local_rank}')
+    print('device_id=', args.local_rank)
     if args.distributed:
+        torch.distributed.init_process_group(backend='hccl', world_size=args.gpus, rank=args.local_rank)
 
-        mp.spawn(main_worker, nprocs=args.gpus,
-                 args=(args,))
-    else:
-        main_worker(args.gpus, args)
+    args.is_master_node = not args.distributed or args.local_rank == 0
 
-def main_worker(nprocs, args):
-    local_rank = 0
-    if args.distributed:
-        torch.distributed.init_process_group(backend="hccl",
-                                             init_method='env://',
-                                             world_size=args.nodes * args.gpus,
-                                             rank=nprocs)
-        local_rank = torch.distributed.get_rank()
-    args.is_master_node = not args.distributed or local_rank == 0
     if args.is_master_node:
         print(args)
-    args.device_id = args.device_id + local_rank
-    print('device_id=', args.device_id)
-    device = torch.device(f'npu:{args.device_id}')  # npu
-    torch.npu.set_device(device)  # for npu
-    print("Downloading dataset...")
+        print("Preparing dataset...")
+
     # Configure data loader
-    os.makedirs("../data/mnist", exist_ok=True)
     train_dataset = datasets.MNIST(
-        "../../data/mnist",
+        args.data_path,
         train=True,
         download=True,
         transform=transforms.Compose(
             [transforms.Resize(args.img_size), transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
         ))
-
-    print("Creating dataloader")
+    
+    if args.is_master_node:    
+        print("Creating dataloader")
 
     if args.distributed:
         train_sampler = torch.utils.data.distributed.DistributedSampler(
@@ -185,12 +172,11 @@ def main_worker(nprocs, args):
 
     if args.is_master_node:
         print("Creating model")
-        # create model
+
     Tensor = torch.npu.FloatTensor
     LOSS_G=[]
     LOSS_D=[]
-    os.makedirs("../output", exist_ok=True)
-    os.chdir("../output")
+
     generator = Generator()
     discriminator = Discriminator()
     if args.pretrained:
@@ -233,10 +219,11 @@ def main_worker(nprocs, args):
                                                     opt_level='O1', loss_scale=128,combine_grad=True)
 
     if args.distributed:
-        generator = DDP(generator, device_ids=[local_rank], broadcast_buffers=False)
-        discriminator = DDP(discriminator, device_ids=[local_rank], broadcast_buffers=False)
+        generator = DDP(generator, device_ids=[args.local_rank], broadcast_buffers=False)
+        discriminator = DDP(discriminator, device_ids=[args.local_rank], broadcast_buffers=False)
 
     if args.test_only :
+        os.makedirs("test_images/image", exist_ok=True)  # generated samples are saved under test_images/image/
         Tensor = torch.npu.FloatTensor
         generator = Generator().npu()
         checkpoint = torch.load(r'./checkpoint.pth.tar', map_location='cpu')
@@ -268,7 +255,7 @@ def main_worker(nprocs, args):
             # Generate a batch of images
             gen_imgs = generator(z)
 
-            save_image(gen_imgs.data[:25], "image/%d.png" % i, nrow=5, normalize=True)
+            save_image(gen_imgs.data[:25], "test_images/image/%d.png" % i, nrow=5, normalize=True)
         print("Generate done!")
         return
 
@@ -357,9 +344,9 @@ def parse_args():
     parser.add_argument("--latent_dim", type=int, default=100, help="dimensionality of the latent space")
     parser.add_argument("--img_size", type=int, default=28, help="size of each image dimension")
     parser.add_argument("--channels", type=int, default=1, help="number of image channels")
-    parser.add_argument("--gpus", type=int, default=8, help="num of gpus of per node")
+    parser.add_argument("--gpus", type=int, default=1, help="num of gpus of per node")
     parser.add_argument("--nodes", type=int, default=1)
-    parser.add_argument('--device_id', default=0, type=int, help='device id')
+    parser.add_argument('--local_rank', default=0, type=int, help='device id')
     parser.add_argument("--test_only", type=int, default=None, help="only generate images")
     parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                         help='manual epoch number (useful on restarts)')
@@ -368,6 +355,9 @@ def parse_args():
     parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                         help='use pre-trained model')
 
+    # Dataset path
+    parser.add_argument('--data_path', default='../data/mnist',
+                        help='the path of the dataset')
     parser.add_argument('--distributed', action='store_true',
                         help='Use multi-processing distributed training to launch '
                              'N processes per node, which has N GPUs. This is the '
@@ -381,6 +371,9 @@ def parse_args():
     parser.add_argument('--apex', default=False, action='store_true',
                         help='use apex to train the model')
     args = parser.parse_args()
+    # Device count comes from WORLD_SIZE when it is set (overriding --gpus); otherwise 1.
+    args.gpus = int(os.environ.get('WORLD_SIZE', 1))
+
     return args
 if __name__ == '__main__':
     args = parse_args()
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh
index 4740fafdcc..1e746bffeb 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/env_npu.sh
@@ -34,23 +34,24 @@ ${install_path}/driver/tools/msnpureport -g error -d 5
 ${install_path}/driver/tools/msnpureport -g error -d 6
 ${install_path}/driver/tools/msnpureport -g error -d 7
 
-#将Host日志输出到串口,0-关闭/1-开启
+# Print Host logs to the serial port: 0-disable / 1-enable
 export ASCEND_SLOG_PRINT_TO_STDOUT=0
-#设置默认日志级别,0-debug/1-info/2-warning/3-error
-export ASCEND_GLOBAL_LOG_LEVEL=3
-#设置Event日志开启标志,0-关闭/1-开启
+# Default log level: 0-debug / 1-info / 2-warning / 3-error
+export ASCEND_GLOBAL_LOG_LEVEL=3
+# Event log switch: 0-disable / 1-enable
 export ASCEND_GLOBAL_EVENT_ENABLE=0
-#设置是否开启taskque,0-关闭/1-开启
-export TASK_QUEUE_ENABLE=1
-#设置是否开启PTCopy,0-关闭/1-开启
+# Whether to enable taskque: 0-disable / 1-enable
+export TASK_QUEUE_ENABLE=0
+# Whether to enable PTCopy: 0-disable / 1-enable
 export PTCOPY_ENABLE=1
-#设置是否开启combined标志,0-关闭/1-开启
-export COMBINED_ENABLE=0
-#设置特殊场景是否需要重新编译,不需要修改
+# Whether to enable the combined flag for the 2-non-contiguous case: 0-disable / 1-enable
+export COMBINED_ENABLE=1
+# Whether special scenarios need recompilation; no need to modify
 export DYNAMIC_OP="ADD#MUL"
-#HCCL白名单开关,1-关闭/0-开启
+# HCCL whitelist switch: 1-disable / 0-enable
 export HCCL_WHITELIST_DISABLE=1
-export HCCL_IF_IP=$(hostname -I |awk '{print $1}')
+# The default HCCL timeout of 120s is too short; raise it to 1800s to match the PyTorch default
+export HCCL_CONNECT_TIMEOUT=1800
 
 ulimit -SHn 512000
 
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh
index 744956a7d1..3c9e578f43 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_eval_8p.sh
@@ -1,14 +1,88 @@
 #!/bin/bash
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 8\
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+#export RANK_SIZE=8
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#网络名称,同目录名称,需要模型审视修改
+Network="GAN"
+
+#训练batch_size,,需要模型审视修改
+batch_size=64
+
+
+
+#参数校验，不需要修改
+for para in $*
+do
+    if [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Abort when --data_path was not supplied; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    echo "device id is ${ASCEND_DEVICE_ID}"
+fi
+
+#训练开始时间，不需要修改
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#进入训练脚本目录，需要模型审视修改
+cd $cur_path/
+
+#创建DeviceID输出目录，不需要修改
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#非平台场景时source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+#执行训练脚本，以下传参不需要修改，其他需要模型审视修改
+python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \
         --distributed \
         --lr 0.0008 \
         --batch_size 128 \
         --n_epochs 200 \
         --workers 0 \
         --apex \
-        --device_id 0 \
-        --test_only 1 &
+        --test_only 1 \
+        --data_path ${data_path} > ${cur_path}/output/train_eval_8p.log 2>&1 &
+
+wait
+
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh
index 58b97a99a6..c2e690aa7d 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_finetune_1p.sh
@@ -1,14 +1,88 @@
 #!/bin/bash
-
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 1\
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+#export RANK_SIZE=8
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#网络名称,同目录名称,需要模型审视修改
+Network="GAN"
+
+#训练batch_size,,需要模型审视修改
+batch_size=64
+
+
+
+#参数校验，不需要修改
+for para in $*
+do
+    if [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Abort when --data_path was not supplied; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    echo "device id is ${ASCEND_DEVICE_ID}"
+fi
+
+#训练开始时间，不需要修改
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#进入训练脚本目录，需要模型审视修改
+cd $cur_path/
+
+#创建DeviceID输出目录，不需要修改
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#非平台场景时source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+#执行训练脚本，以下传参不需要修改，其他需要模型审视修改
+python3 -u ${currentDir}/main.py \
         --lr 0.0002 \
         --batch_size 64 \
         --n_epochs 100 \
         --workers 0 \
         --apex \
-        --device_id 0 \
-        --pretrained &
+        --local_rank 0 \
+        --pretrained \
+        --data_path ${data_path} > ${cur_path}/output/train_tune_1p.log 2>&1 &
+
+wait
+
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh
index eb288785d0..1fd471a4e0 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_1p.sh
@@ -1,12 +1,128 @@
 #!/bin/bash
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 1\
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export RANK_SIZE=1
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#网络名称,同目录名称,需要模型审视修改
+Network="GAN"
+
+#训练batch_size,,需要模型审视修改
+batch_size=128
+
+
+
+#参数校验，不需要修改
+for para in $*
+do
+    if [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Abort when --data_path was not supplied; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    echo "device id is ${ASCEND_DEVICE_ID}"
+fi
+
+#训练开始时间，不需要修改
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#进入训练脚本目录，需要模型审视修改
+cd $cur_path/
+
+#创建DeviceID输出目录，不需要修改
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#非平台场景时source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+#执行训练脚本，以下传参不需要修改，其他需要模型审视修改
+python3 -u ${currentDir}/main.py \
         --lr 0.0002 \
-        --batch_size 64 \
+        --batch_size ${batch_size} \
         --n_epochs 200 \
-        --workers 0 \
+        --workers 16 \
         --apex \
-        --device_id 0 &
+        --local_rank 0 \
+        --data_path ${data_path} > ${cur_path}/output/train_full_1p.log 2>&1 &
+
+wait
+
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+#最后一个迭代FPS值
+FPS=`grep -a 'FPS:'  ${cur_path}/output/train_full_1p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'`
+
+#最后一个迭代loss值
+loss=`grep -a 'D loss:'  ${cur_path}/output/train_full_1p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'`
+
+#打印，不需要修改
+echo "ActualFPS : $FPS"
+echo "ActualLoss : ${loss}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#稳定性精度看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据，不需要修改
+# Per-iteration training time in ms (batch_size * 1000 / FPS); left empty when no FPS value was parsed from the log
+TrainingTime=`awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要模型审视修改
+grep -a 'D loss:'  ${cur_path}/output/train_full_1p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+grep -a 'FPS:'  ${cur_path}/output/train_full_1p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt
+
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh
index 8c42ea3fa3..ad9102476e 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_full_8p.sh
@@ -1,13 +1,128 @@
 #!/bin/bash
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 8\
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export RANK_SIZE=8
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#网络名称,同目录名称,需要模型审视修改
+Network="GAN"
+
+#训练batch_size,,需要模型审视修改
+batch_size=128
+
+
+
+#参数校验，不需要修改
+for para in $*
+do
+    if [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Abort when --data_path was not supplied; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    echo "device id is ${ASCEND_DEVICE_ID}"
+fi
+
+#训练开始时间，不需要修改
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#进入训练脚本目录，需要模型审视修改
+cd $cur_path/
+
+#创建DeviceID输出目录，不需要修改
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#非平台场景时source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+#执行训练脚本，以下传参不需要修改，其他需要模型审视修改
+python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \
         --distributed \
         --lr 0.0008 \
-        --batch_size 128 \
+        --batch_size ${batch_size} \
         --n_epochs 200 \
-        --workers 0 \
+        --workers 16 \
         --apex \
-        --device_id 0 &
+        --data_path ${data_path} > ${cur_path}/output/train_full_8p.log 2>&1 &
+
+wait
+
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+#最后一个迭代FPS值
+FPS=`grep -a 'FPS:'  ${cur_path}/output/train_full_8p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'`
+
+#最后一个迭代loss值
+loss=`grep -a 'D loss:'  ${cur_path}/output/train_full_8p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'`
+
+#打印，不需要修改
+echo "ActualFPS : $FPS"
+echo "ActualLoss : ${loss}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#稳定性精度看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##获取性能数据，不需要修改
+# Per-iteration training time in ms (batch_size * 1000 / FPS); left empty when no FPS value was parsed from the log
+TrainingTime=`awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要模型审视修改
+grep -a 'D loss:'  ${cur_path}/output/train_full_8p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+grep -a 'FPS:'  ${cur_path}/output/train_full_8p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt
+
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
index 7794753d3d..da5a3e61a8 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
@@ -1,12 +1,128 @@
 #!/bin/bash
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 1\
+#当前路径,不需要修改
+cur_path=`pwd`
+
+#集合通信参数,不需要修改
+export RANK_SIZE=1
+
+# 数据集路径,保持为空,不需要修改
+data_path=""
+
+#网络名称,同目录名称,需要模型审视修改
+Network="GAN"
+
+#训练batch_size,,需要模型审视修改
+batch_size=64
+
+
+
+#参数校验，不需要修改
+for para in $*
+do
+    if [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Abort when --data_path was not supplied; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    echo "device id is ${ASCEND_DEVICE_ID}"
+fi
+
+#训练开始时间，不需要修改
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#进入训练脚本目录，需要模型审视修改
+cd $cur_path/
+
+#创建DeviceID输出目录，不需要修改
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+fi
+
+#非平台场景时source 环境变量
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${cur_path}/test/env_npu.sh
+fi
+
+#执行训练脚本，以下传参不需要修改，其他需要模型审视修改
+python3 -u ${currentDir}/main.py \
         --lr 0.0002 \
-        --batch_size 64 \
+        --batch_size ${batch_size} \
         --n_epochs 1 \
         --workers 16 \
         --apex \
-        --device_id 0 &
+        --local_rank 0 \
+        --data_path ${data_path} > ${cur_path}/output/train_perf_1p.log 2>&1 &
+
+wait
+
+#训练结束时间，不需要修改
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+#最后一个迭代FPS值
+FPS=`grep -a 'FPS:'  ${cur_path}/output/train_perf_1p.log|awk -F "FPS:" '{print $NF}'|awk 'END {print}'`
+
+#最后一个迭代loss值
+loss=`grep -a 'D loss:'  ${cur_path}/output/train_perf_1p.log | awk -F "D loss:" '{print $NF}'| awk 'END {print}' | awk -F "]" '{print $1}'`
+
+#打印，不需要修改
+echo "ActualFPS : $FPS"
+echo "ActualLoss : ${loss}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#稳定性精度看护结果汇总
+#训练用例信息，不需要修改
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+##获取性能数据，不需要修改
+# Per-iteration training time in ms (batch_size * 1000 / FPS); left empty when no FPS value was parsed from the log
+TrainingTime=`awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{if (fps + 0 > 0) printf "%.2f\n", bs * 1000 / fps}'`
+
+#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中，需要模型审视修改
+grep -a 'D loss:'  ${cur_path}/output/train_perf_1p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+grep -a 'FPS:'  ${cur_path}/output/train_perf_1p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt
+
+#关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
+
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
index 092fdb26e7..eaca822eda 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
@@ -1,13 +1,128 @@
 #!/bin/bash
-source env_npu.sh
 currentDir=$(cd "$(dirname "$0")";pwd)/..
 
-nohup python3 ${currentDir}/main.py \
-        --gpus 8\
+#Current working directory; no need to modify
+cur_path=$(pwd)
+
+#Collective communication parameter; no need to modify
+export RANK_SIZE=8
+
+#Dataset path; keep it empty here (it is filled in from --data_path), no need to modify
+data_path=''
+
+#Network name, same as the directory name; review and adjust per model
+Network='GAN'
+
+#Training batch_size; review and adjust per model
+batch_size=128
+
+
+
+#Parse the command-line arguments (--device_id=<id>, --data_path=<dir>); no need to modify
+for para in "$@"
+do
+    case "${para}" in
+        --device_id*) device_id="${para#*=}" ;;
+        --data_path*) data_path="${para#*=}" ;;
+    esac
+done
+#"$@" and direct expansion keep every value intact: no word splitting, globbing or echo option parsing
+
+#Abort when data_path was not passed in; no need to modify
+if [[ -z "${data_path}" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+
+#Resolve the device id: keep a preset ASCEND_DEVICE_ID, else take --device_id, else default to devices 0-7; no need to modify
+#The operands are quoted so that a value containing whitespace cannot break the test expression
+if [ -z "${ASCEND_DEVICE_ID}" ];then
+    if [ -n "${device_id}" ];then
+        export ASCEND_DEVICE_ID=${device_id}
+    else
+        export ASCEND_DEVICE_ID=0,1,2,3,4,5,6,7
+    fi
+fi
+echo "device id is ${ASCEND_DEVICE_ID}"
+
+#Training start time (seconds since the epoch); no need to modify
+start_time=$(date +%s)
+echo "start_time: ${start_time}"
+
+#Enter the training script directory; review and adjust per model
+cd $cur_path/
+
+#Create the DeviceID output directory; no need to modify
+if [ -d ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID} ];then
+    #Remove whatever a previous run left behind
+    rm -rf ${cur_path}/output/${Network}/${ASCEND_DEVICE_ID}
+fi
+#(Re)create the directory together with its ckpt subdirectory
+mkdir -p ${cur_path}/output/${Network}/$ASCEND_DEVICE_ID/ckpt
+
+#Source the environment variables unless running on the platform (etp_running_flag=true)
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=$(echo ${check_etp_flag#*=})
+if [ x"${etp_flag}" != x"true" ];then
+    source ${currentDir}/test/env_npu.sh  #resolved from the script location, so it is found from any launch directory
+fi
+
+#Run the training script in the background with all output sent to train_perf_8p.log; the arguments passed below need no modification, everything else should be reviewed per model
+python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.py \
         --distributed \
         --lr 0.0008 \
-        --batch_size 128 \
+        --batch_size ${batch_size} \
         --n_epochs 1 \
         --workers 16 \
         --apex \
-        --device_id 0 &
+        --data_path ${data_path} > ${cur_path}/output/train_perf_8p.log 2>&1 &
+
+wait
+
+#Training end time and end-to-end duration in seconds; no need to modify
+end_time=$(date +%s)
+echo "end_time: ${end_time}"
+e2e_time=$(( $end_time - $start_time ))
+
+#FPS value of the last logged iteration; a trailing "]" is cut off, consistent with the loss extraction below
+FPS=$(grep -a 'FPS:' ${cur_path}/output/train_perf_8p.log | awk -F "FPS:" '{print $NF}' | awk 'END {print}' | awk -F "]" '{print $1}')
+
+#"D loss" value of the last logged iteration
+loss=$(grep -a 'D loss:' ${cur_path}/output/train_perf_8p.log | awk -F "D loss:" '{print $NF}' | awk 'END {print}' | awk -F "]" '{print $1}')
+
+#Print the results; no need to modify
+echo "ActualFPS : $FPS"
+echo "ActualLoss : ${loss}"
+echo "E2E Training Duration sec : $e2e_time"
+
+#Summary of the stability/accuracy monitoring results
+#Training case information; no need to modify. NOTE(review): CaseName ends in 'acc' although this is the performance script - confirm this is intended
+BatchSize=${batch_size}
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+##Collect the performance data; no need to modify
+#Training time of a single iteration, batch_size*1000/FPS; values go in via -v so an empty, padded or "]"-suffixed FPS cannot corrupt the awk program, and a zero/empty FPS yields an empty result
+TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" 'BEGIN{if (fps+0 > 0) printf "%.2f\n", bs*1000/fps}')
+
+#Extract every "D loss:" value from train_perf_8p.log into train_${CaseName}_loss.txt; review and adjust per model
+grep -a 'D loss:'  ${cur_path}/output/train_perf_8p.log | awk -F "D loss:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+#Likewise extract every "FPS:" value into train_${CaseName}_FPS.txt
+grep -a 'FPS:'  ${cur_path}/output/train_perf_8p.log | awk -F "FPS:" '{print $NF}' | awk -F "]" '{print $1}' >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/train_${CaseName}_FPS.txt
+
+#Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${loss}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/${Network}/$ASCEND_DEVICE_ID/${CaseName}.log
+
+
+
+
+
+
-- 
Gitee


From a576ce37eac7c7d769d6ee8736c4dc44bf017a73 Mon Sep 17 00:00:00 2001
From: Ryan <wangzhiwei73@huawei.com>
Date: Wed, 25 May 2022 18:43:11 +0800
Subject: [PATCH 2/2] Raise n_epochs from 1 to 3 in the performance training scripts

---
 .../contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh  | 2 +-
 .../contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
index da5a3e61a8..d78646b7f5 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_1p.sh
@@ -71,7 +71,7 @@ fi
 python3 -u ${currentDir}/main.py \
         --lr 0.0002 \
         --batch_size ${batch_size} \
-        --n_epochs 1 \
+        --n_epochs 3 \
         --workers 16 \
         --apex \
         --local_rank 0 \
diff --git a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
index eaca822eda..058db683ad 100644
--- a/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
+++ b/PyTorch/contrib/cv/others/GAN_Pytorch/test/train_performance_8p.sh
@@ -72,7 +72,7 @@ python3.7 -u -m torch.distributed.launch --nproc_per_node=8 ${currentDir}/main.p
         --distributed \
         --lr 0.0008 \
         --batch_size ${batch_size} \
-        --n_epochs 1 \
+        --n_epochs 3 \
         --workers 16 \
         --apex \
         --data_path ${data_path} > ${cur_path}/output/train_perf_8p.log 2>&1 &
-- 
Gitee