#!/bin/bash
# Source path:
#   PyTorch/built-in/cv/detection/YOLOV4_ID0396_for_PyTorch/test/train_performance_16p.sh
#
# Performance run of YOLOv4 on two 8-device nodes (RANK_SIZE=16).
# Launches 8 local train_16p.py workers, waits for them, then extracts FPS and
# loss from the training log into a summary file under test/output/0/.
#
# Usage:
#   bash train_performance_16p.sh --data_path=<dir> [--conf_path=<dir>] \
#       [--server_index=<n>] [--fix_node_ip=<ip>] [--devicesnum=<str>]
#
# --data_path is mandatory; the other options are consumed as described at the
# point where each one is used below.

cur_path=$(pwd)
export ASCEND_SLOG_PRINT_TO_STDOUT=0

# NOTE(review): these listings look like debugging residue (hard-coded host
# paths, output files dropped into the current directory). The second path
# spells "coco_txt" while the others use "coco_txl" - confirm which is meant.
ls /npu/traindata/coco_txl >1.txt
ls /npu/traindata/coco_txt/images >2.txt
ls /npu/traindata/coco_txl/images/train2017 >3.txt

################ Basic configuration, review per model ################
# Mandatory fields that must be defined here: Network batch_size RANK_SIZE
# Network name, same as the directory name
Network="YOLOV4_ID0396_for_PyTorch"
# Training batch size
batch_size=512
# Number of NPU devices used for training
export RANK_SIZE=16
# Values filled in from the command line; keep empty here
data_path=""
conf_path=""
server_index=""
fix_node_ip=""
devicesnum=""

# Training epochs
train_epochs=1
# Image size
image_size=608
# Fallback NPU device id used when ASCEND_DEVICE_ID is not set by the caller
device_id=0
# Training entry point. The same file is patched by patch_sources() and then
# launched, so the "notest" patch applies to the script that actually runs.
train_script="train_16p.py"

# Parse --key=value arguments; unknown arguments are ignored.
for para in "$@"; do
  case "${para}" in
    --data_path*) data_path=${para#*=} ;;
    --fix_node_ip*) fix_node_ip=${para#*=} ;;
    --devicesnum*) devicesnum=${para#*=} ;;
    --conf_path*) conf_path=${para#*=} ;;
    --server_index*) server_index=${para#*=} ;;
  esac
done

# With an empty --conf_path the search starts in the current directory
# (what an unquoted empty argument to GNU find resolves to).
conf_dir=${conf_path:-.}
# Master node address: taken from the name of the "server_*0.info" file.
one_node_ip=$(find "${conf_dir}" -name "server_*0.info" \
  | awk -F "server_" '{print $2}' \
  | awk -F "_" '{print $1}')
# Number of nodes = number of server_*.info files.
linux_num=$(find "${conf_dir}" -name "server_*.info" | wc -l)

# data_path is mandatory
if [[ -z "${data_path}" ]]; then
  echo "[Error] para \"data_path\" must be confing"
  exit 1
fi

# Device id: either provided by the environment or taken from device_id above
if [[ -n "${ASCEND_DEVICE_ID}" ]]; then
  echo "device id is ${ASCEND_DEVICE_ID}"
elif [[ -n "${device_id}" ]]; then
  export ASCEND_DEVICE_ID=${device_id}
  echo "device id is ${ASCEND_DEVICE_ID}"
else
  # The original line lacked "echo" and tried to execute the message.
  echo "[Error] device id must be config"
  exit 1
fi

############### Resolve the directory the training runs from ###############
# Run from the directory that contains test/; test_path_dir is the test/ path.
cur_path_last_dirname=${cur_path##*/}
if [[ "${cur_path_last_dirname}" == "test" ]]; then
  test_path_dir=${cur_path}
  cd .. || exit 1
  cur_path=$(pwd)
else
  test_path_dir=${cur_path}/test
fi

################# Create a fresh log output directory #################
# All logs of this script go to output/0, independent of the id checked above.
ASCEND_DEVICE_ID=0
output_dir="${test_path_dir}/output/${ASCEND_DEVICE_ID}"
train_log="${output_dir}/train_${ASCEND_DEVICE_ID}.log"
rm -rf -- "${output_dir:?}"
mkdir -p -- "${output_dir}"

################# Unpack the dataset once #################
dataset_root="${data_path}/../coco_txl"
coco_dir="${dataset_root}/COCO2017"
# The marker is a regular file, so it has to be tested with -f; the previous
# -d test never matched and the archive was extracted on every run.
if [[ -f "${coco_dir}/images/train2017/000000000009.jpg" ]]; then
  echo "NO NEED UNTAR"
else
  mkdir -p -- "${dataset_root}"
  if ! tar -zxvf "${data_path}/COCO2017.tar.gz" -C "${dataset_root}/"; then
    echo "[Error] failed to extract ${data_path}/COCO2017.tar.gz" >&2
    exit 1
  fi
  rm -rf -- "${coco_dir}"/labels/*.cache
fi

################# Temporary source patches #################
sources_patched=0

# Point the dataset config and test.py at the unpacked dataset and make the
# training script skip evaluation when --notest is passed.
patch_sources() {
  sources_patched=1
  sed -i "s|./coco/train2017.txt|${coco_dir}/train2017.txt|g" data/coco.yaml
  sed -i "s|./coco/val2017.txt|${coco_dir}/val2017.txt|g" data/coco.yaml
  sed -i "s|./coco/testdev2017.txt|${coco_dir}/testdev2017.txt|g" data/coco.yaml
  sed -i "s|./coco/annotations/instances_val|${coco_dir}/annotations/instances_val|g" test.py
  sed -i "s|opt.notest or final_epoch:|opt.notest:|g" "${train_script}"
}

# Undo patch_sources(). Guarded so it runs at most once and only after a patch.
restore_sources() {
  [[ "${sources_patched}" -eq 1 ]] || return 0
  sources_patched=0
  sed -i "s|${coco_dir}/train2017.txt|./coco/train2017.txt|g" data/coco.yaml
  sed -i "s|${coco_dir}/val2017.txt|./coco/val2017.txt|g" data/coco.yaml
  sed -i "s|${coco_dir}/testdev2017.txt|./coco/testdev2017.txt|g" data/coco.yaml
  sed -i "s|${coco_dir}/annotations/instances_val|./coco/annotations/instances_val|g" test.py
  sed -i "s|opt.notest:|opt.notest or final_epoch:|g" "${train_script}"
}

# Restore the sources even if the run is interrupted or exits early.
trap restore_sources EXIT
patch_sources

################# Distributed environment #################
export HCCL_IF_IP=${fix_node_ip}
export MASTER_ADDR=${one_node_ip}
export MASTER_PORT=29501
export HCCL_WHITELIST_DISABLE=1

# device_num is the *string length* of --devicesnum.
# NOTE(review): presumably --devicesnum carries one character per device
# (e.g. 01234567) - confirm against the caller.
device_num=${#devicesnum}
# Global rank offset of this node; a missing --server_index counts as node 0.
# Values are passed with -v so an empty value cannot break the awk program.
rank_server=$(awk -v n="${device_num}" -v s="${server_index:-0}" \
  'BEGIN { printf "%.0f\n", n * s }')
NPU_WORLD_SIZE=$(awk -v n="${device_num}" -v s="${linux_num}" \
  'BEGIN { printf "%.0f\n", n * s }')
export NPU_WORLD_SIZE

################# Launch training #################
# Training start time
start_time=$(date +%s)
# CPU cores reserved per worker when pinning with taskset
KERNEL_NUM=$(($(nproc) / 8))
machine_arch=$(uname -m)

# Arguments shared by every worker; only --global_rank differs.
train_args=(
  --img "${image_size}" "${image_size}"
  --data coco.yaml
  --cfg cfg/yolov4_8p.cfg
  --weights ''
  --name yolov4
  --batch-size "${batch_size}"
  --epochs="${train_epochs}"
  --amp
  --opt-level O1
  --loss_scale 128
  --multiprocessing_distributed
  --device 'npu'
  --device_list 0,1,2,3,4,5,6,7
  --world_size 2
  --addr "${one_node_ip}"
  --dist_url 'tcp://127.0.0.1:41111'
  --dist_backend 'hccl'
  --stop_step_num 100
  --notest
)

for i in 0 1 2 3 4 5 6 7; do
  export NPU_CALCULATE_DEVICE=$i
  rankid=$((rank_server + i))

  # On aarch64 each worker is pinned to its own contiguous range of cores.
  launcher=()
  if [[ "${machine_arch}" == "aarch64" ]]; then
    PID_START=$((KERNEL_NUM * i))
    PID_END=$((PID_START + KERNEL_NUM - 1))
    launcher=(taskset -c "${PID_START}-${PID_END}")
  fi

  # Append (>>): all 8 workers share one log file. With a truncating ">" each
  # worker writes from its own offset 0 and they overwrite each other.
  "${launcher[@]}" python3.7 "${train_script}" "${train_args[@]}" \
    --global_rank "${rankid}" >>"${train_log}" 2>&1 &
done

wait

# Training end time
end_time=$(date +%s)
e2e_time=$((end_time - start_time))

# Restore the patched parameters
restore_sources

################# Result reporting #################
echo "------------------ Final result ------------------"
# Performance FPS: 4th field of the last log line containing "FPS"
FPS=$(grep -a 'FPS' "${train_log}" | awk -F " " '{print $4}' | awk 'END {print}')
echo "Final Performance images/sec : $FPS"
echo "E2E Training Duration sec : $e2e_time"

# Performance summary
BatchSize=${batch_size}
DeviceType=$(uname -m)
CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'

# Throughput
ActualFPS=${FPS}
# Time per training iteration in ms; left empty when no FPS was found
TrainingTime=$(awk -v bs="${batch_size}" -v fps="${FPS}" \
  'BEGIN { if (fps > 0) printf "%.2f\n", bs * 1000 / fps }')

# Extract the loss column from the training log into train_${CaseName}_loss.txt
loss_file="${output_dir}/train_${CaseName}_loss.txt"
tr '\r' '\n' <"${train_log}" \
  | grep "${image_size}:" \
  | awk -F " " '{print $6}' >>"${loss_file}"

# Loss of the last iteration
ActualLoss=$(awk 'END {print}' "${loss_file}")

# Write the key information to ${CaseName}.log
{
  echo "Network = ${Network}"
  echo "RankSize = ${RANK_SIZE}"
  echo "BatchSize = ${BatchSize}"
  echo "DeviceType = ${DeviceType}"
  echo "CaseName = ${CaseName}"
  echo "ActualFPS = ${ActualFPS}"
  echo "TrainingTime = ${TrainingTime}"
  echo "ActualLoss = ${ActualLoss}"
  echo "E2ETrainingTime = ${e2e_time}"
} >"${output_dir}/${CaseName}.log"
# Source path:
#   PyTorch/built-in/cv/detection/YOLOV4_ID0396_for_PyTorch/train_16p.py
#
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import warnings
warnings.filterwarnings("ignore")
import argparse
import math
import os
import random
import time
from pathlib import Path

import numpy as np
import torch.distributed as dist
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data
import yaml

import apex
from apex import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


import test  # import test.py to get mAP after each epoch
from models.models import *
from utils.datasets import create_dataloader
from utils.general import (
    check_img_size, torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors,
    labels_to_image_weights, compute_loss, plot_images, fitness, strip_optimizer, plot_results,
    get_latest_run, check_git_status, check_file, increment_dir, print_mutation, plot_evolution)
from utils.google_utils import attempt_download
from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts

# Local NPU index of this process; taken from the NPU_CALCULATE_DEVICE
# environment variable when it holds a plain non-negative integer, else 0.
NPU_CALCULATE_DEVICE = 0
if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))


def train(hyp, opt, device, tb_writer=None):
    """Train a Darknet model and return the last evaluation results.

    Args:
        hyp: hyperparameter mapping loaded from the ``--hyp`` yaml file. It is
            modified in place (``weight_decay`` and ``cls`` are rescaled).
        opt: parsed command-line namespace. ``opt.total_batch_size`` and the
            per-process ``opt.batch_size`` must already be set by the caller.
        device: ``torch.device`` the model is moved to.
        tb_writer: optional TensorBoard writer; when given, its ``log_dir``
            is also used as the logging directory.

    Returns:
        The ``results`` tuple of the last evaluation, or the all-zero initial
        tuple when no evaluation was run.
    """
    print(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = str(log_dir / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # TODO: Use DDP logging. Only the first process is allowed to log.
    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    npu = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Darknet(opt.cfg).to(device)  # create
        state_dict = {k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel()}
        model.load_state_dict(state_dict, strict=False)
        print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Darknet(opt.cfg).to(device)  # create

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2.append(v)  # biases
        elif 'Conv2d.weight' in k:
            pg1.append(v)  # apply weight_decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = apex.optimizers.NpuFusedAdam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = apex.optimizers.NpuFusedSGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    if opt.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=opt.opt_level,
                                          loss_scale=opt.loss_scale, combine_grad=True)
    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                  (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = 32  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples

    # SyncBatchNorm
    if opt.sync_bn and npu and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        print('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if npu and rank != -1:
        model = DDP(model, device_ids=[NPU_CALCULATE_DEVICE], broadcast_buffers=False)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, local_rank=rank,
                                            world_size=opt.world_size)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates ***
        # local_rank is set to -1. Because only the first process is expected to do evaluation.
        testloader = create_dataloader(test_path, imgsz_test, batch_size, imgsz_test + 32, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images, pad=0.0, rect=True, local_rank=-1,
                                       world_size=opt.world_size)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            tb_writer.add_histogram('classes', c, 0)

    # Check anchors
    #if not opt.noautoanchor:
    #    check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move

    if rank in [0, -1]:
        print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
        print('Using %g dataloader workers' % dataloader.num_workers)
        print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                dataset.indices = random.choices(range(dataset.n), weights=image_weights,
                                                 k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    # torch.from_tensor does not exist; torch.tensor builds the tensor from the list.
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        if rank in [-1, 0]:
            print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        start_time = time.time()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            # The timer restarts at step 5 so the first 5 steps are excluded from the FPS figure.
            if i == 5:
                start_time = time.time()
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                # randrange needs integer bounds; imgsz * 0.5 and imgsz * 1.5 are floats.
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Pad the targets along dim 0 up to nt_max rows (32 per image, doubled until they fit).
            nt = targets.shape[0]
            batch_size = imgs.shape[0]
            nt_max = 32 * batch_size
            while nt > nt_max:
                nt_max *= 2
                print('targets len larger than nt_max, schedule to more bigger =', nt_max)
            pad_size = nt_max - nt
            pad_target = torch.nn.functional.pad(targets, [0, 0, 0, pad_size])
            pred = model(imgs)

            # Loss
            loss, loss_items = compute_loss(pred, pad_target.to(device), model)  # scaled by batch_size
            if rank != -1:
                loss *= opt.world_size  # gradient averaged between devices in DDP mode
            # if not torch.isfinite(loss):
            #     print('WARNING: non-finite loss, ending training ', loss_items)
            #     return results

            # Backward
            if opt.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()  # optimizer.step
                optimizer.zero_grad()
                if ema is not None:
                    # NOTE(review): x is not read again in this branch - confirm whether this
                    # device tensor creation is intentional before removing it.
                    x = torch.tensor([1.]).to(device)
                    params_fp32_fused = optimizer.get_model_combined_params()
                    ema.update(model, 'npu', params_fp32_fused[0])

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.npu.memory_reserved() / 1E9 if torch.npu.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard
            # Early stop for performance runs; evaluated on every rank so all processes leave the loop together.
            if opt.stop_step_num is not None and i >= opt.stop_step_num:
                break

            # end batch ------------------------------------------------------------------------------------------------
        if rank in [-1, 0]:
            epoch_time = time.time() - start_time
            if i >= 5:
                print('Training speed is {} FPS'.format(total_batch_size * (i + 1 - 5) / (epoch_time)))
            else:
                print('Training speed is {} FPS'.format(total_batch_size * (i + 1) / (epoch_time)))
        # Scheduler
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(model)
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=batch_size,
                                                 imgsz=imgsz_test,
                                                 save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
                                                 model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            # Test the wrapped model (ema.ema), the object that is dereferenced,
                            # consistent with the evaluation call above.
                            'model': ema.ema.module.state_dict() if hasattr(ema.ema, 'module') else ema.ema.state_dict(),
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if epoch >= (epochs-5):
                    torch.save(ckpt, last.replace('.pt','_{:03d}.pt'.format(epoch)))
                if (best_fitness == fi) and not final_epoch:
                    torch.save(ckpt, best)
                del ckpt
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload
        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.npu.empty_cache()
    return results


def device_id_to_process_device_map(device_list):
    """Map local process indices to device ids.

    Args:
        device_list: comma-separated device ids, e.g. ``'0,1,2,3'``.

    Returns:
        dict mapping ``0..n-1`` to the device ids in ascending order.
    """
    devices = device_list.split(",")
    devices = [int(x) for x in devices]
    devices.sort()

    process_device_map = dict()
    for process_id, device_id in enumerate(devices):
        process_device_map[process_id] = device_id

    return process_device_map


def main_worker(opt):
    """Set up the device and process group for this process, then train or evolve.

    Args:
        opt: parsed command-line namespace; ``opt.world_size`` must already hold
            the total number of processes. The namespace is modified in place
            (weights, hyp/data/cfg paths, batch sizes, global rank).
    """
    # Use the module-level value, which is already validated and defaults to 0;
    # int(os.getenv(...)) raised TypeError when the variable was not set.
    device_id = NPU_CALCULATE_DEVICE
    print("device_id: ",device_id)
    loc = 'npu:{}'.format(device_id)
    if opt.device == 'npu':
        torch.npu.set_device(loc)
        print("Use NPU: {} for training".format(device_id))
    # Resume
    if opt.resume:
        last = get_latest_run() if opt.resume == 'get_last' else opt.resume  # resume from most recent run
        if last and not opt.weights:
            print(f'Resuming training from {last}')
        opt.weights = last if opt.resume and not opt.weights else opt.weights
    print("training config:", opt)
    device = torch.device(loc) if opt.device == 'npu' else torch.device('cpu')

    if opt.multiprocessing_distributed:
        if opt.dist_url == "env://" and opt.global_rank == -1:
            opt.global_rank = int(os.environ["RANK"])
        # No init_method is passed, so the rendezvous uses the MASTER_ADDR/MASTER_PORT
        # environment variables set in main(); opt.dist_url is only consulted above.
        dist.init_process_group(backend=opt.dist_backend,  # init_method=cfg.dist_url,
                                world_size=opt.world_size, rank=opt.global_rank)

    opt.hyp = opt.hyp or ('data/hyp.scratch.yaml')
    opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
    assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'

    opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
    #device = select_device(opt.device, batch_size=opt.batch_size)
    # --batch-size is the total over all processes; each process gets an equal share.
    opt.total_batch_size = opt.batch_size
    opt.batch_size = opt.total_batch_size // opt.world_size

    print(opt)
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

    # Train
    if not opt.evolve:
        tb_writer = None
        # if opt.global_rank in [-1, 0]:
        #     print('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
        #     tb_writer = SummaryWriter(log_dir=increment_dir(Path(opt.logdir) / 'exp', opt.name))  # runs/exp
        train(hyp, opt, device, tb_writer)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'giou': (1, 0.02, 0.2),  # GIoU loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (1, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (0, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (1, 0.0, 1.0),  # image flip left-right (probability)
                'mixup': (1, 0.0, 1.0)}  # image mixup (probability)

        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
        opt.notest, opt.nosave = True, True  # only test/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        yaml_file = Path('runs/evolve/hyp_evolved.yaml')  # save best result here
        if opt.bucket:
            os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket)  # download evolve.txt if exists

        for _ in range(100):  # generations to evolve
            if os.path.exists('evolve.txt'):  # if evolve.txt exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt('evolve.txt', ndmin=2)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min()  # weights
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.9, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([x[0] for x in meta.values()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device)

            # Write mutation results
            print_mutation(hyp.copy(), results, yaml_file, opt.bucket)

        # Plot results
        plot_evolution(yaml_file)
        print('Hyperparameter evolution complete. Best results saved as: %s\nCommand to train a new model with these '
              'hyperparameters: $ python train.py --hyp %s' % (yaml_file, yaml_file))


def main(opt):
    """Prepare the rendezvous environment and start this process's worker.

    Sets MASTER_ADDR from ``opt.addr`` and MASTER_PORT to 29501, converts
    ``opt.world_size`` from a node count into a process count (devices per
    node * nodes) and calls ``main_worker``. Without
    ``--multiprocessing_distributed`` only a message is printed.
    """
    os.environ['MASTER_ADDR'] = opt.addr
    os.environ['MASTER_PORT'] = '29501'
    if opt.dist_url == "env://" and opt.world_size == -1:
        opt.world_size = int(os.environ["WORLD_SIZE"])
    opt.process_device_map = device_id_to_process_device_map(opt.device_list)
    if opt.device == 'npu':
        npus_per_node = len(opt.process_device_map)
    else:
        npus_per_node = torch.npu.device_count()

    print('{} node found.'.format(npus_per_node))
    if opt.multiprocessing_distributed:
        opt.world_size = npus_per_node * opt.world_size
        main_worker(opt)
    else:
        print('multi npu training failed to init...')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='yolov4.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')
    parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
    parser.add_argument('--hyp', type=str, default='', help='hyperparameters path, i.e. data/hyp.scratch.yaml')
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const='get_last', default=False,
                        help='resume from given path/last.pt, or most recent run if blank')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--notest', action='store_true', help='only test final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')
    parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
    #parser.add_argument('--device', default='', help='npu device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
    parser.add_argument('--amp', default=False, action='store_true',
                        help='use amp to train the model')
    parser.add_argument('--loss_scale', default='dynamic',
                        help='loss scale using in amp, default means dynamic loss scale')
    parser.add_argument('--opt-level', default='O1', type=str,
                        help='opt level used in amp, default O1')
    parser.add_argument('--world_size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--multiprocessing_distributed', action='store_true',
                        help='Use multi-processing distributed training to launch '
                             'N processes per node, which has N NPUs. This is the '
                             'fastest way to use PyTorch for either single node or '
                             'multi node data parallel training')
    parser.add_argument('--global_rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--dist_url', default='tcp://224.66.41.62:23456', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist_backend', default='nccl', type=str,
                        help='distributed backend')
    parser.add_argument('--addr', default='90.90.176.152', type=str,
                        help='master addr')
    parser.add_argument('--device_list', default='0,1,2,3,4,5,6,7', type=str,
                        help='device id list')
    parser.add_argument('--device', default='npu', type=str, help='npu or cpu')
    parser.add_argument('--stop_step_num', default=None, type=int,
                        help='after the stop_step, killing the training task')
    opt = parser.parse_args()

    main(opt)
    # Resume
    #
    # if opt.local_rank == -1 or ("RANK" in os.environ and os.environ["RANK"] == "0"):
    #     check_git_status()

    # opt.hyp = opt.hyp or ('data/hyp.scratch.yaml')
    # opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
    # assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
    #
    # opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
    # device = select_device(opt.device, batch_size=opt.batch_size)
    # opt.total_batch_size = opt.batch_size
    # opt.world_size = 1
    # opt.global_rank = -1

    # DDP mode
    # if opt.local_rank != -1:
    #     assert torch.npu.device_count() > opt.local_rank
    #     torch.npu.set_device(opt.local_rank)
    #     device = torch.device('npu', opt.local_rank)
    #     dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
    #     opt.world_size = dist.get_world_size()
    #     opt.global_rank = dist.get_rank()
    #     assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
    #     opt.batch_size = opt.total_batch_size // opt.world_size