From ee5dd799c0a4deda8e91710cfa581ad7c056201d Mon Sep 17 00:00:00 2001
From: WMC <846995292@qq.com>
Date: Mon, 30 May 2022 20:30:42 +0800
Subject: [PATCH 1/5] update torch version

---
 .../cv/classification/SENet154/README.md      |  7 +++---
 .../classification/SENet154/test/run_2onnx.sh |  2 +-
 .../SENet154/test/train_eval_8p.sh            |  2 +-
 .../SENet154/test/train_full_1p.sh            |  2 +-
 .../SENet154/test/train_full_8p.sh            |  2 +-
 .../SENet154/test/train_performance_1p.sh     |  2 +-
 .../SENet154/test/train_performance_8p.sh     |  2 +-
 .../cv/classification/SENet154/train.py       | 22 +++++++++----------
 8 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/PyTorch/contrib/cv/classification/SENet154/README.md b/PyTorch/contrib/cv/classification/SENet154/README.md
index 3a719047ec..9da2ec1fb0 100755
--- a/PyTorch/contrib/cv/classification/SENet154/README.md
+++ b/PyTorch/contrib/cv/classification/SENet154/README.md
@@ -42,8 +42,8 @@ Label smoothing is required for qualified model accuracy.
    4. `--batch-size N`:分配个每个设备的batch大小
 3. 开始训练
    ```
-   bash ./test/train_full_1p.sh --data_path=数据集路径         # 精度训练
-   bash ./test/train_performance_1p.sh --data_path=数据集路径  # 性能训练
+   bash ./test/train_full_8p.sh --data_path=数据集路径         # 精度训练
+   bash ./test/train_performance_8p.sh --data_path=数据集路径  # 性能训练
    ```

 ### 训练结果
@@ -67,5 +67,4 @@ $E$为当前一轮的Epoch序号,从0开始
 ### NPU 8p
 |Epochs|Learning rate                         |Optimization type|FPS     |Acc@1 |Acc@5 |
 |:----:|:------------------------------------:|:---------------:|:------:|:----:|:----:|
-|120   |$0.6\times 0.1^{\lfloor E/30 \rfloor}$|O2               |1524.537|78.599|93.849|
-|120   |$0.3\times (1+\cos{\frac{E\pi}{120}})$|O2               |1522.120|80.048|94.799|
\ No newline at end of file
+|120   |$0.3\times (1+\cos{\frac{E\pi}{120}})$|O2               |1022.920|80.564|94.428|
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/run_2onnx.sh b/PyTorch/contrib/cv/classification/SENet154/test/run_2onnx.sh
index 949af6d740..38cec26b8f 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/run_2onnx.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/run_2onnx.sh
@@ -1,3 +1,3 @@
 source test/env_npu.sh
 
-python3.7.5 pth2onnx.py "$@" checkpoint-120.pth.rar
\ No newline at end of file
+python3.7 pth2onnx.py "$@" checkpoint-120.pth.rar
\ No newline at end of file
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_eval_8p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_eval_8p.sh
index 18aa841d70..5e47f1196c 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_eval_8p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_eval_8p.sh
@@ -75,7 +75,7 @@ kernel_num=$(($(nproc) / 8))
 pid_start=$((kernel_num * rank))
 pid_end=$((pid_start + kernel_num - 1))
 
-taskset -c $pid_start-$pid_end python3.7.5 train.py \
+taskset -c $pid_start-$pid_end python3.7 train.py \
     --distributed \
     --num-devices 8 \
     --local-rank $rank \
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
index e6e42752b8..d68cf96ec4 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
@@ -78,7 +78,7 @@ if [ x"${etp_flag}" != x"true" ];then
     source ${test_path_dir}/env_npu.sh
 fi
 kernel_num=$(($(nproc) / 8))
-python3.7.5 -u train.py "$@" \
+python3.7 -u train.py "$@" \
     --num-workers $kernel_num \
     --data $data_path \
     --device npu:$ASCEND_DEVICE_ID \
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_full_8p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_full_8p.sh
index de36d5a226..eedaa7aeb7 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_full_8p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_full_8p.sh
@@ -74,7 +74,7 @@ kernel_num=$(($(nproc) / 8))
 pid_start=$((kernel_num * rank))
 pid_end=$((pid_start + kernel_num - 1))
 
-taskset -c $pid_start-$pid_end python3.7.5 -u train.py "$@" \
+taskset -c $pid_start-$pid_end python3.7 -u train.py "$@" \
     --distributed \
     --num-devices 8 \
     --local-rank $rank \
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_performance_1p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_performance_1p.sh
index 27f4881c05..18c1844a5a 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_performance_1p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_performance_1p.sh
@@ -78,7 +78,7 @@ if [ x"${etp_flag}" != x"true" ];then
     source ${test_path_dir}/env_npu.sh
 fi
 kernel_num=$(($(nproc) / 8))
-python3.7.5 -u train.py "$@" \
+python3.7 -u train.py "$@" \
     --num-workers $kernel_num \
     --data $data_path \
     --device npu:$ASCEND_DEVICE_ID \
diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_performance_8p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_performance_8p.sh
index cc9e364d11..d857dd3d1b 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_performance_8p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_performance_8p.sh
@@ -74,7 +74,7 @@ kernel_num=$(($(nproc) / 8))
 pid_start=$((kernel_num * rank))
 pid_end=$((pid_start + kernel_num - 1))
 
-taskset -c $pid_start-$pid_end python3.7.5 -u train.py "$@" \
+taskset -c $pid_start-$pid_end python3.7 -u train.py "$@" \
     --distributed \
     --num-devices 8 \
     --local-rank $rank \
diff --git a/PyTorch/contrib/cv/classification/SENet154/train.py b/PyTorch/contrib/cv/classification/SENet154/train.py
index 02f9927ea6..2e5e5e2f7a 100755
--- a/PyTorch/contrib/cv/classification/SENet154/train.py
+++ b/PyTorch/contrib/cv/classification/SENet154/train.py
@@ -25,7 +25,8 @@ import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.parallel as par
 import torch.optim as optim
-
+if torch.__version__ >= '1.8.1':
+    import torch_npu
 sys.path.append('.')
 import checkpoint
 import data
@@ -80,11 +81,11 @@ def main_worker(args):
         pretrained_dict.pop('module.last_linear.weight')
         pretrained_dict.pop('module.last_linear.bias')
 
-        for param in net.parameters():
+        for param in model.parameters():
             param.requires_gard = False
 
-        net.last_linear = nn.Linear(2048, 1000)
-        net.load_state_dict(pretrained_dict, strict=False)
+        model.last_linear = nn.Linear(2048, 1000)
+        model.load_state_dict(pretrained_dict, strict=False)
 
     if args.label_smoothing_epsilon > 0.0:
         logging.info('Using label smoothing with epsilon = {:.3f}'.format(args.label_smoothing_epsilon))
@@ -126,11 +127,11 @@ def main_worker(args):
     _, val_loader = data.create_val_loader(
         model, val_dir, args, scale, distributed=args.distributed
     )
-
+
     model, optimizer = apex.amp.initialize(
         model, optimizer,
         opt_level=args.opt_level,
-        loss_scale=args.loss_scale
+        loss_scale=args.loss_scale if args.loss_scale != -1 else 'dynamic'
     )
     if args.distributed:
         model = par.DistributedDataParallel(model, device_ids=[args.local_rank])
@@ -196,8 +197,8 @@ def train(train_loader, model, criterion, optimizer, epoch_id, args):
         input = input.to(args.device, non_blocking=True)
         target = target.to(args.device, non_blocking=True)
         # compute output
-        output = model(input)
-        loss = criterion(output, target)
+        output = model(input)
+        loss = criterion(output, target)
         # measure accuracy and record loss
         prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
         losses.update(loss.data.item(), input.size(0))
@@ -419,8 +420,8 @@ def parse_args():
     parser.add_argument('--opt-level', default='O2', type=str,
                         choices=['O1', 'O2'],
                         help='optimization level of amp.initialize (default: \'O2\')')
-    parser.add_argument('--loss-scale', default=128, type=int,
-                        help='loss scale of amp.initialize (default: 128)')
+    parser.add_argument('--loss-scale', default=-1, type=int,
+                        help='loss scale of amp.initialize (default:128), -1 means dynamic')
 
     parser.add_argument('--epochs', default=100, type=int, metavar='N',
                         help='number of total epochs to run (default: 100)')
@@ -498,7 +499,6 @@ def parse_args():
 if __name__ == '__main__':
     args = parse_args()
     if 'npu' in args.device:
-        import torch.npu
         if args.distributed:
             os.environ['MASTER_ADDR'] = '127.0.0.1'
             os.environ['MASTER_PORT'] = '23333'
-- 
Gitee

From 8b5653bde116a1f9eb6b785c601f153889f6a8e1 Mon Sep 17 00:00:00 2001
From: WMC <846995292@qq.com>
Date: Wed, 1 Jun 2022 13:49:03 +0800
Subject: [PATCH 2/5] update

---
 PyTorch/contrib/cv/classification/SENet154/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/contrib/cv/classification/SENet154/train.py b/PyTorch/contrib/cv/classification/SENet154/train.py
index 2e5e5e2f7a..00fe6d17e5 100755
--- a/PyTorch/contrib/cv/classification/SENet154/train.py
+++ b/PyTorch/contrib/cv/classification/SENet154/train.py
@@ -131,7 +131,7 @@ def main_worker(args):
     model, optimizer = apex.amp.initialize(
         model, optimizer,
         opt_level=args.opt_level,
-        loss_scale=args.loss_scale if args.loss_scale != -1 else 'dynamic'
+        loss_scale='dynamic'
     )
     if args.distributed:
         model = par.DistributedDataParallel(model, device_ids=[args.local_rank])
-- 
Gitee
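The two patches above first make the AMP loss scale configurable (a `--loss-scale` of `-1` selects dynamic scaling) and then pin it to `'dynamic'` outright. Below is a minimal, self-contained sketch of that pattern — it is not the repository's code: the toy model, optimizer, and argument defaults are placeholders, and it assumes NVIDIA Apex is installed and a CUDA (or NPU) device is available.

```
# Sketch only: mapping a command-line --loss-scale of -1 to Apex's dynamic
# loss scaling, as PATCH 1/5 does; PATCH 2/5 simply hard-codes 'dynamic'.
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
from apex import amp  # assumes NVIDIA Apex is installed

parser = argparse.ArgumentParser()
parser.add_argument('--opt-level', default='O2', choices=['O1', 'O2'])
parser.add_argument('--loss-scale', default=-1, type=int,
                    help='static loss scale; -1 selects dynamic scaling')
args = parser.parse_args()

# Placeholder model/optimizer so the sketch stands alone; train.py builds SENet154 here.
model = nn.Linear(8, 2).to('cuda')
optimizer = optim.SGD(model.parameters(), lr=0.1)

# -1 maps to Apex's built-in dynamic loss scaling; any other value is used as a fixed scale.
loss_scale = 'dynamic' if args.loss_scale == -1 else float(args.loss_scale)
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level=args.opt_level,
                                  loss_scale=loss_scale)
```

Dynamic scaling lets Apex grow or shrink the scale automatically when overflows occur, which is why the final patch drops the fixed default of 128.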
From ceff51072d904fbb28ee851fda02c0edb5644cc1 Mon Sep 17 00:00:00 2001
From: WMC <846995292@qq.com>
Date: Wed, 1 Jun 2022 14:07:32 +0800
Subject: [PATCH 5/5] init

---
 .../contrib/cv/classification/SENet154/test/train_full_1p.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh b/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
index d68cf96ec4..1e695a4b06 100755
--- a/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
+++ b/PyTorch/contrib/cv/classification/SENet154/test/train_full_1p.sh
@@ -87,7 +87,7 @@ python3.7 -u train.py "$@" \
     --scheduler cosine \
     --label-smoothing-epsilon 0.1 \
     --batch-size $batch_size \
-    --print-freq 10 ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+    --print-freq 10 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
 wait
 
-- 
Gitee
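PATCH 1/5 also replaces the old `import torch.npu` with a version-gated `import torch_npu`, since Ascend PyTorch 1.8.1 and later ship the NPU backend as a separate package. The sketch below shows that import pattern in isolation; it is illustrative only — the `try/except` fallback and the device probe are additions not present in the patch, and it assumes `torch_npu`, where installed, registers the `npu` device with `torch` as it normally does.

```
# Sketch only: version-gated NPU import and device selection.
import torch

if torch.__version__ >= '1.8.1':       # string comparison, as in the patch
    try:
        import torch_npu  # noqa: F401  # registers the 'npu' device with torch
    except ImportError:
        pass                             # not an Ascend environment; fall back below

# Probe for an NPU; otherwise run the same code on CPU.
use_npu = hasattr(torch, 'npu') and torch.npu.is_available()
device = 'npu:0' if use_npu else 'cpu'

x = torch.randn(4, 3).to(device, non_blocking=True)
print(x.device)
```

Gating the import this way keeps a single train.py working on both the older Ascend builds (which expose `torch.npu` directly) and the newer ones that require the separate `torch_npu` package.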