diff --git "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/README.md" "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/README.md"
index e504fa9f28690c0519334a088035f61c7167602b..d7ddc8a4b3722af9fe0e43770f8448fd43f46669 100644
--- "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/README.md"
+++ "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/README.md"
@@ -30,6 +30,12 @@ bash ./test/train_full_8p.sh --data_path=real_data_path
 
 # training 8p performance
 bash ./test/train_performance_8p.sh --data_path=real_data_path
+
+# evaluating 8p accuracy
+bash test/train_eval_8p.sh --data_path=real_data_path --pth_path=real_pre_train_model_path
+
+# finetuning 1p
+bash test/train_finetune_1p.sh --data_path=real_data_path --pth_path=real_pre_train_model_path
 ```
 
 Log path:
diff --git "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/main.py" "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/main.py"
index a99b07efc19ac24399e8c4b512e67653d42a4315..32cb29fefc5587590915ef112ffcb8cf2e04b2bc 100644
--- "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/main.py"
+++ "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/main.py"
@@ -65,6 +65,8 @@ parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                     help='evaluate model on validation set')
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
+parser.add_argument('--pth_path', default='', type=str, metavar='PATH',
+                    help='path to pretrained checkpoint (default: none)')
 parser.add_argument('--world-size', default=-1, type=int,
                     help='number of nodes for distributed training')
 parser.add_argument('--rank', default=-1, type=int,
@@ -82,7 +84,7 @@ parser.add_argument('--multiprocessing-distributed', action='store_true',
                          'N processes per node, which has N GPUs. This is the '
                          'fastest way to use PyTorch for either single node or '
                          'multi node data parallel training')
-parser.add_argument('--num-classes', default=1000, type=int,
+parser.add_argument('--num_classes', default=1000, type=int,
                     help='The number of classes.')
 ## for ascend 910
 parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
@@ -190,9 +192,14 @@ def main_worker(gpu, ngpus_per_node, args):
         print("=> using pre-trained model wide_resnet50_2")
         model = resnet_0_6_0.wide_resnet50_2(num_classes=args.num_classes)
         print("loading model of yours...")
-        pretrained_dict = torch.load("./model_best.pth.tar", map_location="cpu")["state_dict"]
-        model.load_state_dict({k.replace('module.',''):v for k, v in pretrained_dict.items()})
-        pretrained_dict.pop('fc.weight')
-        pretrained_dict.pop('fc.bias')
+        if args.pth_path:
+            print("=> loading checkpoint from --pth_path")
+            pretrained_dict = torch.load(args.pth_path, map_location="cpu")["state_dict"]
+        else:
+            pretrained_dict = torch.load("./model_best.pth.tar", map_location="cpu")["state_dict"]
+        if "fc.weight" in pretrained_dict:
+            print("=> popping fc layer weights from checkpoint")
+            pretrained_dict.pop('fc.weight')
+            pretrained_dict.pop('fc.bias')
         model.load_state_dict(pretrained_dict, strict=False)
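The main_worker change above is the standard head-swap pattern for finetuning: load a backbone checkpoint, drop the classifier weights whose shape no longer matches the new `num_classes`, and load the rest non-strictly. Below is a minimal self-contained sketch of the same pattern; torchvision's `wide_resnet50_2` stands in for the repo's `resnet_0_6_0` variant, and `checkpoint.pth.tar` is a hypothetical path:

```python
# Sketch of the head-swap load used in main.py; assumes a checkpoint saved
# as {"state_dict": ...}, possibly produced under DistributedDataParallel.
import torch
from torchvision.models import wide_resnet50_2

model = wide_resnet50_2(num_classes=1200)  # new classifier head

state = torch.load("checkpoint.pth.tar", map_location="cpu")["state_dict"]
# Strip the "module." prefix that DistributedDataParallel adds.
state = {k.replace("module.", "", 1): v for k, v in state.items()}
# Drop the old head: its shape (e.g. 1000 classes) no longer matches.
state.pop("fc.weight", None)
state.pop("fc.bias", None)
# strict=False tolerates the now-missing fc.* keys.
missing, unexpected = model.load_state_dict(state, strict=False)
print("missing keys:", missing)        # expect only fc.weight / fc.bias
print("unexpected keys:", unexpected)  # expect []
```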
diff --git "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_eval_8p.sh" "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_eval_8p.sh"
index 7f09f2fbfe86ed7dd5925d55228290cfb1d9dc1f..d34f47b5e08af2face35096ec34a07adacfbcd14 100644
--- "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_eval_8p.sh"
+++ "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_eval_8p.sh"
@@ -8,11 +8,10 @@ Network="WideResNet50_2_ID1627_for_PyTorch"
 batch_size=4096
 # number of NPUs used for training
 export RANK_SIZE=8
-# checkpoint file path; set to the actual path
-resume=/home/checkpoint.pth.tar
 # dataset path; keep empty, no modification needed
 data_path=""
-
+# checkpoint file path; set to the actual path
+pth_path=""
 # training epochs
 train_epochs=200
 # learning rate
@@ -28,6 +27,8 @@ do
         workers=`echo ${para#*=}`
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[ $para == --pth_path* ]];then
+        pth_path=`echo ${para#*=}`
     fi
 done
 
@@ -37,6 +38,11 @@ if [[ $data_path == "" ]];then
     exit 1
 fi
 
+# Verify that pth_path was passed in; the evaluation script requires this parameter
+if [[ $pth_path == "" ]];then
+    echo "[Error] para \"pth_path\" must be configured"
+    exit 1
+fi
 
 ############### Specify the execution path of the training script ###############
 # cd to the directory at the same level as the test folder to run the script, for better compatibility; test_path_dir is the path containing the test folder
@@ -73,7 +79,7 @@ fi
 python3.7 ./main.py \
     ${data_path} \
     --evaluate \
-    --resume ${resume} \
+    --resume ${pth_path} \
     --addr=$(hostname -I |awk '{print $1}') \
     --seed=49 \
     --workers=${workers} \
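With this change, train_eval_8p.sh forwards the user-supplied checkpoint to main.py as `--resume` alongside `--evaluate`. In the upstream PyTorch ImageNet example that this main.py is derived from, that combination restores the full checkpoint and then runs a single validation pass; the sketch below mirrors that upstream flow and is an assumption about this repo's internals (the `validate` callable and the checkpoint layout come from the upstream example):

```python
import torch

def resume_then_evaluate(args, model, val_loader, criterion, validate):
    # Restore the full training checkpoint (weights plus bookkeeping
    # such as the epoch counter and best top-1 accuracy).
    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model.load_state_dict(checkpoint["state_dict"])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint.get("epoch", "?")))
    # --evaluate short-circuits training: one pass over the validation
    # set, then return without entering the epoch loop.
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
```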
diff --git "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_finetune_1p.sh" "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_finetune_1p.sh"
index 30dac86bf9162a5421cb525f0d8363fec42009f0..1821a33a110e974026c47fa537ce66ef5bdbace3 100644
--- "a/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_finetune_1p.sh"
+++ "b/Pytorch\350\256\255\347\273\203\347\244\272\344\276\213/WideResNet50_2_ID1627_for_PyTorch/test/train_finetune_1p.sh"
@@ -10,7 +10,8 @@ batch_size=256
 export RANK_SIZE=1
 # dataset path; keep empty, no modification needed
 data_path=""
-
+# checkpoint file path; set to the actual path
+pth_path=""
 # training epochs
 train_epochs=200
 # NPU device id to use for training
@@ -25,6 +26,8 @@ do
         device_id=`echo ${para#*=}`
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[ $para == --pth_path* ]];then
+        pth_path=`echo ${para#*=}`
     fi
 done
 
@@ -33,6 +36,13 @@ if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be configured"
     exit 1
 fi
+
+# Verify that pth_path was passed in; the finetuning script requires this parameter
+if [[ $pth_path == "" ]];then
+    echo "[Error] para \"pth_path\" must be configured"
+    exit 1
+fi
+
 # Check whether device_id was specified; covers both dynamically allocated and manually specified device_id, no modification needed here
 if [ $ASCEND_DEVICE_ID ];then
     echo "device id is ${ASCEND_DEVICE_ID}"
@@ -95,6 +105,7 @@ python3.7 ./main.py \
     --loss-scale=32 \
     --amp \
     --pretrained \
+    --pth_path=${pth_path} \
    --num_classes=1200 \
    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
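A note on the `--num-classes` → `--num_classes` rename that these scripts rely on: argparse already converts dashes to underscores when deriving the attribute name, so the Python code reads `args.num_classes` under either spelling; the rename only changes the string accepted on the command line, which is why the launch scripts must pass `--num_classes=1200`. A quick self-contained illustration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--num_classes', default=1000, type=int,
                    help='The number of classes.')

# The parsed attribute is args.num_classes either way; only the
# command-line spelling changed with the rename.
args = parser.parse_args(['--num_classes', '1200'])
print(args.num_classes)  # 1200

# The old dashed spelling is now rejected:
# parser.parse_args(['--num-classes', '1200'])  # error: unrecognized arguments
```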