diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md index a1ebd3cc45edc3535a118a534a620ad4cfd9feba..bfc2be29646e37d2873b4188956e703a2e24b73e 100644 --- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md @@ -128,6 +128,13 @@ CRNN (Convolutional Recurrent Neural Network) 于2015年由华中科技大学的 ``` 训练完成后,权重文件保存在当前路径下,并输出模型训练精度和性能信息。 + + - 在线推理 + 启动在线推理。 + 首先在./LMDB_config.yaml 中,将TRAIN.RESUME.IS_RESUME的值改为True,并为FILE指定一个pth权重 + ``` + bash ./test/train_eval_1p.sh --data_path=/data/crnn_data #在线推理,data_path根据真实数据集路径修改 + ``` # 训练结果展示 diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py index 62d8e7df640b42675face970e40d310974b8bfee..4026a74300e3aa30d6062711ff0fe207a4b16f41 100644 --- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py +++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py @@ -59,6 +59,8 @@ def parse_arg(): parser.add_argument('--start_step', default=0, type=int, help='start_step') parser.add_argument('--stop_step', default=1000, type=int,help='stop_step') parser.add_argument('--ND', type=ast.literal_eval, default=False, help="enable nd compile") + parser.add_argument('--evaluate', dest='evaluate', type=ast.literal_eval, default=False, help="this is evaluate") + parser.add_argument('--batch_size', default=2560, type=int, help='size of each batch') args = parser.parse_args() with open(args.cfg, 'r') as f: config = yaml.safe_load(f) @@ -68,6 +70,10 @@ def parse_arg(): def main(): + # Add the DynamicRNN operator to the NPU fuzzy-compile blacklist + option = {} + option['NPU_FUZZY_COMPILE_BLACKLIST'] = "DynamicRNN" + torch.npu.set_option(option) # load config config, args = parse_arg() if args.ND: @@ -149,7 +155,7 @@ def main(): checkpoint = torch.load(model_state_file, map_location=device) if 'state_dict' in checkpoint.keys(): last_epoch = checkpoint['epoch'] - 
model.load_state_dict(checkpoint['state_dict']) + model.load_state_dict({(k[len('module.'):] if k.startswith('module.') else k): v for k, v in checkpoint['state_dict'].items()}) best_acc = checkpoint['best_acc'] optimizer.load_state_dict(checkpoint['optimizer']) if config.TRAIN.AMP: @@ -193,6 +199,11 @@ def main(): converter = utils.strLabelConverter(config.DATASET.ALPHABETS) if config.DISTRIBUTED.RANK % npus_per_node == 0: checkpoint_dir, log_dir = utils.create_output_folder(config) + + if args.evaluate: + acc = validate(config, val_loader, val_dataset, converter, model, criterion, device, 1) + return + for epoch in range(last_epoch, config.TRAIN.END_EPOCH): train(config, train_loader, train_dataset, converter, model, criterion, optimizer, device, epoch, npus_per_node, npu) @@ -294,6 +305,8 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e model.eval() n_correct = 0 n_total = 0 + batch_num = len(val_loader) + start = time.time() with torch.no_grad(): for i, (inp, idx) in enumerate(val_loader): labels = idx @@ -317,6 +330,8 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e n_correct += 1 if (i + 1) % config.PRINT_FREQ == 0: print('Epoch: [{0}][{1}/{2}]'.format(epoch, i, len(val_loader))) + cost = time.time() - start + print("Time:", cost / batch_num) raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:config.TEST.NUM_TEST_DISP] for raw_pred, pred, gt in zip(raw_preds, sim_preds, labels): print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt)) diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..bea387cc8af9453cf049c728bffa6c2cac8c2bef --- /dev/null +++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh @@ -0,0 +1,131 @@ +#!/bin/bash + + +# Collective-communication parameter; no need to modify +export RANK_SIZE=1 + +# Dataset path; leave empty here, no need to modify +data_path="" + 
+# Network name, same as the directory name; review per model +Network="CRNN_for_PyTorch" + +# Training batch_size; review per model +batch_size=2560 + +# NPU device card id used for training +device_id=0 +# Argument parsing: data_path is required; other parameters are added or removed per model; any parameter added here must be defined and assigned above +for para in "$@" +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + epochs=`echo ${para#*=}` + elif [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi +done + +# Check that data_path was passed; no need to modify +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi +# Check that a device id is set, either dynamically assigned (ASCEND_DEVICE_ID) or manually specified (device_id); no need to modify +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi +############### Set the execution path of the training script ############### +# cd to the directory that holds the test folder before running, for compatibility; test_path_dir is the path of the test folder +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +# Create the per-device output directory; no need to modify +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/$ASCEND_DEVICE_ID + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + +################# Launch the training script ################# +# Start time; no need to modify +start_time=$(date +%s) +# Source the environment variables when etp_running_flag is not "true" (non-platform scenario) +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +# Write the required parameters into the config file +cur_path=`pwd` +sed -i "0,/BATCH_SIZE_PER_GPU.*$/s//BATCH_SIZE_PER_GPU\: ${batch_size}/g" ${cur_path}/LMDB_config.yaml +if [ -n "${epochs}" ];then sed -i "s/END_EPOCH.*$/END_EPOCH\: ${epochs}/g" ${cur_path}/LMDB_config.yaml; fi # skip when --epochs was not passed, so END_EPOCH is not overwritten with an empty value +sed -i "s|TRAIN_ROOT.*$|TRAIN_ROOT\: ${data_path}/MJ_LMDB|g" ${cur_path}/LMDB_config.yaml +sed -i "s|TEST_ROOT.*$|TEST_ROOT\: ${data_path}/IIIT5K_lmdb|g" ${cur_path}/LMDB_config.yaml +sed -i "s/DEVICE_ID.*$/DEVICE_ID\: ${device_id}/g" ${cur_path}/LMDB_config.yaml + +python3 main_8p.py \ + --cfg LMDB_config.yaml \ + --evaluate True \ + --npu $ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 & +wait + +################## Collect the run data ################ +# End time; no need to modify +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +# Print results; no need to modify +echo "------------------ Final result ------------------" +# Performance in FPS, derived from the last 'Time:' line of the log; review per model +FPS=`grep -a 'Time:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log | tail -n 1 | awk '{print$2}' | awk '{sum+=$1} END {print ('$batch_size')/(sum/NR)}'` +FPS=${FPS#* } # drop anything up to and including the first space +# Print; no need to modify +echo "Final Performance images/sec : $FPS" + +# Accuracy, taken from the last 'best acc is:' line of the log; review per model +eval_accuracy=`grep -a 'best acc is:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|awk 'END{print}'|awk -F " " '{print $NF}'` +# Print; no need to modify +echo "Final Train Accuracy : ${eval_accuracy}" +echo "E2E Training Duration sec : 
$e2e_time" + +# Summary of results for stability/accuracy monitoring +# Case information; no need to modify +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +# Extract Loss values from test_$ASCEND_DEVICE_ID.log into test_${CaseName}_loss.txt; review per model +grep -a 'Loss' ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_$ASCEND_DEVICE_ID.log|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_${CaseName}_loss.txt +# Loss of the last iteration; no need to modify +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_${CaseName}_loss.txt` + +# Write the key information to ${CaseName}.log; no need to modify +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log # NOTE(review): TrainingTime is never assigned in this script, so this field is written empty - confirm the intended definition +echo "EvalAccuracy = ${eval_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log