From 18899f33e08ac1a76bf65c1bc9db1868b3e5b7b2 Mon Sep 17 00:00:00 2001
From: tangyunxiang <546783735@qq.com>
Date: Mon, 17 Jul 2023 19:41:43 +0800
Subject: [PATCH 1/5] =?UTF-8?q?=E6=96=B0=E5=A2=9EREADME.md=E7=9A=84?=
 =?UTF-8?q?=E5=9C=A8=E7=BA=BF=E6=8E=A8=E7=90=86=E6=8C=87=E5=AF=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/cv/classification/CRNN_for_PyTorch/README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md
index a1ebd3cc45..bfc2be2964 100644
--- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md
+++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/README.md
@@ -128,6 +128,13 @@ CRNN (Convolutional Recurrent Neural Network) 于2015年由华中科技大学的
      ```
 
    训练完成后,权重文件保存在当前路径下,并输出模型训练精度和性能信息。
+
+   - 在线推理
+     启动在线推理。
+     首先在./LMDB_config.yaml 中,将TRAIN.RESUME.IS_RESUME的值改为True,并为FILE指定一个pth权重
+     ```
+     bash ./test/train_eval_1p.sh --data_path=/data/crnn_data #在线推理,data_path根据真实数据集路径修改
+     ```
 
 # 训练结果展示
 
-- 
Gitee


From 5ef273333fda1d28fe115e106ef20ef9e1e2ccc5 Mon Sep 17 00:00:00 2001
From: tangyunxiang <546783735@qq.com>
Date: Mon, 17 Jul 2023 19:42:47 +0800
Subject: [PATCH 2/5] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=9C=A8=E7=BA=BF?=
 =?UTF-8?q?=E6=8E=A8=E7=90=86=E5=88=86=E6=94=AF=EF=BC=8C=E4=BB=A5=E5=8F=8A?=
 =?UTF-8?q?=E8=AE=B0=E5=BD=95=E8=AE=A1=E7=AE=97FPS=E6=89=80=E5=BF=85?=
 =?UTF-8?q?=E9=A1=BB=E7=9A=84=E6=97=B6=E9=97=B4=E6=89=93=E7=82=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (not part of the commit message):
- --evaluate is now parsed with ast.literal_eval and defaults to False, the
  same way --ND is parsed, so "--evaluate False" is no longer a truthy string.
- The added code comment is translated to English.
- Line structure and indentation were reconstructed from a whitespace-collapsed
  copy; the blob ids on the "index" line predate these edits.

 .../CRNN_for_PyTorch/main_8p.py               | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
index 62d8e7df64..8ad1b02e27 100644
--- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
+++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
@@ -59,6 +59,8 @@ def parse_arg():
     parser.add_argument('--start_step', default=0, type=int, help='start_step')
     parser.add_argument('--stop_step', default=1000, type=int,help='stop_step')
     parser.add_argument('--ND', type=ast.literal_eval, default=False, help="enable nd compile")
+    parser.add_argument('--evaluate', dest='evaluate', type=ast.literal_eval, default=False, help="this is evaluate")
+    parser.add_argument('--batch_size', default=2560, type=int, help='size of each batch')
     args = parser.parse_args()
     with open(args.cfg, 'r') as f:
         config = yaml.safe_load(f)
@@ -68,6 +70,10 @@ def parse_arg():
 
 
 def main():
+    # Add the DynamicRNN operator to the NPU fuzzy-compile blacklist
+    option = {}
+    option['NPU_FUZZY_COMPILE_BLACKLIST'] = "DynamicRNN"
+    torch.npu.set_option(option)
     # load config
     config, args = parse_arg()
     if args.ND:
@@ -149,7 +155,7 @@ def main():
         checkpoint = torch.load(model_state_file, map_location=device)
         if 'state_dict' in checkpoint.keys():
             last_epoch = checkpoint['epoch']
-            model.load_state_dict(checkpoint['state_dict'])
+            model.load_state_dict({k.replace('module.',''):v for k,v in checkpoint['state_dict'].items()})
             best_acc = checkpoint['best_acc']
             optimizer.load_state_dict(checkpoint['optimizer'])
             if config.TRAIN.AMP:
@@ -193,6 +199,11 @@ def main():
     converter = utils.strLabelConverter(config.DATASET.ALPHABETS)
     if config.DISTRIBUTED.RANK % npus_per_node == 0:
         checkpoint_dir, log_dir = utils.create_output_folder(config)
+
+    if args.evaluate:
+        acc = validate(config, val_loader, val_dataset, converter, model, criterion, device, 1)
+        return
+
     for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
         train(config, train_loader, train_dataset, converter, model, criterion,
               optimizer, device, epoch, npus_per_node, npu)
@@ -295,6 +306,9 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e
     n_correct = 0
     n_total = 0
     with torch.no_grad():
+        time_sum = 0
+        start = time.time()
+        batch_num = 0
         for i, (inp, idx) in enumerate(val_loader):
             labels = idx
             inp = inp.to(device)
@@ -317,6 +331,12 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e
                     n_correct += 1
             if (i + 1) % config.PRINT_FREQ == 0:
                 print('Epoch: [{0}][{1}/{2}]'.format(epoch, i, len(val_loader)))
+            cost = time.time() - start
+            time_sum += cost
+            batch_num += 1
+            start = time.time()
+        print("batch_num", batch_num)
+        print("Time:", time_sum / batch_num)
     raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:config.TEST.NUM_TEST_DISP]
     for raw_pred, pred, gt in zip(raw_preds, sim_preds, labels):
         print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
-- 
Gitee


From 892970080716279705b006cf7be01d5783c6f54b Mon Sep 17 00:00:00 2001
From: tangyunxiang <546783735@qq.com>
Date: Mon, 17 Jul 2023 19:43:13 +0800
Subject: [PATCH 3/5] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=9C=A8=E7=BA=BF?=
 =?UTF-8?q?=E6=8E=A8=E7=90=86=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes on train_eval_1p.sh (not part of the commit message):
- The "device id must be config" message is now echoed; it used to be executed
  as a command name.
- END_EPOCH is rewritten only when --epochs is given; before, an unset
  ${epochs} blanked the value in LMDB_config.yaml.
- The summary log takes ActualFPS from the computed ${FPS}; ${ActualFPS} was
  never assigned.
- Arguments are iterated with "$@" so they are not re-split on whitespace.
- Script comments are translated to English. All edits stay on their original
  lines, so the script is still 131 lines long.
- ${TrainingTime} is still never assigned in this script and prints empty.
- The blob id on the "index" line predates these edits.

 .../CRNN_for_PyTorch/test/train_eval_1p.sh    | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh

diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh
new file mode 100644
index 0000000000..bea387cc8a
--- /dev/null
+++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/test/train_eval_1p.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+
+# Collective-communication parameter; no need to modify
+export RANK_SIZE=1
+
+# Dataset path; leave empty here, no need to modify
+data_path=""
+
+# Network name, same as the directory name; review and adjust per model
+Network="CRNN_for_PyTorch"
+
+# Training batch_size; review and adjust per model
+batch_size=2560
+
+# ID of the NPU device card used for training
+device_id=0
+# Argument parsing: data_path is required, other arguments are model-specific; any argument added here must be defined and given a value above
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --epochs* ]];then
+        epochs=`echo ${para#*=}`
+    elif [[ $para == --device_id* ]];then
+        device_id=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be confing"
+    exit 1
+fi
+# Check that a device id is set, either dynamically assigned (ASCEND_DEVICE_ID) or manually specified (device_id); no need to modify
+if [ $ASCEND_DEVICE_ID ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ ${device_id} ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    echo "[Error] device id must be config"
+    exit 1
+fi
+############### Set the training script execution path ###############
+# cd to the directory that contains the test folder before running, for compatibility; test_path_dir is the path of the test folder itself
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+
+# Create the per-device output directory; no need to modify
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/$ASCEND_DEVICE_ID
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source the environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+# Substitute the required parameters into the config file
+cur_path=`pwd`
+sed -i "0,/BATCH_SIZE_PER_GPU.*$/s//BATCH_SIZE_PER_GPU\: ${batch_size}/g" ${cur_path}/LMDB_config.yaml
+if [ -n "${epochs}" ];then sed -i "s/END_EPOCH.*$/END_EPOCH\: ${epochs}/g" ${cur_path}/LMDB_config.yaml; fi
+sed -i "s|TRAIN_ROOT.*$|TRAIN_ROOT\: ${data_path}/MJ_LMDB|g" ${cur_path}/LMDB_config.yaml
+sed -i "s|TEST_ROOT.*$|TEST_ROOT\: ${data_path}/IIIT5K_lmdb|g" ${cur_path}/LMDB_config.yaml
+sed -i "s/DEVICE_ID.*$/DEVICE_ID\: ${device_id}/g" ${cur_path}/LMDB_config.yaml
+
+python3 main_8p.py \
+    --cfg LMDB_config.yaml \
+    --evaluate True \
+    --npu $ASCEND_DEVICE_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 &
+wait
+
+################## Collect training data ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print the results; no need to modify
+echo "------------------ Final result ------------------"
+# Report performance (FPS); review and adjust per model
+FPS=`grep -a 'Time:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log | tail -n 1 | awk '{print$2}' | awk '{sum+=$1} END {print ('$batch_size')/(sum/NR)}'`
+FPS=${FPS#* } # strip everything up to and including the first space
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Report accuracy; review and adjust per model
+eval_accuracy=`grep -a 'best acc is:' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|awk 'END{print}'|awk -F " " '{print $NF}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${eval_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of results for stability/accuracy monitoring
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+# Extract Loss from test_$ASCEND_DEVICE_ID.log into test_${CaseName}_loss.txt; review and adjust per model
+grep -a 'Loss' ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_$ASCEND_DEVICE_ID.log|awk -F "Loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_${CaseName}_loss.txt
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/test_${CaseName}_loss.txt`
+
+# Write key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "EvalAccuracy = ${eval_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
-- 
Gitee


From 0fec141febc8aa5bbe8a5e794b005b2621607318 Mon Sep 17 00:00:00 2001
From: tangyunxiang <546783735@qq.com>
Date: Tue, 18 Jul 2023 15:19:32 +0800
Subject: [PATCH 4/5] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AE=A1=E7=AE=97FPS?=
 =?UTF-8?q?=E4=B8=AD=E7=9A=84=E6=97=B6=E9=97=B4=E6=89=93=E7=82=B9=E6=96=B9?=
 =?UTF-8?q?=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../cv/classification/CRNN_for_PyTorch/main_8p.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
index 8ad1b02e27..129aae6d9b 100644
--- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
+++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
@@ -305,10 +305,9 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e
     model.eval()
     n_correct = 0
     n_total = 0
+    batch_num = floor(len(val_loader) / val_loader.batch_size)
+    start = time.time()
     with torch.no_grad():
-        time_sum = 0
-        start = time.time()
-        batch_num = 0
         for i, (inp, idx) in enumerate(val_loader):
             labels = idx
             inp = inp.to(device)
@@ -331,12 +330,8 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e
                     n_correct += 1
             if (i + 1) % config.PRINT_FREQ == 0:
                 print('Epoch: [{0}][{1}/{2}]'.format(epoch, i, len(val_loader)))
-            cost = time.time() - start
-            time_sum += cost
-            batch_num += 1
-            start = time.time()
-        print("batch_num", batch_num)
-        print("Time:", time_sum / batch_num)
+        cost = time.time() - start
+        print("Time:", cost / batch_num)
     raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:config.TEST.NUM_TEST_DISP]
     for raw_pred, pred, gt in zip(raw_preds, sim_preds, labels):
         print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
-- 
Gitee


From 5ebfc754834f8c436752c6b1dfa60f81f43334d5 Mon Sep 17 00:00:00 2001
From: tangyunxiang <546783735@qq.com>
Date: Tue, 18 Jul 2023 18:50:10 +0800
Subject: [PATCH 5/5] =?UTF-8?q?=E4=BF=AE=E6=94=B9batch=5Fnum=E7=9A=84?=
 =?UTF-8?q?=E8=AE=A1=E7=AE=97=E6=96=B9=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
index 129aae6d9b..4026a74300 100644
--- a/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
+++ b/PyTorch/built-in/cv/classification/CRNN_for_PyTorch/main_8p.py
@@ -305,7 +305,7 @@ def validate(config, val_loader, dataset, converter, model, criterion, device, e
     model.eval()
     n_correct = 0
     n_total = 0
-    batch_num = floor(len(val_loader) / val_loader.batch_size)
+    batch_num = len(val_loader)
     start = time.time()
     with torch.no_grad():
         for i, (inp, idx) in enumerate(val_loader):
-- 
Gitee