diff --git a/PyTorch/contrib/cv/pose_estimation/VoxelPose/README.md b/PyTorch/contrib/cv/pose_estimation/VoxelPose/README.md index 4974cf07a4a0235780351f2250b8cd965441a792..fdd1a2623a325b5eac2015f7c4e0b6120b1bb4dd 100644 --- a/PyTorch/contrib/cv/pose_estimation/VoxelPose/README.md +++ b/PyTorch/contrib/cv/pose_estimation/VoxelPose/README.md @@ -18,15 +18,10 @@ Set AscendProject = /usr/local/Ascend #use real path ### Training To train a model, run the following scripts with the real path of dataset: +PS: Single-card training with bs=1 shows large jitter, so do not run single-card training. ```bash # real_data_path = data/shelf -# training 1p accuracy -bash ./test/train_full_1p.sh --data_path=real_data_path --device_id=xxx - -# training 1p performance -bash ./test/train_performance_1p.sh --data_path=real_data_path --device_id=xxx - # training 8p accuracy bash ./test/train_full_8p.sh --data_path=real_data_path @@ -43,8 +38,8 @@ Log path: ### VoxelPose training result +说明:由于模型单卡训练(bs=1)抖动大,loss不收敛,暂不支持单卡训练。 | 名称 | 精度 | FPS | AMP_Type | | :------: | :------: | :------: | :------: | -| NPU-1p | 96.62 | 0.289 | O1 | | NPU-8p | 97.10 | 1.267 | O1 | diff --git a/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_full_1p.sh b/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_full_1p.sh deleted file mode 100644 index f9fb187754efe808d752e6bf107102883b19f5b2..0000000000000000000000000000000000000000 --- a/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_full_1p.sh +++ /dev/null @@ -1,150 +0,0 @@ -# data_path="data/Shelf" -# -# for para in $* -# do -# if [[ $para == --data_path* ]]; then -# data_path=`echo ${para#*=}` -# fi -# done -# -# echo ${data_path} -# -# python run/train_3d.py \ -# --cfg configs/shelf/prn64_cpn80x80x20.yaml \ -# --data_path ${data_path} \ -# --num_epochs 1 \ -# --apex - -#!/bin/bash - -################基础配置参数,需要模型审视修改################## -# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE -# 网络名称,同目录名称 -Network="VoxelPose" -# 
训练batch_size -batch_size=1 -# 训练使用的npu卡数 -export RANK_SIZE=1 -# 数据集路径,保持为空,不需要修改 -data_path="" - -# 训练epoch -num_epochs=30 -# 指定训练所使用的npu device卡id -device_id=0 - - -# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 -for para in $* -do - if [[ $para == --device_id* ]];then - device_id=`echo ${para#*=}` - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -# 校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi -# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 -if [ $ASCEND_DEVICE_ID ];then - echo "device id is ${ASCEND_DEVICE_ID}" -elif [ ${device_id} ];then - export ASCEND_DEVICE_ID=${device_id} - echo "device id is ${ASCEND_DEVICE_ID}" -else - "[Error] device id must be config" - exit 1 -fi - - - -###############指定训练脚本执行路径############### -# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 -cur_path=`pwd` -cur_path_last_dirname=${cur_path##*/} -if [ x"${cur_path_last_dirname}" == x"test" ];then - test_path_dir=${cur_path} - cd .. 
- cur_path=`pwd` -else - test_path_dir=${cur_path}/test -fi - - -#################创建日志输出目录,不需要修改################# -if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID -else - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID -fi - - -#################启动训练脚本################# -#训练开始时间,不需要修改 -start_time=$(date +%s) -# 非平台场景时source 环境变量 -check_etp_flag=`env | grep etp_running_flag` -etp_flag=`echo ${check_etp_flag#*=}` -if [ x"${etp_flag}" != x"true" ];then - source ${test_path_dir}/env_npu.sh -fi -python3.7 run/train_3d.py \ - --cfg configs/shelf/prn64_cpn80x80x20.yaml \ - --data_path ${data_path} \ - --addr=$(hostname -I |awk '{print $1}') \ - --num_epochs ${num_epochs} \ - --apex > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -wait - -##################获取训练数据################ -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS@all' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "FPS@all " '{print $2}'| awk -F "," '{print $1}'| awk 'END {print}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep -a 'model best precision is' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $5}'` - -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -a 'Loss' 
${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "(" '{print $4}'| awk -F ")" '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_performance_1p.sh b/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_performance_1p.sh deleted file mode 100644 index 325a58e88a7f6e98fc098b3c45424bb2906bda89..0000000000000000000000000000000000000000 --- a/PyTorch/contrib/cv/pose_estimation/VoxelPose/test/train_performance_1p.sh +++ /dev/null @@ -1,151 +0,0 @@ -# data_path="data/Shelf" -# -# for para in $* -# do -# if [[ $para == --data_path* ]]; then -# data_path=`echo ${para#*=}` -# fi -# done -# -# echo ${data_path} -# -# python run/train_3d.py \ -# --cfg configs/shelf/prn64_cpn80x80x20.yaml \ -# --data_path ${data_path} \ -# --num_epochs 1 \ -# --apex - -#!/bin/bash - -################基础配置参数,需要模型审视修改################## -# 必选字段(必须在此处定义的参数): Network batch_size 
RANK_SIZE -# 网络名称,同目录名称 -Network="VoxelPose" -# 训练batch_size -batch_size=1 -# 训练使用的npu卡数 -export RANK_SIZE=1 -# 数据集路径,保持为空,不需要修改 -data_path="" - -# 训练epoch -num_epochs=1 -# 指定训练所使用的npu device卡id -device_id=0 - - -# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 -for para in $* -do - if [[ $para == --device_id* ]];then - device_id=`echo ${para#*=}` - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -# 校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi -# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 -if [ $ASCEND_DEVICE_ID ];then - echo "device id is ${ASCEND_DEVICE_ID}" -elif [ ${device_id} ];then - export ASCEND_DEVICE_ID=${device_id} - echo "device id is ${ASCEND_DEVICE_ID}" -else - "[Error] device id must be config" - exit 1 -fi - - - -###############指定训练脚本执行路径############### -# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 -cur_path=`pwd` -cur_path_last_dirname=${cur_path##*/} -if [ x"${cur_path_last_dirname}" == x"test" ];then - test_path_dir=${cur_path} - cd .. 
- cur_path=`pwd` -else - test_path_dir=${cur_path}/test -fi - - -#################创建日志输出目录,不需要修改################# -if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID -else - mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID -fi - - -#################启动训练脚本################# -#训练开始时间,不需要修改 -start_time=$(date +%s) -# 非平台场景时source 环境变量 -check_etp_flag=`env | grep etp_running_flag` -etp_flag=`echo ${check_etp_flag#*=}` -if [ x"${etp_flag}" != x"true" ];then - source ${test_path_dir}/env_npu.sh -fi -python3.7 run/train_3d.py \ - --cfg configs/shelf/prn64_cpn80x80x20.yaml \ - --data_path ${data_path} \ - --addr=$(hostname -I |awk '{print $1}') \ - --num_epochs ${num_epochs} \ - --apex > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -wait - - -##################获取训练数据################ -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -FPS=`grep -a 'FPS@all' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "FPS@all " '{print $2}'| awk -F "," '{print $1}'| awk 'END {print}'` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep -a 'model best precision is' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $5}'` - -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -a 'Loss' 
${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "(" '{print $4}'| awk -F ")" '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log