diff --git a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh index c2f267ec784e86df28e99409d649239e79e582bc..8d3705f90b22c40b216839a35256351d6edb621a 100644 --- a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh +++ b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh @@ -1,8 +1,24 @@ # 网络名称,同目录名称,需要模型审视修改 -Network="blip2" +Network="blip2_ID4380_for_pytorch" batch_size=16 world_size=8 +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + # cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=$(pwd) cur_path_last_dirname=${cur_path##*/} @@ -15,9 +31,19 @@ else fi source ${test_path_dir}/env_npu.sh - +device_id=0 +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi #创建DeviceID输出目录,不需要修改 -output_path=${cur_path}/test/output/ +output_path=${cur_path}/test/output/${ASCEND_DEVICE_ID} if [ -d ${output_path} ]; then rm -rf ${output_path} @@ -25,12 +51,15 @@ fi mkdir -p ${output_path} +# 数据预处理 +ln -s /npu/traindata/blip2_dataset/facebook facebook +ln -s /npu/traindata/blip2_dataset/bert-base-uncased bert-base-uncased #训练开始时间,不需要修改 start_time=$(date +%s) echo "start_time: ${start_time}" python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip2/train/caption_coco_ft_performance.yaml \ - > ${test_path_dir}/output/train_performance_8p_blip2_caption_coco_opt2.7b_ft.log 2>&1 & + > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_0.log 2>&1 & wait @@ -63,18 +92,18 @@ echo "E2E Training Duration sec : $e2e_time" BatchSize=${batch_size} WORLD_SIZE=${world_size} DeviceType=$(uname -m) -CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'performance' +CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'perf' #单迭代训练时长 TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}') #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" >${test_path_dir}/output/${CaseName}.log -echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/${CaseName}.log -echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/${CaseName}.log -echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/${CaseName}.log -echo "CaseName = ${CaseName}" >>${test_path_dir}/output/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >>${test_path_dir}/output/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${CaseName}.log \ No newline at end of file +echo "Network = ${Network}" >${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "CaseName = ${CaseName}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log