From 1ac6a217b8226f233c9bf0c9a3b5747857479561 Mon Sep 17 00:00:00 2001
From: libin <1581602217@qq.com>
Date: Tue, 13 Aug 2024 03:38:40 +0000
Subject: [PATCH 1/2] update PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh.

Signed-off-by: libin <1581602217@qq.com>
---
Changes since the original submission (notes only, not part of the commit
message):
  * The fallback branch of the device-id check now echoes its "[Error]"
    message; the bare string used to be executed as a command name.
  * ASCEND_DEVICE_ID and device_id are tested with [ -n "..." ] so the
    test does not depend on unquoted word splitting.
  * Comments on added lines are in English; context lines are untouched.
  NOTE(review): leading indentation inside the hunks was reconstructed; if a
  context line fails to match, retry with `git apply --ignore-whitespace`.
  NOTE(review): the blob ids on the "index" line are the ones from the
  original submission.

 ...rmance_8p_blip2_caption_coco_opt2.7b_ft.sh | 38 ++++++++++++-------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
index c2f267ec78..4c603e183d 100644
--- a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
+++ b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
@@ -1,5 +1,5 @@
 # 网络名称,同目录名称,需要模型审视修改
-Network="blip2"
+Network="blip2_ID4380_for_pytorch"
 batch_size=16
 world_size=8
 
@@ -15,9 +15,19 @@ else
 fi
 
 source ${test_path_dir}/env_npu.sh
-
+device_id=0
+# Check that a device id is available: either provided through ASCEND_DEVICE_ID or taken from device_id above; no need to modify
+if [ -n "${ASCEND_DEVICE_ID}" ];then
+    echo "device id is ${ASCEND_DEVICE_ID}"
+elif [ -n "${device_id}" ];then
+    export ASCEND_DEVICE_ID=${device_id}
+    echo "device id is ${ASCEND_DEVICE_ID}"
+else
+    echo "[Error] device id must be config"
+    exit 1
+fi
 #创建DeviceID输出目录,不需要修改
-output_path=${cur_path}/test/output/
+output_path=${cur_path}/test/output/${ASCEND_DEVICE_ID}
 
 if [ -d ${output_path} ]; then
     rm -rf ${output_path}
@@ -30,7 +40,7 @@ start_time=$(date +%s)
 echo "start_time: ${start_time}"
 
 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip2/train/caption_coco_ft_performance.yaml \
-    > ${test_path_dir}/output/train_performance_8p_blip2_caption_coco_opt2.7b_ft.log 2>&1 &
+    > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_0.log 2>&1 &
 
 wait
 
@@ -63,18 +73,18 @@ echo "E2E Training Duration sec : $e2e_time"
 BatchSize=${batch_size}
 WORLD_SIZE=${world_size}
 DeviceType=$(uname -m)
-CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'performance'
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'perf'
 
 #单迭代训练时长
 TrainingTime=$(awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}')
 
 #关键信息打印到${CaseName}.log中,不需要修改
-echo "Network = ${Network}" >${test_path_dir}/output/${CaseName}.log
-echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/${CaseName}.log
-echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/${CaseName}.log
-echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/${CaseName}.log
-echo "CaseName = ${CaseName}" >>${test_path_dir}/output/${CaseName}.log
-echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/${CaseName}.log
-echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/${CaseName}.log
-echo "ActualLoss = ${ActualLoss}" >>${test_path_dir}/output/${CaseName}.log
-echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${CaseName}.log
\ No newline at end of file
+echo "Network = ${Network}" >${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
\ No newline at end of file
-- 
Gitee

From 0dd0a7cd350af55f095ad167da8439dd1919d56d Mon Sep 17 00:00:00 2001
From: libin <1581602217@qq.com>
Date: Tue, 13 Aug 2024 07:03:08 +0000
Subject: [PATCH 2/2] update PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh.

Signed-off-by: libin <1581602217@qq.com>
---
Changes since the original submission (notes only, not part of the commit
message):
  * Arguments are iterated with "$@" instead of unquoted $*, so an argument
    containing whitespace or glob characters stays one word.
  * Option values are taken with ${para#*=} directly instead of through a
    backtick `echo` subshell.
  * Error message typo fixed: "confing" -> "config".
  * Dataset symlinks are created with `ln -sfn`, so rerunning the script
    replaces the existing links instead of failing or creating a nested
    link inside the directory the old link points to.
  * Comments on added lines are in English; context lines are untouched.
  NOTE(review): data_path is mandatory but the symlink targets are still the
  fixed /npu/traindata/blip2_dataset paths - confirm whether they should be
  derived from data_path.
  NOTE(review): the blob ids on the "index" line are the ones from the
  original submission.

 ...rmance_8p_blip2_caption_coco_opt2.7b_ft.sh | 21 ++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
index 4c603e183d..8d3705f90b 100644
--- a/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
+++ b/PyTorch/built-in/mlm/LAVIS/test/train_performance_8p_blip2_caption_coco_opt2.7b_ft.sh
@@ -3,6 +3,22 @@ Network="blip2_ID4380_for_pytorch"
 batch_size=16
 world_size=8
 
+# Argument parsing: data_path is required; other arguments are added or removed per model; any argument added here must be defined and assigned above
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --batch_size* ]];then
+        batch_size=${para#*=}
+    fi
+done
+
+# Verify that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
 # cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径
 cur_path=$(pwd)
 cur_path_last_dirname=${cur_path##*/}
@@ -35,6 +51,9 @@ fi
 
 mkdir -p ${output_path}
 
+# Data preprocessing: link the dataset directories into the working directory (-f/-n make reruns replace the links)
+ln -sfn /npu/traindata/blip2_dataset/facebook facebook
+ln -sfn /npu/traindata/blip2_dataset/bert-base-uncased bert-base-uncased
 #训练开始时间,不需要修改
 start_time=$(date +%s)
 echo "start_time: ${start_time}"
@@ -87,4 +106,4 @@ echo "CaseName = ${CaseName}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${Ca
 echo "ActualFPS = ${ActualFPS}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
 echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
 echo "ActualLoss = ${ActualLoss}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
-echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
\ No newline at end of file
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log
-- 
Gitee