diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md index 82b0a668e7ded6f9540dd355b0efb765accdcbaf..2d4c2019298b3232f2e4a0700f83096e660aa0c0 100644 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/README.md @@ -11,13 +11,13 @@ This folder contains the implementation of the `MoBY` with `Swin Transformer` fo - Install `timm==0.3.2`: ```bash -pip install timm==0.3.2 +pip3.7 install timm==0.3.2 ``` - Install other requirements: ```bash -pip install opencv-python==4.4.0.46 termcolor==1.1.0 yacs==0.1.8 diffdist +pip3.7 install opencv-python==4.4.0.46 termcolor==1.1.0 yacs==0.1.8 diffdist ``` ### Data preparation @@ -56,23 +56,9 @@ To train `MoBY` with `Swin Transformer Tiny` on ImageNet, run: ```bash bash ./test/train_full_8p.sh --data_path= ``` - -For example, to train `MoBY` with `Swin Transformer Tiny` with 8 NPU on a single node for 300 epochs, run: - -```bash -bash ./test/train_full_8p.sh --data_path=/data/imagenet -``` - Defaultly, training auto-resumes checkpoint in output directory. Remove the `output` directory to train from begin. 
### Performance Test - -To train `MoBY Swin-T` on 1 NPU for performance test, run: - -```bash -bash ./test/train_performance_1p.sh --data_path= -``` - For performance test on 8 NPU, run: ```bash @@ -87,12 +73,6 @@ To evaluate a pre-trained `MoBY` with `Swin Transformer Tiny` on ImageNet-1K lin bash ./test/eval_8p.sh --data_path= ``` -For example, to evaluate `MoBY Swin-T` with 8 NPU on a single node on ImageNet-1K linear evluation, run: - -```bash -bash ./test/eval_8p.sh --data_path=/data/imagenet -``` - ### Training result for `MoBY Swin-T` | Acc@1 | FPS | Npu_nums | Epochs | AMP_Type | CPU | diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh deleted file mode 100644 index c5f14bc83e77a09f744a6f6bd865b1d948ff5946..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/env_npu.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -export install_path=/usr/local/Ascend - -if [ -d ${install_path}/toolkit ]; then - export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} - export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH - export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH - export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=${install_path}/opp -else - if [ -d ${install_path}/nnae/latest ];then - export 
LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/nnae/latest - else - export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export 
PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest - fi -fi - - -#将Host日志输出到串口,0-关闭/1-开启 -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -#设置默认日志级别,0-debug/1-info/2-warning/3-error -export ASCEND_GLOBAL_LOG_LEVEL=3 -#设置Event日志开启标志,0-关闭/1-开启 -export ASCEND_GLOBAL_EVENT_ENABLE=0 -#设置是否开启taskque,0-关闭/1-开启 -export TASK_QUEUE_ENABLE=1 -#设置是否开启PTCopy,0-关闭/1-开启 -export PTCOPY_ENABLE=1 -#设置是否开启combined标志,0-关闭/1-开启 -export COMBINED_ENABLE=1 -#设置特殊场景是否需要重新编译,不需要修改 -export DYNAMIC_OP="ADD#MUL" -#HCCL白名单开关,1-关闭/0-开启 -export HCCL_WHITELIST_DISABLE=1 -export HCCL_IF_IP=$(hostname -I |awk '{print $1}') - -export SCALAR_TO_HOST_MEM=1 -export BMMV2_ENABLE=1 - -#设置device侧日志登记为error -${install_path}/driver/tools/msnpureport -g error -d 0 -${install_path}/driver/tools/msnpureport -g error -d 1 -${install_path}/driver/tools/msnpureport -g error -d 2 -${install_path}/driver/tools/msnpureport -g error -d 3 -${install_path}/driver/tools/msnpureport -g error -d 4 -${install_path}/driver/tools/msnpureport -g error -d 5 -${install_path}/driver/tools/msnpureport -g error -d 6 -${install_path}/driver/tools/msnpureport -g error -d 7 -#关闭Device侧Event日志 -${install_path}/driver/tools/msnpureport -e disable - -path_lib=$(python3.7 -c """ -import sys -import re -result='' -for index in range(len(sys.path)): - match_sit = re.search('-packages', sys.path[index]) - if match_sit is not None: - match_lib = re.search('lib', sys.path[index]) - - if match_lib is not None: - end=match_lib.span()[1] - result += sys.path[index][0:end] + ':' - - result+=sys.path[index] + '/torch/lib:' -print(result)""" -) - -echo ${path_lib} - -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git 
a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh deleted file mode 100644 index cd8463dbccf52047dc5fffe351036f7aec444ec1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_linear_evaluation.sh +++ /dev/null @@ -1,2 +0,0 @@ -python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ ---cfg configs/moby_swin_tiny.yaml --data-path /data/imagenet > linear.log 2>&1 & \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh deleted file mode 100644 index fe2a96ba78ce425f58241229ad6ccb32066736e9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/run8p_pretrain.sh +++ /dev/null @@ -1,14 +0,0 @@ -source env_npu.sh - -export WORLD_SIZE=8 -for i in $(seq 0 7) -do - export RANK=$i - start=$((24 * i)) - end=$((start + 23)) - taskset -c $start-$end nohup python -u moby_main.py \ - --cfg configs/moby_swin_tiny.yaml \ - --data-path /data/imagenet \ - --local_rank $i \ - --batch-size 128 > train_${i}.log 2>&1 & -done \ No newline at end of file diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh index ef8490219f0a230f7d53223b80bd9793bd4db603..0c4f6dad0beb48f6db7ce209f92a716bd528ec41 100644 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/eval_8p.sh @@ -37,11 +37,16 @@ if [[ $data_path == "" ]];then exit 1 fi -# 非平台场景时source 环境变量 -check_etp_flag=`env | grep etp_running_flag` -etp_flag=`echo ${check_etp_flag#*=}` -if [ 
x"${etp_flag}" != x"true" ];then - source ${cur_path}/env_npu.sh +##################指定训练脚本执行路径################## +# cd到与test文件同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test fi #进入训练脚本目录,需要模型审视修改 @@ -55,14 +60,28 @@ export RANK_ID=$RANK_ID export ASCEND_DEVICE_ID=$RANK_ID ASCEND_DEVICE_ID=$RANK_ID +#创建DeviceID输出目录,不需要修改 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ +fi + #训练开始时间,不需要修改 start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 export RANK_SIZE=8 -python3.7 -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ ---cfg configs/moby_swin_tiny.yaml --data-path ${data_path} > ${cur_path}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & +nohup python3.7 -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 moby_linear.py \ +--cfg configs/moby_swin_tiny.yaml --data-path ${data_path} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 & wait @@ -76,13 +95,13 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` +time=`grep -a 'time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` FPS=`awk 'BEGIN{printf "%.2f\n", 
'${batch_size}'/'${time}'*8}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a 'Max accuracy' $cur_path/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|awk -F " " 'END {print $8}'|sed 's/%//g'` +train_accuracy=`grep -a 'Max accuracy' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log|awk -F " " 'END {print $8}'|sed 's/%//g'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -100,21 +119,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 -grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/eval_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep 'INFO Train' ${test_path_dir}/output/$ASCEND_DEVICE_ID/eval_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/eval_${CaseName}_loss.txt|sed 's/.$//'` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt|sed 's/.$//'` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = 
${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log #退出anaconda环境 if [ -n "$conda_name" ];then echo "conda $conda_name deactivate" diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh index 213bc33037318d73bd6f11a741704d8a0d163c00..7deb0770c9c8563e697acd27d720c224efdceb6e 100644 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_full_8p.sh @@ -36,12 +36,22 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi - +##################指定训练脚本执行路径################## +# cd到与test文件同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + 
test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi # 非平台场景时source 环境变量 check_etp_flag=`env | grep etp_running_flag` etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then - source ${cur_path}/env_npu.sh + source ${test_path_dir}/env_npu.sh fi #进入训练脚本目录,需要模型审视修改 @@ -56,13 +66,14 @@ export ASCEND_DEVICE_ID=$RANK_ID ASCEND_DEVICE_ID=$RANK_ID #创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ fi + #训练开始时间,不需要修改 start_time=$(date +%s) @@ -79,17 +90,17 @@ do then PID_START=$((KERNEL_NUM * RANK_ID)) PID_END=$((PID_START + KERNEL_NUM - 1)) - taskset -c $PID_START-$PID_END python3.7 -u moby_main.py \ + taskset -c $PID_START-$PID_END nohup python3.7 -u moby_main.py \ --cfg configs/moby_swin_tiny.yaml \ --data-path $data_path \ --local_rank $RANK_ID \ - --batch-size $batch_size > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --batch-size $batch_size > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & else - python3.7 -u moby_main.py \ + nohup python3.7 -u moby_main.py \ --cfg configs/moby_swin_tiny.yaml \ --data-path $data_path \ --local_rank $RANK_ID \ - --batch-size $batch_size > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --batch-size $batch_size > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & fi done @@ -105,13 +116,13 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -time=`grep -a 'time' 
$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` +time=`grep -a 'time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'*8}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep -a 'Precision' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|head -1` +train_accuracy=`grep -a 'Precision' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $NF}'|head -1` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -129,21 +140,21 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 -grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep 'INFO Train' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt|sed 's/.$//'` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt|sed 's/.$//'` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log #退出anaconda环境 if [ -n "$conda_name" ];then echo "conda $conda_name deactivate" diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh index 856cc6eec23b372a8d5c314d4306e361796af3ef..17c6a4565264871364cecdc975f73e8ccd63d360 100644 --- 
a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_1p.sh @@ -2,6 +2,8 @@ #当前路径,不需要修改 cur_path=`pwd` +# 指定训练所使用的npu device卡id +device_id=0 #集合通信参数,不需要修改 export RANK_SIZE=1 @@ -35,34 +37,53 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi - -# 非平台场景时source 环境变量 -check_etp_flag=`env | grep etp_running_flag` -etp_flag=`echo ${check_etp_flag#*=}` -if [ x"${etp_flag}" != x"true" ];then - source ${cur_path}/env_npu.sh +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + echo "[Error] device id must be config" + exit 1 +fi +#进入训练脚本目录,需要模型审视修改 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test fi + #进入训练脚本目录,需要模型审视修改 ASCEND_DEVICE_ID=0 #创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -python3.7 -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 moby_main.py \ +nohup python3.7 -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 moby_main.py \ --cfg configs/moby_swin_tiny.yaml \ --data-path $data_path \ --steps 1000 \ - --batch-size 128 > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --batch-size 128 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait @@ -73,7 +94,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $13}'` +time=`grep -a 'time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $13}'` FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -94,21 +115,21 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 -grep 'INFO Train' 
$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep 'INFO Train' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo 
"TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log #退出anaconda环境 if [ -n "$conda_name" ];then echo "conda $conda_name deactivate" diff --git a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh index 0e17d260314c6c24593c50aa3d997ce4fac856b0..5192ea1ac4d1fc690e3ddd8603ee37b060e8fda5 100644 --- a/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/cv/classification/Transformer-SSL_for_PyTorch/test/train_performance_8p.sh @@ -36,16 +36,38 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi +##################指定训练脚本执行路径################## +# cd到与test文件同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ + +#训练开始时间,不需要修改 +start_time=$(date +%s) # 非平台场景时source 环境变量 check_etp_flag=`env | grep etp_running_flag` etp_flag=`echo ${check_etp_flag#*=}` if [ x"${etp_flag}" != x"true" ];then - source ${cur_path}/env_npu.sh + source ${test_path_dir}/env_npu.sh fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/ +# Create the per-device output directory, starting from an empty one on every run. +# ASCEND_DEVICE_ID is assigned by this script only further down (from RANK_ID=0), so pin it here to keep these paths independent of the caller's environment (an unset value would collapse them to output/). +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} +fi +mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ #设置环境变量,不需要修改 RANK_ID=0 @@ -55,18 +77,6 @@ export RANK_ID=$RANK_ID export ASCEND_DEVICE_ID=$RANK_ID ASCEND_DEVICE_ID=$RANK_ID -#创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ -else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ -fi - - -#训练开始时间,不需要修改 -start_time=$(date +%s) - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 export RANK_SIZE=8 @@ -81,19 +91,19 @@ do then PID_START=$((KERNEL_NUM * RANK_ID)) PID_END=$((PID_START + KERNEL_NUM - 1)) - taskset -c $PID_START-$PID_END python3.7 -u moby_main.py \ + taskset -c $PID_START-$PID_END nohup python3.7 -u moby_main.py \ --cfg configs/moby_swin_tiny.yaml \ --data-path $data_path \ --epochs 2 \ --local_rank $RANK_ID \ - --batch-size $batch_size > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --batch-size $batch_size > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & else - python3.7 -u moby_main.py \ + nohup python3.7 -u moby_main.py \ --cfg configs/moby_swin_tiny.yaml \ --data-path $data_path \ --epochs 2 \ --local_rank $RANK_ID \ - --batch-size $batch_size > 
${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & fi done @@ -109,7 +119,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -time=`grep -a 'time' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` +time=`grep -a 'time' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep loss|awk -F " " 'END {print $14}'|sed 's/(//g'|sed 's/)//g'` FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${time}'*8}'` #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -130,21 +140,21 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${time}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要模型审视修改 -grep 'INFO Train' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep 'INFO Train' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "loss " '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log #退出anaconda环境 if [ -n "$conda_name" ];then echo "conda $conda_name deactivate"