diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh index e23728d377b1bde542b4110901b1e8114db47b95..74ea8a80f5f77678c1134c12aa218072ddba0057 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh @@ -5,8 +5,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/../8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=${cur_path}/../8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -89,9 +89,19 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -108,15 +118,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py +sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -128,9 +140,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -153,7 +165,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py +sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -171,13 +184,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -187,7 +200,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -195,4 +208,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_16p.sh index 24aaf972b09aac08f47ae94f05381403e76f58b0..7bce07eb4eadea2a90fcc418fc55bc9a7511eac6 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_16p.sh @@ -3,17 +3,13 @@ cur_path=`pwd` #集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -#export RANK_SIZE=8 -#export RANK_TABLE_FILE=${cur_path}/../8p.json -#export JOB_ID=10087 RANK_ID_START=0 # 数据集路径,保持为空,不需要修改 data_path="/npu/traindata/imagenet_TF" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 #基础参数 需要模型审视修改 #网络名称,同目录名称 @@ -93,9 +89,27 @@ do server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -111,27 +125,29 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) -export RANK_SIZE=16 +export RANK_SIZES=16 rank_size=8 - -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +#fi +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py +sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} @@ -142,9 +158,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -168,7 +184,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py +sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -187,13 +204,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -203,7 +220,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -211,4 +228,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log #echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_32p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_32p.sh index 50478ea1d1e7c84b84e2a11a9f47fe6c0a7a8037..a3824dc961d3ef75223e2b8ea62dbcd51cc2c634 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_32p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_32p.sh @@ -3,17 +3,13 @@ cur_path=`pwd` export ASCEND_HOST_LOG_FILE_NUM=1000 #集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -#export RANK_SIZE=8 -#export RANK_TABLE_FILE=${cur_path}/../8p.json -#export JOB_ID=10087 RANK_ID_START=0 # 数据集路径,保持为空,不需要修改 data_path="/npu/traindata/imagenet_TF" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数 需要模型审视修改 #网络名称,同目录名称 @@ -93,9 +89,26 @@ do server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=32 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -111,18 +124,20 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) -export RANK_SIZE=32 +export RANK_SIZES=32 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZES/rank_size)) --conf_path=$conf_path +#fi +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py +sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); @@ -142,9 +157,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} + let a=RANK_ID*${corenum}/${RANK_SIZES} let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -168,7 +183,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py +sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -187,13 +203,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -203,7 +219,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -211,5 +227,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log #echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_64p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_64p.sh index acbce4c11ba8789051996626d00680b9bff030fb..ce3e639740225912ce5aa14b1df92150b1a8f735 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_64p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_64p.sh @@ -3,17 +3,13 @@ cur_path=`pwd` #集合通信参数,不需要修改 -#保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -#export RANK_SIZE=8 -#export RANK_TABLE_FILE=${cur_path}/../8p.json -#export JOB_ID=10087 RANK_ID_START=0 # 数据集路径,保持为空,不需要修改 data_path="/npu/traindata/imagenet_TF" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3 #基础参数 需要模型审视修改 #网络名称,同目录名称 @@ -93,9 +89,27 @@ do server_index=`echo ${para#*=}` elif [[ $para == --conf_path* ]];then conf_path=`echo ${para#*=}` + elif [[ $para == --fix_node_ip* ]];then + fix_node_ip=`echo ${para#*=}` + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi + +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=64 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -111,27 +125,29 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) -export RANK_SIZE=64 +export RANK_SIZES=64 rank_size=8 -if [[ $conf_path != "" ]];then - nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path -fi - -wait -export RANK_TABLE_FILE=${cur_path}/rank_table.json +#if [[ $conf_path != "" ]];then +# nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +#fi +# +#wait +#export RANK_TABLE_FILE=${cur_path}/rank_table.json export JOB_ID=10087 export DEVICE_INDEX=0 +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py +sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +for((RANK_IDS=$((rank_size*server_index));RANK_IDS<$((((server_index+1))*rank_size));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` - ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_IDS} - $((rank_size*server_index))` #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} @@ -142,9 +158,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -168,7 +184,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py +sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -187,13 +204,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -203,7 +220,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log @@ -211,5 +228,4 @@ echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log #echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_8p.sh index 29d51a154d6f660accd2dce4e3aab6e02d6d56d9..b9038f8e13f56f9edc0746aab99dea4da77ae07f 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_performance_8p.sh @@ -4,8 +4,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/../8p.json +export RANK_SIZES=8 +#export RANK_TABLE_FILE=${cur_path}/../8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -13,7 +13,7 @@ RANK_ID_START=0 data_path="/npu/traindata/imagenet_TF" #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 +#export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 #基础参数 需要模型审视修改 #网络名称,同目录名称 @@ -89,9 +89,19 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" + elif [[ $para == --one_node_ip* ]];then + one_node_ip=`echo ${para#*=}` fi done +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -108,15 +118,17 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) +sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py +sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $RANK_IDS" + export RANK_IDS=$RANK_IDS + export ASCEND_DEVICE_ID=$RANK_IDS + ASCEND_DEVICE_ID=$RANK_IDS #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -128,9 +140,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_ID*${corenum}/${RANK_SIZE} - let b=RANK_ID+1 - let c=b*${corenum}/${RANK_SIZE}-1 + let a=RANK_IDS*${corenum}/${RANK_SIZES} + let b=RANK_IDS+1 + let c=b*${corenum}/${RANK_SIZES}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -154,7 +166,8 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) - +sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py +sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -173,23 +186,23 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'perf' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log