diff --git a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh index 74ea8a80f5f77678c1134c12aa218072ddba0057..013de4755ac3a0bcda6fdd3f69688481218a365d 100644 --- a/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/EfficientNet_B0_ID0009_for_TensorFlow/test/train_full_8p.sh @@ -5,8 +5,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZES=8 -#export RANK_TABLE_FILE=${cur_path}/../8p.json +export RANK_SIZE=8 +export RANK_TABLE_FILE=${cur_path}/../8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -89,19 +89,9 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" - elif [[ $para == --one_node_ip* ]];then - one_node_ip=`echo ${para#*=}` fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -118,17 +108,15 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) -sed -i 's/RANK_SIZE/RANK_SIZES/g' ../modelarts/start.py ../efficientnet/main_npu.py -sed -i 's/RANK_ID/RANK_IDS/g' ../modelarts/start.py #进入训练脚本目录,需要模型审视修改 cd $cur_path/../ -for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_IDS" - export RANK_IDS=$RANK_IDS - export ASCEND_DEVICE_ID=$RANK_IDS - ASCEND_DEVICE_ID=$RANK_IDS + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID #创建DeviceID输出目录,不需要修改 if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then @@ -140,9 +128,9 @@ do # 绑核,不需要的绑核的模型删除,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=RANK_IDS*${corenum}/${RANK_SIZES} - let b=RANK_IDS+1 - let c=b*${corenum}/${RANK_SIZES}-1 + let a=RANK_ID*${corenum}/${RANK_SIZE} + let b=RANK_ID+1 + let c=b*${corenum}/${RANK_SIZE}-1 #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path @@ -165,8 +153,7 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -sed -i 's/RANK_SIZES/RANK_SIZE/g' modelarts/start.py efficientnet/main_npu.py -sed -i 's/RANK_IDS/RANK_ID/g' modelarts/start.py + #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 @@ -184,13 +171,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 grep 'logger.py:54' $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $8}' |awk -F ":" '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -200,7 +187,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh index bf5829ca4c942170ac606df8236ed24df3ce9448..8b3c78b5df45f61d92e60b7e11db56418677b32b 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_RT2_performance_8p.sh @@ -49,15 +49,6 @@ do fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 - if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 @@ -75,6 +66,14 @@ export RANK_SIZES=8 #export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 start=$(date +%s) # 8P训练模式 diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh index e17f87b2d5cd69ea6d14ed5c7e8dbcd07b207132..8cd78e2f2b804957f26e967acf45a7359b05c13e 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_full_8p.sh @@ -43,20 +43,9 @@ do name_bind="_bindcore" elif [[ $para == --dynamic_input* ]];then dynamic_input=`echo ${para#*=}` - elif [[ $para == --one_node_ip* ]];then - one_node_ip=`echo ${para#*=}` - fi + fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 - if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 @@ -70,8 +59,8 @@ python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$c wait export ASCEND_DEVICE_ID=0 -export RANK_SIZES=8 -#export RANK_TABLE_FILE="${cur_path}/test/8p.json" +export RANK_SIZE=8 +export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 start=$(date +%s) @@ -80,7 +69,7 @@ start=$(date +%s) for i in 0 1 2 3 4 5 6 7 do #设置环境变量 - export RANK_IDS=$i + export RANK_ID=$i export ASCEND_DEVICE_ID=$i ASCEND_DEVICE_ID=$i echo "Device ID: $ASCEND_DEVICE_ID" @@ -94,8 +83,8 @@ do echo $ASCEND_DEVICE_ID #(Step3)训练 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_IDS*${corenum}/8 - let b=RANK_IDS+1 + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh index f997f241892b7b9fee664fec7cb3bb7fb146491d..496f7ab50e0650863fe9758ba1e05c4ac74867f6 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_16p.sh @@ -60,20 +60,6 @@ done linux_num=$servers_num -if [[ $conf_path == "" ]];then - fix_node_ip=$fix_node_ip - one_node_ip=$one_node_ip -else - one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -fi - -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 -export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 - if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" exit 1 @@ -93,7 +79,19 @@ cd $cur_path python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000000-bootstrap wait +if [[ $conf_path == "" ]];then + fix_node_ip=$fix_node_ip + one_node_ip=$one_node_ip +else + one_node_ip=`find $conf_path -name "server_*_0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +fi +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=16 #卡数,单机为8,多机为8n,所有服务器一致 +export CM_WORKER_IP=${fix_node_ip} #当前服务器ip,不同环境ip不同 start=$(date +%s) # 8P训练模式 diff --git a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh index f280a31684b062155dd378e66b3f1274c8aa495a..7640687102d59371ccd654b08f550ef46fdf41b5 100644 --- a/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/MiniGo_ID0629_for_TensorFlow/test/train_performance_8p.sh @@ -48,14 +48,6 @@ do fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be config" @@ -74,6 +66,14 @@ export RANK_SIZES=8 #export RANK_TABLE_FILE="${cur_path}/test/8p.json" export JOB_ID=10086 +#8p训练必须参数(本机IP) +one_node_ip=$one_node_ip +#新增适配集群环境变量 +export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 +export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 +export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 +export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 +export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 start=$(date +%s) # 8P训练模式 diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh index cc5390018b3c6d6e9cb1bee5aed5c148512cd3e8..254cad2ad44821779738f3a202600f7f8c65f3ee 100644 --- a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_full_8p.sh @@ -5,8 +5,8 @@ cur_path=`pwd` #集合通信参数,不需要修改 #保证rank table file 文件rank_table_8p.json存放在和test同级的configs目录下 -export RANK_SIZES=8 -#export RANK_TABLE_FILE=${cur_path}/../configs/8p.json +export RANK_SIZE=8 +export RANK_TABLE_FILE=${cur_path}/../configs/8p.json export JOB_ID=10087 RANK_ID_START=0 @@ -21,7 +21,7 @@ Network="ResNet50_ID0058_for_TensorFlow" export HCCL_CONNECT_TIMEOUT=600 corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` export RANK_INDEX=0 -export RANK_IDS=0 +export RANK_ID=0 config_file=res50_256bs_8p_eval iterations_per_loop=100 @@ -86,20 +86,9 @@ do elif [[ $para == --bind_core* ]]; then bind_core=`echo ${para#*=}` name_bind="_bindcore" - elif [[ $para == --one_node_ip* ]];then - one_node_ip=`echo ${para#*=}` fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 - #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -115,16 +104,16 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) -sed -i 's/RANK_SIZE/RANK_SIZES/g' ../src/data_loader/resnet50/data_loader.py + #进入训练脚本目录,需要模型审视修改 cd $cur_path/.. -for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_IDS" - # export RANK_IDS=$RANK_IDS - export ASCEND_DEVICE_ID=$RANK_IDS - ASCEND_DEVICE_ID=$RANK_IDS + echo "Device ID: $RANK_ID" + # export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID export DEVICE_ID=$ASCEND_DEVICE_ID DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 )) export DEVICE_INDEX=$DEVICE_INDEX @@ -139,8 +128,8 @@ do #执行训练脚本,需要模型审视修改 corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` - let a=RANK_IDS*${corenum}/8 - let b=RANK_IDS+1 + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 let c=b*${corenum}/8-1 if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" @@ -169,7 +158,6 @@ wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -sed -i 's/RANK_SIZES/RANK_SIZE/g' src/data_loader/resnet50/data_loader.py echo "------------------ Final result ------------------" #单step时长,需要从train_$ASCEND_DEVICE_ID.log里,通过关键字获取。需要模型审视修改 @@ -186,13 +174,20 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' ##获取性能数据 #吞吐量,不需要修改 ActualFPS=${FPS} #单迭代训练时长,不需要修改 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${e2e_time} +#单迭代训练时长 +TrainingTime=${e2e_time} ##获取Loss,通过train_*.log中关键字,需要根据模型审视 grep "total_loss:" $cur_path/output/0/train_0.log|awk '{print $9}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt @@ -202,7 +197,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseNam #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh index f2d9c2f99357b067274e1d35cb838c570f19748f..771f5fb921c1d2f0df9f4bca94c0293c7172f86a 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/scripts/run_accuracy_8p.sh @@ -9,8 +9,8 @@ mkdir -p ${upDir}/test/output/0 # user env export JOB_ID=NPU20210126 -export RANK_SIZES=8 -#export RANK_TABLE_FILE=${currentDir}/8p.json +export RANK_SIZE=8 +export RANK_TABLE_FILE=${currentDir}/8p.json data_dir=$1 fold=$2 @@ -21,7 +21,7 @@ if [ x"${fold}" = x"all" ] ; then for device_index in ${device_group} do - RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 & + RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} 0 & done wait @@ -32,9 +32,10 @@ else echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train start" for device_index in ${device_group} do - RANK_IDS=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} & + RANK_ID=${device_index} ASCEND_DEVICE_ID=${device_index} ${currentDir}/train_accuracy_8p.sh ${data_dir} ${fold} & done wait echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] fold$fold train end" fi + diff --git a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh index b69de76e242e2f58effe146caae2f1b60358e16c..02a6ba19f7063dd1c47bedf34e76ffa88dc2f538 100644 --- a/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/cv/image_segmentation/UNet3D_ID0057_for_TensorFlow/test/train_full_8p.sh @@ -5,9 +5,9 @@ cur_path=`pwd` export LANG=en_US.UTF-8 -export RANK_SIZES=8 +export RANK_SIZE=8 export JOB_ID=10087 -#export RANK_TABLE_FILE=$cur_path/../scripts/8p.json +export RANK_TABLE_FILE=$cur_path/../scripts/8p.json RANK_ID_START=0 @@ -71,19 +71,9 @@ do cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - elif [[ $para == --one_node_ip* ]];then - one_node_ip=`echo ${para#*=}` fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 #data_path='../' #校验是否传入data_path,不需要修改 @@ -91,23 +81,18 @@ if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 fi -cd $cur_path/../ -sed -i 's/RANK_SIZE/RANK_SIZES/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py -sed -i 's/RANK_ID/RANK_IDS/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py + cd $cur_path/../scripts #训练开始时间,不需要修改 start_time=$(date +%s) bash run_accuracy_8p.sh ${data_path} all -wait #训练结束时间,不需要修改 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -cd $cur_path/../ -sed -i 's/RANK_SIZES/RANK_SIZE/g' model/model_fn.py pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py -sed -i 's/RANK_IDS/RANK_ID/g' pbinference/unet3d_pb_inference.sh main_npu.py dataset/data_loader.py runtime/hooks.py runtime/setup.py + sleep 30 train_accuracy=`grep -r "whole" $cur_path/output/0/train_0.log | awk '{print $6}'` @@ -120,6 +105,7 @@ echo "E2E Training Duration sec : $e2e_time" echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 fps=`grep throughput_train $cur_path/output/0/train_0.log|awk -F 'throughput_train' '{print $2}'|awk -F ':' '{print $2}'|awk '{print $1}'` +#FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${fps}'}'` FPS=1.5 #打印,不需要修改 echo "Final Performance images/sec : $FPS" @@ -130,13 +116,13 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' #获取性能数据,不需要修改 #吞吐量 ActualFPS=${FPS} #单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZES}'*1000/'${FPS}'}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RANK_SIZE}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 @@ -147,7 +133,7 @@ ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log -echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/0/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh index 7250053b376a4ff13fa825c662658b18fab2bd60..e58e09555687d14d31fe22af0948add099c5f108 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh @@ -4,11 +4,13 @@ cur_path=`pwd` #集合通信参数,不需要修改 + + export HCCL_CONNECT_TIMEOUT=1200 #集合通信参数,不需要修改 -export RANK_SIZES=8 -#export RANK_TABLE_FILE=$cur_path/8p.json +export RANK_SIZE=8 +export RANK_TABLE_FILE=$cur_path/8p.json export JOB_ID=10087 RANK_ID_START=0 ASCEND_DEVICE_ID_START=0 @@ -55,19 +57,14 @@ elif [[ $para == --over_dump* ]];then mkdir -p ${over_dump_path} elif [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - elif [[ $para == --one_node_ip* ]];then - one_node_ip=`echo ${para#*=}` fi done -#8p训练必须参数(本机IP) -one_node_ip=$one_node_ip -#新增适配集群环境变量 -export CM_CHIEF_IP=${one_node_ip} #主节点ip,所有服务器一致 -export CM_CHIEF_PORT=29688 #通信端口,所有服务器一致 -export CM_CHIEF_DEVICE=0 #配置为0,配置主卡,类似于主节点,所有服务器一致 -export CM_WORKER_SIZE=8 #卡数,单机为8,所有服务器一致 -export CM_WORKER_IP=${one_node_ip} #当前服务器ip,不同环境ip不同 +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then @@ -105,8 +102,6 @@ sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py sed -i "s%n_epoches = 2%n_epoches = $n_epoches%p" configs/config.py -sed -i 's/RANK_SIZE/RANK_SIZES/g' widedeep/WideDeep_fp16_huifeng.py -sed -i 's/RANK_SIZE/RANK_SIZES/g' train.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run @@ -114,13 +109,13 @@ cp configs/config.py configs/config.py.run cd $cur_path/../ start=$(date +%s) -for((RANK_IDS=$RANK_ID_START;RANK_IDS<$((RANK_SIZES+RANK_ID_START));RANK_IDS++)); +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 - echo "Device ID: $RANK_IDS" - export RANK_IDS=$RANK_IDS - export ASCEND_DEVICE_ID=$RANK_IDS - ASCEND_DEVICE_ID=$RANK_IDS + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then rm -rf $cur_path/output/${ASCEND_DEVICE_ID} mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} @@ -136,8 +131,7 @@ done wait end=$(date +%s) e2e_time=$(( $end - $start )) -sed -i 's/RANK_SIZES/RANK_SIZE/g' train.py -sed -i 's/RANK_SIZES/RANK_SIZE/g' widedeep/WideDeep_fp16_huifeng.py + #配置文件恢复 mv -f configs/config.py.bak configs/config.py @@ -161,9 +155,9 @@ echo "E2E Training Duration sec : $e2e_time" BatchSize=${batch_size} DeviceType=`uname -m` if [[ $precision_mode == "must_keep_origin_dtype" ]];then - CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'fp32'_'acc' + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'acc' else - CaseName=${Network}_bs${BatchSize}_${RANK_SIZES}'p'_'acc' + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' fi ##获取性能数据 @@ -182,7 +176,7 @@ ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZES}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log