From 900aad441721b5be4ee995894f5eee7b270fe685 Mon Sep 17 00:00:00 2001 From: memg Date: Fri, 3 Mar 2023 09:27:53 +0000 Subject: [PATCH 1/5] update TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md. Signed-off-by: memg --- .../recommendation/DeepCTR_Series_for_TensorFlow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md index 6f08d2511..92c51f0c1 100644 --- a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md @@ -211,4 +211,4 @@ python3 gen_kaggle_criteo_tfrecords.py #### 训练过程 -通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡,8卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。以8卡训练为例,loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file +通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file -- Gitee From 09a790c17b603832c81d9ed2077a7cb319ef8ca4 Mon Sep 17 00:00:00 2001 From: memg Date: Fri, 3 Mar 2023 09:28:11 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/built-in/recommendation/DeepCTR=5FSeries=5Ffor=5FTensorF?= =?UTF-8?q?low/test/train=5FID4032=5FDCNMix=5Fperformance=5F8p=5FRT2.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train_ID4032_DCNMix_performance_8p_RT2.sh | 153 ------------------ 1 file changed, 153 deletions(-) delete mode 100644 TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p_RT2.sh diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p_RT2.sh b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p_RT2.sh deleted file mode 100644 index b0430aec6..000000000 --- a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p_RT2.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -#集合通信参数,不需要修改 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/rank_8p.json -export JOB_ID=10087 -RANK_ID_START=0 - -#使能RT2.0 -export ENABLE_RUNTIME_V2=1 -# 数据集路径,保持为空,不需要修改 -data_path="" -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="DCNMix_ID4032_for_TensorFlow" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=10240 -#训练step -train_steps=100 -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../examples - -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3 run_dcnmix.py \ - --data_path=${data_path} \ - --train_batch_size=${batch_size} \ - --eval_batch_size=${batch_size} \ - --num_epochs=${train_epochs} \ - --max_steps=${train_steps} \ - --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -# #输出性能FPS,需要模型审视修改 -fps=`grep "examples\/sec" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $2}'|tail -n +5 |awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${fps}'}'` -# #打印,不需要修改 -echo "Final Performance item/sec : $FPS" - -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'RT2'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -TrainingTime=`awk 'BEGIN{printf "%.6f\n",'${BatchSize}'/'${FPS}'}'` - -ActualFPS=${FPS} -grep ":loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $3}' | sed 's/,//g' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 95ab3057a219293dcbfabb5cbc3d5e1908766760 Mon Sep 17 00:00:00 2001 From: memg Date: Fri, 3 Mar 2023 09:28:21 +0000 Subject: [PATCH 3/5] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/built-in/recommendation/DeepCTR=5FSeries=5Ffor=5FTensorF?= =?UTF-8?q?low/test/train=5FID4032=5FDCNMix=5Fperformance=5F8p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train_ID4032_DCNMix_performance_8p.sh | 150 ------------------ 1 file changed, 150 deletions(-) delete mode 100644 TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p.sh diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p.sh b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p.sh deleted file mode 100644 index a19701a8b..000000000 --- a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_performance_8p.sh +++ /dev/null @@ -1,150 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -#集合通信参数,不需要修改 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/rank_8p.json -export JOB_ID=10087 -RANK_ID_START=0 -# 数据集路径,保持为空,不需要修改 -data_path="" -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="DCNMix_ID4032_for_TensorFlow" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=10240 -#训练step -train_steps=100 -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../examples - -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3 run_dcnmix.py \ - --data_path=${data_path} \ - --train_batch_size=${batch_size} \ - --eval_batch_size=${batch_size} \ - --num_epochs=${train_epochs} \ - --max_steps=${train_steps} \ - --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -# #输出性能FPS,需要模型审视修改 -fps=`grep "examples\/sec" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $2}'|tail -n +5 |awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${fps}'}'` -# #打印,不需要修改 -echo "Final Performance item/sec : $FPS" - -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -TrainingTime=`awk 'BEGIN{printf "%.6f\n",'${BatchSize}'/'${FPS}'}'` - -ActualFPS=${FPS} -grep ":loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $3}' | sed 's/,//g' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 34596471832b1581771470180bf91c81002eaa25 Mon Sep 17 00:00:00 2001 From: memg Date: Fri, 3 Mar 2023 09:28:31 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Tens?= =?UTF-8?q?orFlow/built-in/recommendation/DeepCTR=5FSeries=5Ffor=5FTensorF?= =?UTF-8?q?low/test/train=5FID4032=5FDCNMix=5Ffull=5F8p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/train_ID4032_DCNMix_full_8p.sh | 153 ------------------ 1 file changed, 153 deletions(-) delete mode 100644 TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_full_8p.sh diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_full_8p.sh b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_full_8p.sh deleted file mode 100644 index 60f518698..000000000 --- a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/test/train_ID4032_DCNMix_full_8p.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -#集合通信参数,不需要修改 -export RANK_SIZE=8 -export RANK_TABLE_FILE=${cur_path}/rank_8p.json -export JOB_ID=10087 -RANK_ID_START=0 -# 数据集路径,保持为空,不需要修改 -data_path="" -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="DCNMix_ID4032_for_TensorFlow" -#训练epoch -train_epochs=4 -#训练batch_size -batch_size=10240 -#训练step -train_steps= -#学习率 -learning_rate= - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../examples - -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3 run_dcnmix.py \ - --data_path=${data_path} \ - --train_batch_size=${batch_size} \ - --eval_batch_size=${batch_size} \ - --num_epochs=${train_epochs} \ - --output_dir=${cur_path}/output/$ASCEND_DEVICE_ID/ckpt > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -# #输出性能FPS,需要模型审视修改 -fps=`grep "examples\/sec" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $2}'|tail -n +5 |awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{printf "%.2f\n", '${fps}'}'` -# #打印,不需要修改 -echo "Final Performance item/sec : $FPS" - -# #输出训练精度,需要模型审视修改 -train_accuracy=`grep "AUC = " $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk 'END {print $9}' | sed 's/,//g'` -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'accu' - -##获取性能数据,不需要修改 -#吞吐量 -TrainingTime=`awk 'BEGIN{printf "%.6f\n",'${BatchSize}'/'${FPS}'}'` - -ActualFPS=${FPS} -grep ":loss =" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log| awk '{print $3}' | sed 's/,//g' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From df522d3ccc26e21e98429bab7d6947e87e531c23 Mon Sep 17 00:00:00 2001 From: memg Date: Mon, 6 Mar 2023 06:29:29 +0000 Subject: [PATCH 5/5] update TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md. Signed-off-by: memg --- .../recommendation/DeepCTR_Series_for_TensorFlow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md index 92c51f0c1..6f08d2511 100644 --- a/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md +++ b/TensorFlow/built-in/recommendation/DeepCTR_Series_for_TensorFlow/README.md @@ -211,4 +211,4 @@ python3 gen_kaggle_criteo_tfrecords.py #### 训练过程 -通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file +通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡,8卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。以8卡训练为例,loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file -- Gitee