From df25d8038e7b86aa450fbd9c17ddf8c7a2c02f81 Mon Sep 17 00:00:00 2001 From: majun121 <867479212@qq.com> Date: Thu, 1 Aug 2024 09:08:50 +0000 Subject: [PATCH 1/2] add kill&bin&ffts shell Signed-off-by: majun121 <867479212@qq.com> --- .../test/train_performance_1p_bin.sh | 166 +++++++++++++ .../train_performance_1p_deterministic.sh | 186 ++++++++++++++ .../train_performance_1p_dump_kill_2_9.sh | 235 ++++++++++++++++++ .../train_performance_1p_dump_kill_8_11.sh | 235 ++++++++++++++++++ .../test/train_performance_1p_ffts.sh | 166 +++++++++++++ .../test/train_performance_1p_nobin.sh | 166 +++++++++++++ .../train_performance_1p_overflow_kill_2_9.sh | 235 ++++++++++++++++++ ...train_performance_1p_overflow_kill_8_11.sh | 235 ++++++++++++++++++ ...train_performance_1p_profiling_kill_2_9.sh | 235 ++++++++++++++++++ ...rain_performance_1p_profiling_kill_8_11.sh | 235 ++++++++++++++++++ 10 files changed, 2094 insertions(+) create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_bin.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_deterministic.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_2_9.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_8_11.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_ffts.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_nobin.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_2_9.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_8_11.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_2_9.sh create mode 100644 TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_8_11.sh diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_bin.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_bin.sh new file mode 100644 index 000000000..92ad08793 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_bin.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +#export GE_USE_STATIC_MEMORY=1 + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + + +RANK_ID_START=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=131072 +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 + +#参数配置 +data_path="/npu/traindata/ID2940_CarPeting_TF_WideDeep/" +train_size=13107200 +display_step=10 + +#维持参数,以下不需要修改 +over_dump=False +precision_mode="allow_mix_precision" +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --over_dump if or not over detection, default is False + --data_path source data of training + --train_epochs train epochs + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` +elif [[ $para == 
--over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +##############执行训练########## +if [ -d $cur_path/output ];then + rm -rf $cur_path/output/* + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +fi + +#if [ -d $cur_path/../config/1p_$ASCEND_DEVICE.json ];then +# export RANK_TABLE_FILE=$cur_path/../config/1p_$ASCEND_DEVICE.json +# export RANK_ID=$ASCEND_DEVICE_ID +#else +# export RANK_TABLE_FILE=$cur_path/../config/1p.json +# export RANK_ID=0 +#fi +wait + +#配置文件备份和修改 +cd $cur_path/../ +if [ -f configs/config.py.bak ];then + cp configs/config.py.bak configs/config.py + rm -f configs/config.py.run +else + cp configs/config.py configs/config.py.bak + rm -f configs/config.py.run +fi +sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs/config.py +sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py +sed -i "s%59761827%${train_size}%p" configs/config.py +sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py +#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` +cp configs/config.py configs/config.py.run +#训练执行 +start=$(date +%s) + +sed -i 's|\#custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"false\")|g' train.py + +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#配置文件恢复 +mv -f configs/config.py.bak configs/config.py +sed -i 's|custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"false\")|\#custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|g' train.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 + +#FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` +time=`grep -rn 'epoch 2 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",'100'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'eval auc' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $8}' |tail -n 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ $precision_mode == "must_keep_origin_dtype" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'bin_perf' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'bin_perf' +fi +echo "CaseName : $CaseName" + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$time +echo "TrainingTime(ms/step) : $TrainingTime" + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +loss=`grep 'loss =' 
$cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | tail -n 1` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_deterministic.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_deterministic.sh new file mode 100644 index 000000000..1c1ee38e8 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_deterministic.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +#export GE_USE_STATIC_MEMORY=1 + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + + +RANK_ID_START=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=131072 +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 + +#参数配置 +data_path="/npu/traindata/ID2940_CarPeting_TF_WideDeep/" +train_size=13107200 +display_step=10 + +#维持参数,以下不需要修改 +over_dump=False +precision_mode="allow_mix_precision" +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --over_dump if or not over detection, default is False + --data_path source data of training + --train_epochs train epochs + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` +elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +##############执行训练########## +if [ -d $cur_path/output ];then + rm -rf $cur_path/output/* + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +fi + +#if [ -d $cur_path/../config/1p_$ASCEND_DEVICE.json ];then +# export RANK_TABLE_FILE=$cur_path/../config/1p_$ASCEND_DEVICE.json +# export RANK_ID=$ASCEND_DEVICE_ID +#else +# export RANK_TABLE_FILE=$cur_path/../config/1p.json +# export RANK_ID=0 +#fi +wait + +#配置文件备份和修改 +cd $cur_path/../ +if [ -f configs/config.py.bak ];then + cp configs/config.py.bak configs/config.py + rm -f configs/config.py.run +else 
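+    # First run: no backup exists yet, so keep a pristine copy of configs/config.py
+    # before the sed edits below rewrite data_path, ckpt_path, train_size and
+    # display_step in place; the backup is moved back after training finishes.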
+ cp configs/config.py configs/config.py.bak + rm -f configs/config.py.run +fi +sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs/config.py +sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py +sed -i "s%59761827%${train_size}%p" configs/config.py +sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py +#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` +cp configs/config.py configs/config.py.run +#训练执行 +start=$(date +%s) +sed -i 's|\#custom_op.parameter_map\[\"deterministic\"\].i = 1|custom_op.parameter_map\[\"deterministic\"\].i = 1|g' train.py +sed -i "s|perform_shuffle=True|perform_shuffle=False|g" train.py + +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait + +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train1_$ASCEND_DEVICE_ID.log 2>&1 & +wait + + +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#配置文件恢复 +mv -f configs/config.py.bak configs/config.py +sed -i 's|custom_op.parameter_map\[\"deterministic\"\].i = 1|\#custom_op.parameter_map\[\"deterministic\"\].i = 1|g' train.py +sed -i "s|perform_shuffle=False|perform_shuffle=True|g" train.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 + +#FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` +time=`grep -rn 'epoch 2 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",'100'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'eval auc' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $8}' |tail -n 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ $precision_mode == "must_keep_origin_dtype" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'deterministic_perf' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'deterministic_perf' +fi +echo "CaseName : $CaseName" + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$time +echo "TrainingTime(ms/step) : $TrainingTime" + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +loss=`grep 'loss =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +loss1=`grep 'loss =' $cur_path/output/${ASCEND_DEVICE_ID}/train1_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss1}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss1.txt + +if diff $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss1.txt >/dev/null;then + acc=1 +else + acc=0 +fi + + 
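+# Hedged addition (not part of the original check): when the byte-wise diff above
+# fails, it can help to log how far apart the two runs actually were. The sketch
+# below assumes one numeric loss value per line in both files; "max_loss_diff" is
+# an illustrative variable name, not something the rest of the suite consumes.
+max_loss_diff=`paste $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss1.txt | awk '{d=$1-$2; if(d<0)d=-d; if(d>m)m=d} END{printf "%g", m+0}'`
+echo "Max abs loss diff between the two runs : ${max_loss_diff}"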
+#最后一个迭代loss值,不需要修改 +ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | tail -n 1` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${acc}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_2_9.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_2_9.sh new file mode 100644 index 000000000..3373fcd59 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_2_9.sh @@ -0,0 +1,235 @@ +#/bin/bash + +#初始结果设置 +result1=true #芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 
Initialization: clean up any leftover workload processes on the host so no stale processes remain
+ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9
+
+# Record the initial HBM usage of device 0
+cp -r /npu/npu_smi ./
+./npu_smi | tee -a npu_smi.log
+HBM_start=`awk NR==4 npu_smi.log | awk {'print $4'}`
+# Allow a small margin for HBM usage fluctuation
+HBM_start=$(( $HBM_start + 10 ))
+echo $HBM_start
+
+for ((i=1;i<=${times};i++))
+do
+    # Pick the kill signal for this round (alternating randomly between kill -2 and kill -9)
+    random_number=$((RANDOM % 2))
+    if [ $random_number -eq 0 ]; then
+        signal=2
+    else
+        signal=9
+    fi
+    echo "------------- start $i --------------------"
+    echo "------------- kill -$signal ---------------"
+    # Reset the per-round result
+    TrainAccuracy=0
+
+    cur_path=`pwd`
+
+    # Launch the workload (adjust per test; here the dump variant of the network is started)
+
+    sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_dump.sh
+    setsid bash train_performance_1p_dump.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 &
+
+    # Record the start time
+    start_time=$(date +%s)
+
+    # Confirm the workload has actually started (adjust per test; here the training log is checked)
+    echo "--------------------------------------" $i
+
+    for s in {1..300}
+    do
+        #### Step information is only printed after the run finishes, so the graph id is the only sign that training has started
+        grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
+        if test $? -eq 0 ;then
+            echo "--------------------------------"
+            rnd=$((RANDOM%60+120))
+            result6=true
+            #sleep $rnd
+            break
+        fi
+        sleep 1
+    done
+    sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_dump.sh
+    # Abort if training never started
+    if [ "$result6" != true ];then
+        echo "业务error" && break
+    fi
+
+    # Interrupt the training process with the chosen signal
+    echo "------------------------------------"
+
+    ps -ef |grep python|grep train_dump.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal
+
+    if [ $signal -eq 9 ];then
+        sleep 5
+    else
+        sleep 1m
+    fi
+
+    # Verify that all related processes have exited
+    count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l`
+    count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l`
+    count2=`ps -ef | grep python | grep train_dump.py | grep -v "grep"| wc -l`
+    echo $count
+    echo $count1
+    echo $count2
+    if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then
+        result5=false
+        echo "进程状态error"
+    fi
+
+    # Collect npu_smi output
+    cp -r /npu/npu_smi ./
+    ./npu_smi | tee -a npu_smi_$i.log
+
+    # Make sure the npu_smi output layout has not changed
+    title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}`
+    title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}`
+    title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}`
+    title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}`
+    title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}`
+
+    if [ "$title1" = "Health" ] && [ "$title2" = "HBM-Usage(MB)" ] && [ "$title3" = "perrorcode" ] && [ "$title4" = "0" ] && [ "$title5" = "7" ];then
+        echo "回显正常无变化"
+    else
+        echo "回显有变,请重新适配脚本"
+        break
+    fi
+
+    for ((j=4;j<=18;j=(j+2)))
+    do
+        # Check chip health status
+        a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}`
+        if [ "$a" != 'OK' ];then
+            result1=false
+            echo "芯片健康状态error"
+        fi
+
+        # Check that HBM memory has been released
+        b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}`
+        if (( $b > $HBM_start ));then
+            echo $b
+            result2=false
+            echo "内存释放error"
+        fi
+
+        # Check for chip alarms
+        c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}`
+        if [[ $c =~ '0x' ]] ;then
+            result3=false
+            echo "告警error"
+        fi
+    done
+
+    # Check for 0xb (sq_fsm) exceptions
+    msnpureport > msnpureport.log
+    num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l)
+    if [ $num != 0 ]; then
+        result4=false
+        
echo "0xb异常error" + fi + + #判断最终结果 + if $result1 && $result2 && $result3 && $result4 &&$result5;then + echo "检查点均通过" + TrainAccuracy=1 + rm -rf 20*-* + else + echo "检查点不通过" && break + fi + +done +rm -rf $cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'dump_kill'_'2_9'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_8_11.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_8_11.sh new file mode 100644 index 000000000..be7a4d0e9 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_dump_kill_8_11.sh @@ -0,0 +1,235 @@ +#/bin/bash + +#初始结果设置 +result1=true #芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 初始化,清除环境上业务进程,确保无残留进程 +ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9 + +#获取0卡初始状态HBM +cp -r /npu/npu_smi ./ +./npu_smi | tee -a npu_smi.log +HBM_start=`awk 
NR==4 npu_smi.log | awk {'print $4'}`
+# Allow a small margin for HBM usage fluctuation
+HBM_start=$(( $HBM_start + 10 ))
+echo $HBM_start
+
+for ((i=1;i<=${times};i++))
+do
+    # Pick the kill signal for this round (alternating randomly between kill -8 and kill -11)
+    random_number=$((RANDOM % 2))
+    if [ $random_number -eq 0 ]; then
+        signal=8
+    else
+        signal=11
+    fi
+    echo "------------- start $i --------------------"
+    echo "------------- kill -$signal ---------------"
+    # Reset the per-round result
+    TrainAccuracy=0
+
+    cur_path=`pwd`
+
+    # Launch the workload (adjust per test; here the dump variant of the network is started)
+
+    sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_dump.sh
+    setsid bash train_performance_1p_dump.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 &
+
+    # Record the start time
+    start_time=$(date +%s)
+
+    # Confirm the workload has actually started (adjust per test; here the training log is checked)
+    echo "--------------------------------------" $i
+
+    for s in {1..300}
+    do
+        #### Step information is only printed after the run finishes, so the graph id is the only sign that training has started
+        grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
+        if test $? -eq 0 ;then
+            echo "--------------------------------"
+            rnd=$((RANDOM%60+120))
+            result6=true
+            #sleep $rnd
+            break
+        fi
+        sleep 1
+    done
+    sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_dump.sh
+    # Abort if training never started
+    if [ "$result6" != true ];then
+        echo "业务error" && break
+    fi
+
+    # Interrupt the training process with the chosen signal
+    echo "------------------------------------"
+
+    ps -ef |grep python|grep train_dump.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal
+
+    if [ $signal -eq 9 ];then
+        sleep 5
+    else
+        sleep 1m
+    fi
+
+    # Verify that all related processes have exited
+    count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l`
+    count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l`
+    count2=`ps -ef | grep python | grep train_dump.py | grep -v "grep"| wc -l`
+    echo $count
+    echo $count1
+    echo $count2
+    if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then
+        result5=false
+        echo "进程状态error"
+    fi
+
+    # Collect npu_smi output
+    cp -r /npu/npu_smi ./
+    ./npu_smi | tee -a npu_smi_$i.log
+
+    # Make sure the npu_smi output layout has not changed
+    title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}`
+    title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}`
+    title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}`
+    title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}`
+    title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}`
+
+    if [ "$title1" = "Health" ] && [ "$title2" = "HBM-Usage(MB)" ] && [ "$title3" = "perrorcode" ] && [ "$title4" = "0" ] && [ "$title5" = "7" ];then
+        echo "回显正常无变化"
+    else
+        echo "回显有变,请重新适配脚本"
+        break
+    fi
+
+    for ((j=4;j<=18;j=(j+2)))
+    do
+        # Check chip health status
+        a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}`
+        if [ "$a" != 'OK' ];then
+            result1=false
+            echo "芯片健康状态error"
+        fi
+
+        # Check that HBM memory has been released
+        b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}`
+        if (( $b > $HBM_start ));then
+            echo $b
+            result2=false
+            echo "内存释放error"
+        fi
+
+        # Check for chip alarms
+        c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}`
+        if [[ $c =~ '0x' ]] ;then
+            result3=false
+            echo "告警error"
+        fi
+    done
+
+    # Check for 0xb (sq_fsm) exceptions
+    msnpureport > msnpureport.log
+    num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l)
+    if [ $num != 0 ]; then
+        result4=false
+        echo "0xb异常error"
+    fi
+
+    # Final verdict for this round
+    if $result1 && $result2 && $result3 && $result4 && $result5;then
+        echo "检查点均通过"
+        TrainAccuracy=1
+        rm -rf 20*-*
+    else
+        echo "检查点不通过" && break
+    fi
+
+done
+rm -rf 
$cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'dump_kill'_'8_11'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_ffts.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_ffts.sh new file mode 100644 index 000000000..f9acd012c --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_ffts.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +#export GE_USE_STATIC_MEMORY=1 + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + + +RANK_ID_START=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=131072 +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 + +#参数配置 +data_path="/npu/traindata/ID2940_CarPeting_TF_WideDeep/" +train_size=13107200 +display_step=10 + +#维持参数,以下不需要修改 +over_dump=False +precision_mode="allow_mix_precision" +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --over_dump if or not over detection, default is False + --data_path source data of training + --train_epochs train epochs + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` +elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +##############执行训练########## +if [ -d $cur_path/output ];then + rm -rf $cur_path/output/* + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +fi + +#if [ -d $cur_path/../config/1p_$ASCEND_DEVICE.json ];then +# export RANK_TABLE_FILE=$cur_path/../config/1p_$ASCEND_DEVICE.json +# export RANK_ID=$ASCEND_DEVICE_ID +#else +# export RANK_TABLE_FILE=$cur_path/../config/1p.json +# export RANK_ID=0 +#fi +wait + +#配置文件备份和修改 +cd $cur_path/../ +if [ -f configs/config.py.bak ];then + cp configs/config.py.bak configs/config.py + rm -f configs/config.py.run +else + cp configs/config.py configs/config.py.bak + rm -f configs/config.py.run +fi +sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs/config.py +sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py +sed -i "s%59761827%${train_size}%p" configs/config.py +sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py +#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` +cp configs/config.py configs/config.py.run +#训练执行 +start=$(date +%s) + +########## ffts ############## +export ASCEND_ENHANCE_ENABLE=1 + +nohup python3 train.py 
--data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#配置文件恢复 +mv -f configs/config.py.bak configs/config.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 + +#FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` +time=`grep -rn 'epoch 2 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",'100'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'eval auc' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $8}' |tail -n 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ $precision_mode == "must_keep_origin_dtype" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'ffts_perf' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'ffts_perf' +fi +echo "CaseName : $CaseName" + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$time +echo "TrainingTime(ms/step) : $TrainingTime" + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +loss=`grep 'loss =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | tail -n 1` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_nobin.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_nobin.sh new file mode 100644 index 000000000..e7e85d569 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_nobin.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +#export GE_USE_STATIC_MEMORY=1 + +#集合通信参数,不需要修改 + +export RANK_SIZE=1 +export JOB_ID=10087 + + +RANK_ID_START=0 + +#基础参数,需要模型审视修改 +#Batch Size +batch_size=131072 +#网络名称,同目录名称 
+Network="WideDeep_ID2712_for_TensorFlow" +#Device数量,单卡默认为1 +RankSize=1 + +#参数配置 +data_path="/npu/traindata/ID2940_CarPeting_TF_WideDeep/" +train_size=13107200 +display_step=10 + +#维持参数,以下不需要修改 +over_dump=False +precision_mode="allow_mix_precision" +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --over_dump if or not over detection, default is False + --data_path source data of training + --train_epochs train epochs + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` +elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +##############执行训练########## +if [ -d $cur_path/output ];then + rm -rf $cur_path/output/* + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +else + mkdir -p $cur_path/output/$ASCEND_DEVICE_ID/ckpt +fi + +#if [ -d $cur_path/../config/1p_$ASCEND_DEVICE.json ];then +# export RANK_TABLE_FILE=$cur_path/../config/1p_$ASCEND_DEVICE.json +# export RANK_ID=$ASCEND_DEVICE_ID +#else +# export RANK_TABLE_FILE=$cur_path/../config/1p.json +# export RANK_ID=0 +#fi +wait + +#配置文件备份和修改 +cd $cur_path/../ +if [ -f configs/config.py.bak ];then + cp configs/config.py.bak configs/config.py + rm -f configs/config.py.run +else + cp configs/config.py configs/config.py.bak + rm -f configs/config.py.run +fi +sed -i "s%/npu/traindata/ID2940_CarPeting_TF_WideDeep_TF%${data_path}%p" configs/config.py +sed -i "s%./model%$cur_path/output/$ASCEND_DEVICE_ID/ckpt%p" configs/config.py +sed -i "s%59761827%${train_size}%p" configs/config.py +sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py +#echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` +cp configs/config.py configs/config.py.run +#训练执行 +start=$(date +%s) + +sed -i 's|\#custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|g' train.py + +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +wait +end=$(date +%s) +e2e_time=$(( $end - $start )) + +#配置文件恢复 +mv -f configs/config.py.bak configs/config.py +sed -i 's|custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|\#custom_op.parameter_map\[\"jit_compile\"\].s = tf.compat.as_bytes(\"true\")|g' train.py + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 + +#FPS=`grep 'fps :' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F' ' '{print $25}' | tail -n 1` +time=`grep -rn 'epoch 2 total time =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F '=' '{print $2}'|sed s/[[:space:]]//g` +FPS=`awk 'BEGIN{printf "%.2f\n",'100'*'${batch_size}'/'${time}'}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'eval auc' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| 
awk -F' ' '{print $8}' |tail -n 1` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +if [[ $precision_mode == "must_keep_origin_dtype" ]];then + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'fp32'_'nobin_perf' +else + CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'nobin_perf' +fi +echo "CaseName : $CaseName" + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=$time +echo "TrainingTime(ms/step) : $TrainingTime" + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +loss=`grep 'loss =' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tr -d '\b\r' | awk -F' ' '{print $9}'|sed 's/,$//'` +echo "${loss}"> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`cat $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt | tail -n 1` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_2_9.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_2_9.sh new file mode 100644 index 000000000..ba8f4f5dc --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_2_9.sh @@ -0,0 +1,235 @@ +#/bin/bash + +#初始结果设置 +result1=true #芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir 
-p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 初始化,清除环境上业务进程,确保无残留进程 +ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9 + +#获取0卡初始状态HBM +cp -r /npu/npu_smi ./ +./npu_smi | tee -a npu_smi.log +HBM_start=`awk NR==4 npu_smi.log | awk {'print $4'}` +#增加20M波动范围 +HBM_start=$(( $HBM_start + 10 )) +echo $HBM_start + +for ((i=1;i<=${times};i++)) +do + # 定义kill的信号量(此处交替执行kill -2和kill -9) + random_number=$((RANDOM % 2)) + if [ $random_number -eq 0 ]; then + signal=2 + else + signal=9 + fi + echo "------------- start $i --------------------" + echo "------------- kill -$signal ---------------" + #设置最终结果 + TrainAccuracy=0 + + cur_path=`pwd` + + # 执行业务(根据具体业务调整,此处执行网络开启overflow) + + sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_overflow.sh + setsid bash train_performance_1p_overflow.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 & + + #执行开始时间 + start_time=$(date +%s) + + # 判断所运行的业务实际开启(根据具体业务调整,此处通过log确认已进入训练) + echo "--------------------------------------" $i + + for s in {1..300} + do + ####因为迭代信息在执行完才会显示,训练过程中不打屏,所以只能校验graph_id + grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + if test $? 
-eq 0 ;then + echo "--------------------------------" + rnd=$((RANDOM%60+120)) + result6=true + #sleep $rnd + break + fi + sleep 1 + done + sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_overflow.sh + #业务状态判断 + if [ $result6 != true ];then + echo "业务error" && break + fi + + # 执行进程kill中断 + echo "------------------------------------" + + ps -ef |grep python|grep train_overflow.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal + + if [ -$signal == 9 ];then + sleep 5 + else + sleep 1m + fi + + #判断进程状态是否均已退出 + count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l` + count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l` + count2=`ps -ef | grep python | grep train_overflow.py | grep -v "grep"| wc -l` + echo $count + echo $count1 + echo $count2 + if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then + result5=false + echo "进程状态error" + fi + + #收集smi + cp -r /npu/npu_smi ./ + ./npu_smi | tee -a npu_smi_$i.log + + #查看npu_smi回显是否有变化 + title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}` + title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}` + title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}` + title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}` + title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}` + + if [ "$title1"="Health" ] && [ "$title2"=="HBM-Usage(MB)" ] && [ "$title3"=="perrorcode" ] && [ "$title4"==0 ] && [ "$title5"==7 ];then + echo "回显正常无变化" + else + echo "回显有变,请重新适配脚本" + break + fi + + for ((j=4;j<=18;j=(j+2))) + do + # 检查芯片健康状态 + a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}` + if [ $a != 'OK' ];then + result1=false + echo "芯片健康转态error" + fi + + # 检查芯片HBM内存释放情况 + b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}` + if (( $b > $HBM_start ));then + echo $b + result2=false + echo "内存释放error" + fi + + # 检查芯片告警情况 + c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}` + if [[ $c =~ '0x' ]] ;then + result3=false + echo "告警error" + fi + done + + + # 检查0xb异常 + msnpureport > msnpureport.log + num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l) + if [ $num != 0 ]; then + result4=false + echo "0xb异常error" + fi + + #判断最终结果 + if $result1 && $result2 && $result3 && $result4 &&$result5;then + echo "检查点均通过" + TrainAccuracy=1 + rm -rf 20*-* + else + echo "检查点不通过" && break + fi + +done +rm -rf $cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'overflow_kill'_'2_9'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_8_11.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_8_11.sh new file mode 100644 index 000000000..61a434658 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_overflow_kill_8_11.sh @@ -0,0 +1,235 @@ +#/bin/bash + +#初始结果设置 +result1=true 
#芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 初始化,清除环境上业务进程,确保无残留进程 +ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9 + +#获取0卡初始状态HBM +cp -r /npu/npu_smi ./ +./npu_smi | tee -a npu_smi.log +HBM_start=`awk NR==4 npu_smi.log | awk {'print $4'}` +#增加20M波动范围 +HBM_start=$(( $HBM_start + 10 )) +echo $HBM_start + +for ((i=1;i<=${times};i++)) +do + # 定义kill的信号量(此处交替执行kill -2和kill -9) + random_number=$((RANDOM % 2)) + if [ $random_number -eq 0 ]; then + signal=8 + else + signal=11 + fi + echo "------------- start $i --------------------" + echo "------------- kill -$signal ---------------" + #设置最终结果 + TrainAccuracy=0 + + cur_path=`pwd` + + # 执行业务(根据具体业务调整,此处执行网络开启overflow) + + sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_overflow.sh + setsid bash train_performance_1p_overflow.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 & + + #执行开始时间 + start_time=$(date +%s) + + # 判断所运行的业务实际开启(根据具体业务调整,此处通过log确认已进入训练) + echo "--------------------------------------" $i + + for s in {1..300} + do + ####因为迭代信息在执行完才会显示,训练过程中不打屏,所以只能校验graph_id + grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + if test $? 
-eq 0 ;then + echo "--------------------------------" + rnd=$((RANDOM%60+120)) + result6=true + #sleep $rnd + break + fi + sleep 1 + done + sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_overflow.sh + #业务状态判断 + if [ $result6 != true ];then + echo "业务error" && break + fi + + # 执行进程kill中断 + echo "------------------------------------" + + ps -ef |grep python|grep train_overflow.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal + + if [ -$signal == 9 ];then + sleep 5 + else + sleep 1m + fi + + #判断进程状态是否均已退出 + count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l` + count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l` + count2=`ps -ef | grep python | grep train_overflow.py | grep -v "grep"| wc -l` + echo $count + echo $count1 + echo $count2 + if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then + result5=false + echo "进程状态error" + fi + + #收集smi + cp -r /npu/npu_smi ./ + ./npu_smi | tee -a npu_smi_$i.log + + #查看npu_smi回显是否有变化 + title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}` + title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}` + title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}` + title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}` + title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}` + + if [ "$title1"="Health" ] && [ "$title2"=="HBM-Usage(MB)" ] && [ "$title3"=="perrorcode" ] && [ "$title4"==0 ] && [ "$title5"==7 ];then + echo "回显正常无变化" + else + echo "回显有变,请重新适配脚本" + break + fi + + for ((j=4;j<=18;j=(j+2))) + do + # 检查芯片健康状态 + a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}` + if [ $a != 'OK' ];then + result1=false + echo "芯片健康转态error" + fi + + # 检查芯片HBM内存释放情况 + b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}` + if (( $b > $HBM_start ));then + echo $b + result2=false + echo "内存释放error" + fi + + # 检查芯片告警情况 + c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}` + if [[ $c =~ '0x' ]] ;then + result3=false + echo "告警error" + fi + done + + + # 检查0xb异常 + msnpureport > msnpureport.log + num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l) + if [ $num != 0 ]; then + result4=false + echo "0xb异常error" + fi + + #判断最终结果 + if $result1 && $result2 && $result3 && $result4 &&$result5;then + echo "检查点均通过" + TrainAccuracy=1 + rm -rf 20*-* + else + echo "检查点不通过" && break + fi + +done +rm -rf $cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'overflow_kill'_'8_11'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_2_9.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_2_9.sh new file mode 100644 index 000000000..0ca6deac3 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_2_9.sh @@ -0,0 +1,235 @@ +#/bin/bash + +#初始结果设置 +result1=true 
#芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 初始化,清除环境上业务进程,确保无残留进程 +ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9 + +#获取0卡初始状态HBM +cp -r /npu/npu_smi ./ +./npu_smi | tee -a npu_smi.log +HBM_start=`awk NR==4 npu_smi.log | awk {'print $4'}` +#增加20M波动范围 +HBM_start=$(( $HBM_start + 10 )) +echo $HBM_start + +for ((i=1;i<=${times};i++)) +do + # 定义kill的信号量(此处交替执行kill -2和kill -9) + random_number=$((RANDOM % 2)) + if [ $random_number -eq 0 ]; then + signal=2 + else + signal=9 + fi + echo "------------- start $i --------------------" + echo "------------- kill -$signal ---------------" + #设置最终结果 + TrainAccuracy=0 + + cur_path=`pwd` + + # 执行业务(根据具体业务调整,此处执行网络开启overflow) + + sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_profiling2.sh + setsid bash train_performance_1p_profiling2.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 & + + #执行开始时间 + start_time=$(date +%s) + + # 判断所运行的业务实际开启(根据具体业务调整,此处通过log确认已进入训练) + echo "--------------------------------------" $i + + for s in {1..300} + do + ####因为迭代信息在执行完才会显示,训练过程中不打屏,所以只能校验graph_id + grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + if test $? 
-eq 0 ;then + echo "--------------------------------" + rnd=$((RANDOM%60+120)) + result6=true + #sleep $rnd + break + fi + sleep 1 + done + sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_profiling2.sh + #业务状态判断 + if [ $result6 != true ];then + echo "业务error" && break + fi + + # 执行进程kill中断 + echo "------------------------------------" + + ps -ef |grep python|grep train_profiling.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal + + if [ "$signal" -eq 9 ];then + sleep 5 + else + sleep 1m + fi + + #判断进程状态是否均已退出 + count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l` + count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l` + count2=`ps -ef | grep python | grep train_profiling.py | grep -v "grep"| wc -l` + echo $count + echo $count1 + echo $count2 + if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then + result5=false + echo "进程状态error" + fi + + #收集smi + cp -r /npu/npu_smi ./ + ./npu_smi | tee -a npu_smi_$i.log + + #查看npu_smi回显是否有变化 + title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}` + title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}` + title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}` + title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}` + title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}` + + if [ "$title1" = "Health" ] && [ "$title2" = "HBM-Usage(MB)" ] && [ "$title3" = "perrorcode" ] && [ "$title4" = "0" ] && [ "$title5" = "7" ];then + echo "回显正常无变化" + else + echo "回显有变,请重新适配脚本" + break + fi + + for ((j=4;j<=18;j=(j+2))) + do + # 检查芯片健康状态 + a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}` + if [ "$a" != 'OK' ];then + result1=false + echo "芯片健康状态error" + fi + + # 检查芯片HBM内存释放情况 + b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}` + if (( $b > $HBM_start ));then + echo $b + result2=false + echo "内存释放error" + fi + + # 检查芯片告警情况 + c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}` + if [[ $c =~ '0x' ]] ;then + result3=false + echo "告警error" + fi + done + + + # 检查0xb异常 + msnpureport > msnpureport.log + num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l) + if [ $num != 0 ]; then + result4=false + echo "0xb异常error" + fi + + #判断最终结果 + if $result1 && $result2 && $result3 && $result4 && $result5;then + echo "检查点均通过" + TrainAccuracy=1 + rm -rf 20*-* + else + echo "检查点不通过" && break + fi + +done +rm -rf $cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'profiling_kill'_'2_9'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_8_11.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_8_11.sh new file mode 100644 index 000000000..ae0dc9463 --- /dev/null +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p_profiling_kill_8_11.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +#初始结果设置
+result1=true #芯片健康结果 +result2=true #HBM内存结果 +result3=true #告警结果 +result4=true #0xb异常结果 +result5=true #进程状态结果 +result6=false #业务状态结果 + +#设置最终结果 +TrainAccuracy=0 + +#网络名称,同目录名称 +Network="WideDeep_ID2712_for_TensorFlow" +#训练batch_size +batch_size=131072 +#RANK_SIZE +export RANK_SIZE=1 + +#设置环境变量 +#source /usr/local/Ascend/latest/bin/setenv.bash +#export HCCL_EXEC_TIMEOUT=68 +#export HCCL_CONNECT_TIMEOUT=300 + +#清除环境初始日志 +rm -rf /root/ascend/log/* + +#coredump设置 +echo /npu/coredump/core.%t.%e.%p > /proc/sys/kernel/core_pattern +echo 1 > /proc/sys/fs/suid_dumpable +ulimit -c unlimited + +#业务环境变量 +export HCCL_DIAGNOSE_ENABLE=0 +export HCCL_OP_BASE_FFTS_MODE_ENABLE=False + +#压测次数 +times=20 +#docker适配 +docker_enable="false" + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --gather* ]];then + export gather=1 + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --dispatcher_type* ]];then + dispatcher_type=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + elif [[ $para == --fp32 ]];then + fp32=`echo ${para#*=}` + elif [[ $para == --hf32 ]];then + hf32=`echo ${para#*=}` + elif [[ $para == --conda_name* ]];then + conda_name=`echo ${para#*=}` + elif [[ $para == --train_iters* ]];then + train_iters=`echo ${para#*=}` + elif [[ $para == --times* ]];then + times=`echo ${para#*=}` + elif [[ $para == --docker_enable* ]]; then + docker_enable=`echo ${para#*=}` + fi +done + +# 初始化,清除环境上业务进程,确保无残留进程 +ps -ef |grep python|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -9 + +#获取0卡初始状态HBM +cp -r /npu/npu_smi ./ +./npu_smi | tee -a npu_smi.log +HBM_start=`awk NR==4 npu_smi.log | awk {'print $4'}` +#增加20M波动范围 +HBM_start=$(( $HBM_start + 10 )) +echo $HBM_start + +for ((i=1;i<=${times};i++)) +do + # 定义kill的信号量(此处交替执行kill -2和kill -9) + random_number=$((RANDOM % 2)) + if [ $random_number -eq 0 ]; then + signal=8 + else + signal=11 + fi + echo "------------- start $i --------------------" + echo "------------- kill -$signal ---------------" + #设置最终结果 + TrainAccuracy=0 + + cur_path=`pwd` + + # 执行业务(根据具体业务调整,此处执行网络开启overflow) + + sed -i "s|train_size=13107200|train_size=1310720000000000|g" $cur_path/train_performance_1p_profiling2.sh + setsid bash train_performance_1p_profiling2.sh --precision_mode=must_keep_origin_dtype --conda_name=py2 --data_path=/npu/traindata/ID2940_CarPeting_TF_WideDeep/ >/dev/null 2>&1 & + + #执行开始时间 + start_time=$(date +%s) + + # 判断所运行的业务实际开启(根据具体业务调整,此处通过log确认已进入训练) + echo "--------------------------------------" $i + + for s in {1..300} + do + ####因为迭代信息在执行完才会显示,训练过程中不打屏,所以只能校验graph_id + grep -rn "current graph id is: 51" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log + if test $? 
-eq 0 ;then + echo "--------------------------------" + rnd=$((RANDOM%60+120)) + result6=true + #sleep $rnd + break + fi + sleep 1 + done + sed -i "s|train_size=1310720000000000|train_size=13107200|g" $cur_path/train_performance_1p_profiling2.sh + #业务状态判断 + if [ $result6 != true ];then + echo "业务error" && break + fi + + # 执行进程kill中断 + echo "------------------------------------" + + ps -ef |grep python|grep train_profiling.py|grep -v grep|grep -v network_analyse_tool|grep -v culster_task_exec|cut -c 9-16 | xargs kill -$signal + + if [ "$signal" -eq 9 ];then + sleep 5 + else + sleep 1m + fi + + #判断进程状态是否均已退出 + count=`ps -ef |grep python |grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec|grep -v "grep" |wc -l` + count1=`ps -ef | grep python | grep multiprocessing|grep -v network_analyse_tool|grep -v culster_task_exec| grep -v "grep"| wc -l` + count2=`ps -ef | grep python | grep train_profiling.py | grep -v "grep"| wc -l` + echo $count + echo $count1 + echo $count2 + if [ $count -ne 0 ] || [ $count1 -ne 0 ] || [ $count2 -ne 0 ];then + result5=false + echo "进程状态error" + fi + + #收集smi + cp -r /npu/npu_smi ./ + ./npu_smi | tee -a npu_smi_$i.log + + #查看npu_smi回显是否有变化 + title1=`awk NR==2 npu_smi_$i.log | awk {'print $2'}` + title2=`awk NR==2 npu_smi_$i.log | awk {'print $4'}` + title3=`awk NR==2 npu_smi_$i.log | awk {'print $10'}` + title4=`awk NR==4 npu_smi_$i.log | awk {'print $1'}` + title5=`awk NR==18 npu_smi_$i.log | awk {'print $1'}` + + if [ "$title1" = "Health" ] && [ "$title2" = "HBM-Usage(MB)" ] && [ "$title3" = "perrorcode" ] && [ "$title4" = "0" ] && [ "$title5" = "7" ];then + echo "回显正常无变化" + else + echo "回显有变,请重新适配脚本" + break + fi + + for ((j=4;j<=18;j=(j+2))) + do + # 检查芯片健康状态 + a=`awk NR==${j} npu_smi_$i.log | awk {'print $2'}` + if [ "$a" != 'OK' ];then + result1=false + echo "芯片健康状态error" + fi + + # 检查芯片HBM内存释放情况 + b=`awk NR==${j} npu_smi_$i.log | awk {'print $4'}` + if (( $b > $HBM_start ));then + echo $b + result2=false + echo "内存释放error" + fi + + # 检查芯片告警情况 + c=`awk NR==${j} npu_smi_$i.log | awk {'print $10'}` + if [[ $c =~ '0x' ]] ;then + result3=false + echo "告警error" + fi + done + + + # 检查0xb异常 + msnpureport > msnpureport.log + num=$(grep -rn "sq_fsm=0xb" 20*-*|wc -l) + if [ $num != 0 ]; then + result4=false + echo "0xb异常error" + fi + + #判断最终结果 + if $result1 && $result2 && $result3 && $result4 && $result5;then + echo "检查点均通过" + TrainAccuracy=1 + rm -rf 20*-* + else + echo "检查点不通过" && break + fi + +done +rm -rf $cur_path/output/$ASCEND_DEVICE_ID/*perf.log +CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'profiling_kill'_'8_11'_'acc' + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${TrainAccuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee From 446ce853525a2509eacd23fc05dde4f526ef2f9e Mon Sep 17 00:00:00 2001 From: majun121 <867479212@qq.com> Date: Thu, 1 Aug 2024 09:09:48 +0000 Subject: [PATCH 2/2] update TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py.
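Both switches are added commented out, so the default training path is unchanged; they are only meant to be flipped on for the dedicated jit/deterministic test runs. As a hypothetical illustration (not part of this patch; the wrapper that performs the toggle is not shown here), a test shell could enable the deterministic switch for a single run and restore the default afterwards:

sed -i 's|#custom_op.parameter_map\["deterministic"\].i = 1|custom_op.parameter_map["deterministic"].i = 1|' train.py
# ... launch the training run here ...
sed -i 's|custom_op.parameter_map\["deterministic"\].i = 1|#custom_op.parameter_map["deterministic"].i = 1|' train.py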
Signed-off-by: majun121 <867479212@qq.com> --- .../recommendation/WideDeep_ID2712_for_TensorFlow/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 8f2d1cd5b..06d38e298 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -339,6 +339,8 @@ if __name__ == '__main__': custom_op.parameter_map["optypelist_for_implmode"].s = tf.compat.as_bytes("UnsortedSegmentSum,GatherV2") custom_op.parameter_map["enable_data_pre_proc"].b = True ##True getNext false在host侧 #custom_op.parameter_map["mix_compile_mode"].b = True #开启混合计算,根据实际情况配置 + #custom_op.parameter_map["jit_compile"].s = tf.compat.as_bytes("true") + #custom_op.parameter_map["deterministic"].i = 1 custom_op.parameter_map["use_off_line"].b = True custom_op.parameter_map["min_group_size"].b = 1 custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(args.precision_mode) -- Gitee
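Note for maintainers of the *_kill_* shells above: the per-iteration npu_smi checks (layout sanity, chip health, HBM release against the recorded baseline, alarm codes) are repeated inline in every script. Below is a minimal standalone sketch of the same checks with correctly spaced test operators. The log layout (header fields on line 2, one device row on every even line from 4 to 18, HBM usage in column 4, error code in column 10), the expected "Health" header, and the /npu/npu_smi wrapper are assumptions carried over from those scripts and should be re-verified against the real tool output before use.

#!/bin/bash
# check_npu_smi.sh -- consolidated sketch of the per-device checks that the *_kill_*
# shells perform on the npu_smi output. Row/column layout and expected strings are
# assumptions taken from those shells, not a verified npu-smi format.
# Usage: ./check_npu_smi.sh <npu_smi_log> <hbm_baseline_mb>

log_file="$1"
hbm_baseline="$2"
ok=true

# Layout sanity check. Spaces around '=' matter: [ "$x"="y" ] collapses into a
# single non-empty-string test that always succeeds, not a comparison.
if [ "$(awk 'NR==2 {print $2}' "$log_file")" != "Health" ]; then
    echo "npu_smi layout changed, re-adapt the checks" >&2
    exit 2
fi

for ((row=4; row<=18; row+=2)); do
    health=$(awk -v r="$row" 'NR==r {print $2}' "$log_file")
    hbm=$(awk -v r="$row" 'NR==r {print $4}' "$log_file")
    errcode=$(awk -v r="$row" 'NR==r {print $10}' "$log_file")

    # Chip health flag must be OK.
    if [ "$health" != "OK" ]; then
        echo "row $row: health check failed (${health:-empty})"; ok=false
    fi
    # HBM usage should have fallen back to (or below) the recorded baseline.
    if (( hbm > hbm_baseline )); then
        echo "row $row: HBM not released (${hbm} MB > ${hbm_baseline} MB)"; ok=false
    fi
    # Any 0x error/alarm code counts as a failure.
    if [[ "$errcode" == *0x* ]]; then
        echo "row $row: alarm code ${errcode}"; ok=false
    fi
done

if $ok; then
    echo "all checks passed"
else
    exit 1
fi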