From 0858e114e0cea382f4dde68c5a8fa89742c6759b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?=
Date: Thu, 4 May 2023 09:03:46 +0000
Subject: [PATCH 1/2] Add train_performance_bs48_1p.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子
---
 .../test/train_performance_bs48_1p.sh         | 179 ++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs48_1p.sh

diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs48_1p.sh b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs48_1p.sh
new file mode 100644
index 000000000..9d22e92f3
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/test/train_performance_bs48_1p.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+
+# Current path; no modification needed
+cur_path=`pwd`
+
+# Collective communication parameters; no modification needed
+export RANK_SIZE=1
+export JOB_ID=99990001
+export RANK_ID=1
+export HCCL_CONNECT_TIMEOUT=600
+RANK_ID_START=0
+
+# Dataset path; keep empty here, no modification needed
+data_path=""
+
+# Set the default log level; no modification needed
+export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+
+# Basic parameters; review and modify per model
+# Network name, same as the directory name
+Network="ResNet50_ID0058_for_TensorFlow"
+# Training epochs
+train_epochs=1
+# Training batch size
+batch_size=48
+# Training steps
+train_steps=2000
+# Learning rate
+learning_rate=
+
+# Maintenance/test parameters; precision_mode should be reviewed per model
+#precision_mode="allow_mix_precision"
+# Fixed parameters; no modification needed below
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help message; no modification needed
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage: ./train_performance_bs48_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              whether to enable overflow detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              whether to enable profiling for performance debugging, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data path for training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Parameter validation; no modification needed
+for para in $*
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --autotune* ]];then
+        autotune=`echo ${para#*=}`
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# Check that data_path has been passed in; no modification needed
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# Patch the config parameters
+sed -i "50s|PATH_TO_BE_CONFIGURED|${data_path}|g" $cur_path/../src/configs/res50_bs48_1p.py
+sed -i "107s|PATH_TO_BE_CONFIGURED|${cur_path}/output/0/d\_solution/ckpt0|g" $cur_path/../src/configs/res50_bs48_1p.py
+
+cp data_loader.py $cur_path/../src/data_loader/resnet50/
+# Training start time; no modification needed
+start_time=$(date +%s)
+cd $cur_path/../
+# Enter the training script directory; review and modify per model
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    # Set environment variables; no modification needed
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    export RANK_ID=$RANK_ID
+    export DEVICE_INDEX=$RANK_ID
+
+    # Create the DeviceID output directory; no modification needed
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+    fi
+
+    # Run the training script; the arguments below need no modification, others should be reviewed per model
+    #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path,--autotune
+    nohup python3.7 ${cur_path}/../src/mains/res50.py --config_file=res50_bs48_1p \
+        --max_train_steps=${train_steps} \
+        --iterations_per_loop=100 \
+        --debug=True \
+        --eval=False \
+        --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait

+# Training end time; no modification needed
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Restore the config parameters
+sed -i "50s|${data_path}|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_bs48_1p.py
+sed -i "107s|${cur_path}/output/0/d\_solution/ckpt0|PATH_TO_BE_CONFIGURED|g" $cur_path/../src/configs/res50_bs48_1p.py
+
+# Print results; no modification needed
+echo "------------------ Final result ------------------"
+# Output performance (FPS); review and modify per model
+FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "FPS: " | awk -F "FPS: " '{print $2}' | awk -F " loss:" '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'`
+# Print; no modification needed
+echo "Final Performance images/sec : $FPS"
+
+# Output training accuracy; review and modify per model
+#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'`
+# Print; no modification needed
+#echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of stability/accuracy monitoring results
+# Training case information; no modification needed
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data
+# Throughput; no modification needed
+ActualFPS=${FPS}
+# Training time per iteration (ms); no modification needed
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+
+# Extract loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "FPS: " $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' | awk -F "total" '{print $1}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration; no modification needed
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information to ${CaseName}.log; no modification needed
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
-- 
Gitee


From 47fdb98e6c64982311d90e65e1292b914fffb7c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Spencer=E5=85=94=E5=AD=90?=
Date: Thu, 4 May 2023 09:05:09 +0000
Subject: [PATCH 2/2] Add res50_bs48_1p.py.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Spencer兔子
---
 .../src/configs/res50_bs48_1p.py              | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/src/configs/res50_bs48_1p.py

diff --git a/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/src/configs/res50_bs48_1p.py b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/src/configs/res50_bs48_1p.py
new file mode 100644
index 000000000..220843ebc
--- /dev/null
+++ b/TensorFlow/built-in/cv/image_classification/ResNet50_ID0058_for_TensorFlow/src/configs/res50_bs48_1p.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import tensorflow as tf
+
+import os
+log_dir = './results/'+os.path.basename(__file__).split('.')[0]
+
+#256
+config = {
+    # ============ for testing =====================
+    'accelerator': '1980',    # 'gpu', '1980'
+    'shuffle_enable': 'yes',
+    'shuffle_buffer_size': 10000,
+    'rank_size': 1,
+    'shard': False,
+
+    # ======= basic config ======= #
+    'mode': 'train',                     # "train", "evaluate", "train_and_evaluate"
+    'epochs_between_evals': 4,           # used if mode is "train_and_evaluate"
+    'stop_threshold': 80.0,              # used if mode is "train_and_evaluate"
+    'data_dir': '/opt/npu/resnet_data_new',
+    'data_url': 'file://PATH_TO_BE_CONFIGURED',
+    'data_type': 'TFRECORD',
+    'model_name': 'resnet50',
+    'num_classes': 1001,
+    'num_epochs': None,
+    'height': 224,
+    'width': 224,
+    'dtype': tf.float32,
+    'data_format': 'channels_last',
+    'use_nesterov': True,
+    'eval_interval': 1,
+    'loss_scale': 1024,                  # could be float or string. If float, static loss scaling is applied.
+                                         # If string, the corresponding automatic loss scaling algorithm is used.
+                                         # Must be one of 'Backoff' or 'LogMax' (case insensitive).
+    'use_lars': False,
+    'label_smoothing': 0.1,              # If greater than 0 then smooth the labels.
+    'weight_decay': 0.0001,
+    'batch_size': 48,                    # minibatch size per node, total batchsize = batch_size*hvd.size()*itersize
+
+    'momentum': [0.9],
+
+    #======= data processing config =======
+    'min_object_covered': 0.1,           # used for random crop
+    'aspect_ratio_range': [3. / 4., 4. / 3.],
+    'area_range': [0.16, 1.0],
+    'max_attempts': 100,
+
+    #======= data augment config =======
+    'increased_aug': False,
+    'brightness': 0.3,
+    'saturation': 0.6,
+    'contrast': 0.6,
+    'hue': 0.13,
+    'num_preproc_threads': 22,
+
+    #======= initialization config =======
+    'conv_init': tf.variance_scaling_initializer(),
+    'bn_init_mode': 'adv_bn_init',       # "conv_bn_init" or "adv_bn_init", initializes the gamma in BN in different modes
+                                         # "adv_bn_init" means initialize gamma to 0 in each residual block's last bn, and initialize other gamma to 1
+                                         # "conv_bn_init" means initialize all the gamma to a constant, defined by "bn_gamma_initial_value"
+    'bn_gamma_initial_value': 1.0,
+
+    #======== model architecture ==========
+    'resnet_version': 'v1.5',
+    'arch_type': 'original',             # ------ input -------
+                                         # C1,C2,C3: input block, stride in different layer
+                                         # ------ shortcut ------
+                                         # D1: average_pooling + conv1*1 in shortcut in downsample block
+                                         # D2: conv3*3,stride=2 in shortcut in downsample block
+                                         # D3: conv1*1 + average_pooling in shortcut in downsample block
+                                         # ------ mainstream ----
+                                         # E1: average_pooling + conv3*3 in mainstream in downsample block
+                                         # E2: conv3*3 + average_pooling in mainstream in downsample block
+
+    #======= logger config =======
+    'display_every': 1,
+    'log_name': 'resnet50.log',
+    'log_dir': 'PATH_TO_BE_CONFIGURED',
+
+    #======= Learning Rate Config =======
+    'lr_warmup_mode': 'linear',          # "linear" or "cosine"
+    'warmup_lr': 0.0,
+    'warmup_epochs': 10,
+    'learning_rate_maximum': 0.1,
+
+    'lr_decay_mode': 'cosine',           # "steps", "poly", "poly_cycle", "cosine", "linear_cosine", "linear_twice", "constant" for 1980 only
+    'learning_rate_end': 0.00001,
+
+    'decay_steps': '10,20,30',           # for "steps"
+    'lr_decay_steps': '6.4,0.64,0.064',
+
+    'ploy_power': 2.0,                   # for "poly" and "poly_cycle"
+
+    'cdr_first_decay_ratio': 0.33,       # for "cosine_decay_restarts"
+    'cdr_t_mul': 2.0,
+    'cdr_m_mul': 0.1,
+
+    'lc_periods': 0.47,                  # for "linear_cosine"
+    'lc_beta': 0.00001,
+
+    'lr_mid': 0.5,                       # for "linear_twice"
+    'epoch_mid': 80,
+
+    'bn_lr_scale': 1.0,
+
+  }
+
+def res50_config():
+    config['global_batch_size'] = config['batch_size'] * config['rank_size']
+    config['do_checkpoint'] = True
+
+    return config
\ No newline at end of file
-- 
Gitee