diff --git a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh index 7c6036a715270f99169e8cc384ba8ae12af26a60..a9c3708955c785f98159cfa872157538f6aee1a8 100644 --- a/cv/classification/inceptionv3/tensorflow/get_num_devices.sh +++ b/cv/classification/inceptionv3/tensorflow/get_num_devices.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -19,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh index 1787fb081eb448fe832764edeb9fb0fc606123ba..1abf5d2c05bc628985e1e32ede54848e3829e366 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,13 +106,13 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - wait ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + wait ${pid} + done + exit 0 } ################################################# @@ -116,9 +120,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -127,26 +132,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh index 12337021f8e73fc39f05e2e487812626730ca96c..9c44c920f824220dc29d47550e4e5d33aedc547f 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,10 +53,14 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1 --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh index c48d0cbfd97bf0571cc52dfa9a8a8686e5178b07..6a8938ec5f1daf3bbdb0ad57d3151518d2aa171e 100644 --- a/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh +++ b/cv/classification/inceptionv3/tensorflow/run_train_inception3_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=rmsprop -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/inception3_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,11 +55,14 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=inception3 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow/benchmark_cnn.py b/cv/classification/resnet50/tensorflow/benchmark_cnn.py index 6f65ea69b46f479a649c81aaddc797f30809c1ae..7f6c1db4c5ee81194c43a463c5bfad1235b0a04f 100644 --- a/cv/classification/resnet50/tensorflow/benchmark_cnn.py +++ b/cv/classification/resnet50/tensorflow/benchmark_cnn.py @@ -31,9 +31,11 @@ import re import threading import time import traceback +import sys from absl import flags as absl_flags import numpy as np +import math import six from six.moves import xrange # pylint: disable=redefined-builtin @@ -881,6 +883,9 @@ def benchmark_one_step(sess, lossval = results['average_loss'] else: lossval = 0. + if not math.isfinite(lossval): + print("Loss is {}, stopping training".format(lossval)) + sys.exit(1) if image_producer is not None: image_producer.notify_image_consumption() train_time = time.time() - start_time diff --git a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh index 98639e5c9f5656c7a46bcc5a1f00609c1170a3f9..f4f48223c71319eb1ca461986d6f81850b57b212 100644 --- a/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/resnet50/tensorflow/run_train_distributed_imagenette.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,7 +14,6 @@ # License for the specific language governing permissions and limitations # under the License. - bash ./get_imagenette.sh export TF_CUDNN_USE_AUTOTUNE=1 @@ -43,7 +42,7 @@ done echo "## Training model: ${model}" -: ${BATCH_SIZE:=32} +: ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum @@ -111,6 +110,7 @@ function ctrl_c() { for pid in "${pid_list[@]}"; do echo "Killing pid ${pid}" kill ${pid} + wait ${pid} done exit 0 } diff --git a/cv/classification/vgg/tensorflow/get_num_devices.sh b/cv/classification/vgg/tensorflow/get_num_devices.sh index 14d6c0a5a7dc9a1b93e5b389d715c36aee2aa618..a9c3708955c785f98159cfa872157538f6aee1a8 100644 --- a/cv/classification/vgg/tensorflow/get_num_devices.sh +++ b/cv/classification/vgg/tensorflow/get_num_devices.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2023, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +#!/bin/bash +# Copyright (c) 2023-2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -18,8 +19,8 @@ if [ -n "$devices" ]; then _devices=(${devices//,/ }) num_devices=${#_devices[@]} else - num_devices=8 - export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" fi -export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} \ No newline at end of file +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh index 36b41ee8530f24ce9301138963981817ecceb4de..c7642b98633c5e291901347fd34bef03758bd635 100644 --- a/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_distributed_imagenette.sh @@ -25,7 +25,8 @@ export TF_CPP_MIN_LOG_LEVEL=1 i=0 model="alexnet" -for arg in "$@"; do +for arg in "$@" +do if [ $i -eq 0 ]; then model=$arg let i++ @@ -40,11 +41,12 @@ for arg in "$@"; do done echo "## Training model: ${model}" + : ${BATCH_SIZE:=16} # TRAIN_EPOCHS=10 # optional optimizer: momentum, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/${model}_distributed" DATA_DIR=./imagenette @@ -56,7 +58,8 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi @@ -66,7 +69,7 @@ check_status() { # Prepare devices ################################################# devices=$CUDA_VISIBLE_DEVICES -if [ -n "$devices" ]; then +if [ -n "$devices" ]; then devices=(${devices//,/ }) num_devices=${#devices[@]} else @@ -86,7 +89,8 @@ fi ################################################# worker_hosts="" i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do if [ "$i" == "0" ]; then let i++ continue @@ -102,12 +106,12 @@ echo "worker_hosts: ${worker_hosts}" ################################################# trap ctrl_c INT function ctrl_c() { - echo "*** Trapped CTRL-C, killing process running background" - for pid in "${pid_list[@]}"; do - echo "Killing pid ${pid}" - kill ${pid} - done - exit 0 + echo "*** Trapped CTRL-C, killing process running background" + for pid in "${pid_list[@]}"; do + echo "Killing pid ${pid}" + kill ${pid} + done + exit 0 } ################################################# @@ -115,9 +119,10 @@ function ctrl_c() { ################################################# pid_list=() -last_device=$(expr ${num_devices} - 1) +last_device=`expr ${num_devices} - 1` i=0 -for device in "${devices[@]}"; do +for device in "${devices[@]}"; +do job_name="worker" if [ "${i}" == "0" ]; then job_name="ps" @@ -126,26 +131,30 @@ for device in "${devices[@]}"; do if [ ${i} -le 1 ]; then task_index=0 else - task_index=$(expr ${i} - 1) + task_index=`expr ${i} - 1` fi if [ "${i}" == "${last_device}" ]; then - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated \ - --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log - [[ ${PIPESTATUS[0]} == 0 ]] || exit + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model} \ + --variable_update=distributed_replicated \ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit echo "Distributed training PID ($!) on device ${device} where job name = ${job_name}" else - CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW \ - --optimizer=${OPTIMIZER} --datasets_use_prefetch=False --local_parameter_device=gpu --num_gpus=${num_devices} \ - --batch_size=${BATCH_SIZE} --model=${model} \ - --variable_update=distributed_replicated --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}" \ - --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & + CUDA_VISIBLE_DEVICES=${device} UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW \ + --optimizer=${OPTIMIZER} --datasets_use_prefetch=False\ + --local_parameter_device=gpu --num_gpus=${num_devices}\ + --batch_size=${BATCH_SIZE} --model=${model}\ + --variable_update=distributed_replicated\ + --job_name=${job_name} --ps_hosts=127.0.0.1:50000 --worker_hosts="${worker_hosts}"\ + --train_dir=${TRAIN_DIR} --task_index=${task_index} --print_training_accuracy=True "${new_args[@]}" & echo "Distributed training PID ($!) on device ${device} where job name = ${job_name} and task_index = ${task_index}" fi let i++ diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh index aeea001bcade9882bd62a66aea792200898842e2..343dbe372b175d10e7a6c6855e3348125e0d117d 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -51,13 +53,15 @@ for arg in "$@"; do let i++ done -python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1 \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 \ - --datasets_use_caching --stop_at_top_1_accuracy=0.9 --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=1\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh index 4de70ff725f1a22d56f1e8e80f9f3ca73d1b3fff..d4c2bcd2ab01f05ee348fbf32c330596dd1b4d88 100644 --- a/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh +++ b/cv/classification/vgg/tensorflow/run_train_vgg16_multigpu_imagenette.sh @@ -23,7 +23,7 @@ export TF_CPP_MIN_LOG_LEVEL=1 #TRAIN_EPOCHS=10 # optional optimizer: adam, rmsprop, momentum, sgd OPTIMIZER=momentum -DATE=$(date +%Y%m%d%H%M%S) +DATE=`date +%Y%m%d%H%M%S` LOG_DIR="logs/vgg16_multigpu" DATA_DIR=./imagenette @@ -35,14 +35,16 @@ mkdir -p ${BASE_DIR} rm -rf ${TRAIN_DIR} EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } i=0 -for arg in "$@"; do +for arg in "$@" +do if [[ $arg =~ "--epoch" ]]; then new_args[$i]="--num_epochs" else @@ -53,13 +55,15 @@ done source ./get_num_devices.sh -UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py --data_name=imagenette --data_dir=${DATA_DIR} \ - --data_format=NCHW --batch_size=${BATCH_SIZE} \ - --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES} \ - --weight_decay=1e-4 --train_dir=${TRAIN_DIR} \ - --eval_during_training_every_n_epochs=2 --num_eval_epochs=1 --datasets_use_caching \ - --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu --num_intra_threads=1 \ - --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log -[[ ${PIPESTATUS[0]} == 0 ]] || exit +UMD_WAITAFTERLAUNCH=1 python3 -u tf_cnn_benchmarks.py\ + --data_name=imagenette --data_dir=${DATA_DIR}\ + --data_format=NCHW --batch_size=${BATCH_SIZE}\ + --model=vgg16 --optimizer=${OPTIMIZER} --num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}\ + --weight_decay=1e-4 --train_dir=${TRAIN_DIR}\ + --eval_during_training_every_n_epochs=2\ + --num_eval_epochs=1 --datasets_use_caching\ + --stop_at_top_1_accuracy=0.9 --all_reduce_spec=pscpu\ + --num_intra_threads=1 --num_inter_threads=1 "${new_args[@]}" 2>&1 | tee ${LOG_DIR}/${DATE}_${TRAIN_EPOCHS}_${BATCH_SIZE}_${OPTIMIZER}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + exit ${EXIT_STATUS} diff --git a/cv/face/facenet/tensorflow/init.sh b/cv/face/facenet/tensorflow/init.sh index 7631e4b8060a61c36698b8d052972bc857d99fd5..13864c5f493c362b62659b35435b2893baeed2be 100644 --- a/cv/face/facenet/tensorflow/init.sh +++ b/cv/face/facenet/tensorflow/init.sh @@ -13,12 +13,20 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - -PY_VERSION=$(python3 -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}') -if [ "$PY_VERSION" == "10" ] || [ "$PY_VERSION" == "8" ] || [ "$PY_VERSION" == "9" ]; then +PY_VERSION=$(python3 -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}') +if [ "$PY_VERSION" == "10" ] || [ "$PY_VERSION" == "8" ] || [ "$PY_VERSION" == "9" ]; +then pip3 install -r requirements.txt - pip3 install scipy==1.7.2 + pip3 install scipy==1.7.2 + pip3 install numpy==1.23.5 else pip3 install -r requirements.txt pip3 install scipy fi + +cd data +wget -q http://10.150.9.95/swapp/datasets/cv/face/lfw_data.tar.gz +wget -q http://10.150.9.95/swapp/datasets/cv/face/webface_182_44.tar +tar -zxf lfw_data.tar.gz +tar -xf webface_182_44.tar +cd - diff --git a/cv/face/facenet/tensorflow/train_facenet_ddp.sh b/cv/face/facenet/tensorflow/train_facenet_ddp.sh index ce406a10de98f86e927fa63554f7a2286203b103..948172bb8fd9f0d8415fc46364702ed5fc016df7 100644 --- a/cv/face/facenet/tensorflow/train_facenet_ddp.sh +++ b/cv/face/facenet/tensorflow/train_facenet_ddp.sh @@ -13,41 +13,40 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. - EXIT_STATUS=0 -check_status() { +check_status() +{ if ((${PIPESTATUS[0]} != 0)); then EXIT_STATUS=1 fi } horovodrun -np 16 --gloo python3 src/train_softmax_ddp.py \ - --logs_base_dir ./logs/facenet/ \ - --models_base_dir ./src/models/ \ - --data_dir ./data/webface_182_44 \ - --image_size 160 \ - --model_def models.inception_resnet_v1 \ - --lfw_dir ./data/lfw_data/lfw_160/ \ - --learning_rate -1 \ - --batch_size 128 \ - --optimizer ADAM \ - --max_nrof_epochs 500 \ - --keep_probability 0.8 \ - --random_flip \ - --random_crop \ - --use_fixed_image_standardization \ - --learning_rate_schedule_file ./data/learning_rate_schedule_classifier_casia_ddp.txt \ - --weight_decay 5e-4 \ - --embedding_size 512 \ - --lfw_distance_metric 1 \ - --lfw_use_flipped_images \ - --lfw_subtract_mean \ - --validation_set_split_ratio 0.01 \ - --validate_every_n_epochs 5 \ - --prelogits_norm_loss_factor 5e-4 \ - --gpu_memory_fraction 0.9 \ - --seed 43 \ - --epoch_size 200 "$@" -check_status + --logs_base_dir ./logs/facenet/ \ + --models_base_dir ./src/models/ \ + --data_dir ./data/webface_182_44 \ + --image_size 160 \ + --model_def models.inception_resnet_v1 \ + --lfw_dir ./data/lfw_data/lfw_160/ \ + --learning_rate -1 \ + --batch_size 128 \ + --optimizer ADAM \ + --max_nrof_epochs 500 \ + --keep_probability 0.8 \ + --random_flip \ + --random_crop \ + --use_fixed_image_standardization \ + --learning_rate_schedule_file ./data/learning_rate_schedule_classifier_casia_ddp.txt \ + --weight_decay 5e-4 \ + --embedding_size 512 \ + --lfw_distance_metric 1 \ + --lfw_use_flipped_images \ + --lfw_subtract_mean \ + --validation_set_split_ratio 0.01 \ + --validate_every_n_epochs 5 \ + --prelogits_norm_loss_factor 5e-4 \ + --gpu_memory_fraction 0.9 \ + --seed 43 \ + --epoch_size 200 "$@"; check_status exit ${EXIT_STATUS}