diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/README.cn.md b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/README.cn.md index 52ad1e3cda5262df30dd2ab77c2acc738e0441c3..a880d6052921e26178b59029cb059533cdb91a59 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/README.cn.md +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/README.cn.md @@ -15,12 +15,9 @@ cd v1.1 将数据集放到v1.1目录下 ``` -2. 确认数据集路径 - 请确保数据集路径如下 +2. 确认数据集 ``` ----bert_for_pytorch ----data ---squad ---v1.1 ---train-v1.1.json @@ -30,51 +27,45 @@ cd v1.1 3. 下载词典 -在工程根目录执行 +在数据集v1.1目录执行 ``` mkdir data/uncased_L-24_H-1024_A-16 cd data/uncased_L-24_H-1024_A-16 wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O vocab.txt -cd ../../ + ``` #### 预训练模型准备 -1. 获取预训练模型,新建checkpoints目录,并将预训练模型置于checkpoints目录下,参照:https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT -2. 确认预训练模型路径 -请确保如下路径: -bert-large: +1. 获取预训练模型,参照:https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT +2. 确认预训练模型 + bert-large: ``` ----bert_for_pytorch - ---checkpoints - ---bert_large_pretrained_amp.pt + ---bert_large_pretrained_amp.pt ``` -bert-base: + bert-base: ``` ----bert_for_pytorch - ---checkpoints - ---bert_base_pretrained_amp.pt + ---bert_base_pretrained_amp.pt ``` - #### bert-large启动训练 ##### 单卡 -bash scripts/run_squad_npu_1p.sh +bash test/train_large_full_1p.sh --data_path=/data/squad/v1.1 --ckpt_path=real_path ##### 8卡 -bash scripts/run_squad_npu_8p.sh +bash test/train_large_full_8p.sh --data_path=/data/squad/v1.1 --ckpt_path=real_path #### bert-base启动训练 ##### 单卡 -bash scripts/run_squad_base_npu_1p.sh +bash test/train_base_full_1p.sh --data_path=/data/squad/v1.1 --ckpt_path=real_path ##### 8卡 -bash scripts/run_squad_base_npu_8p.sh +bash test/train_base_full_8p.sh --data_path=/data/squad/v1.1 --ckpt_path=real_path # Q&A diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_new.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_new.sh deleted file mode 100644 index 15642ac49baa796303a56ce42a0e79f01631d47c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_new.sh +++ /dev/null @@ -1,20 +0,0 @@ -path_lib=$(python3.7 -c """ -import sys -import re -result='' -for index in range(len(sys.path)): - match_sit = re.search('-packages', sys.path[index]) - if match_sit is not None: - match_lib = re.search('lib', sys.path[index]) - - if match_lib is not None: - end=match_lib.span()[1] - result += sys.path[index][0:end] + ':' - - result+=sys.path[index] + '/torch/lib:' -print(result)""" -) - -#echo ${path_lib} - -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/glue_config.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/glue_config.sh deleted file mode 100644 index c15ee55afbb3923aff4cb47901efe8f12ec26d77..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/glue_config.sh +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -batch_size_and_gradient_accumulation_steps() { - batch_size=$((global_batch_size / num_gpu)) - gradient_accumulation_steps=1 - - while [ $((batch_size / gradient_accumulation_steps)) -gt $batch_size_capacity ] - do - gradient_accumulation_steps=$((gradient_accumulation_steps * 2)) - done -} - -commons () { - init_checkpoint=/workspace/bert/checkpoints/bert_uncased.pt - vocab_file=${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt - config_file=/workspace/bert/bert_config.json - max_steps=-1.0 -} - -mrpc_commons () { - data_dir=${BERT_PREP_WORKING_DIR}/download/glue/MRPC/ - out_dir=/workspace/bert/results/MRPC - task_name=mrpc - global_batch_size=128 - learning_rate=2.4e-5 - warmup_proportion=0.1 - epochs=3 -} - -sst-2_commons () { - data_dir=${BERT_PREP_WORKING_DIR}/download/glue/SST-2/ - out_dir=/workspace/bert/results/SST-2 - task_name=sst-2 - warmup_proportion=0.1 - epochs=3 -} - -dgxa100_fp16_commons () { - batch_size_capacity=128 - precision=fp16 -} - -dgxa100_tf32_commons () { - batch_size_capacity=64 - precision=tf32 -} - -dgx2_fp16_commons () { - batch_size_capacity=128 - precision=fp16 -} - -dgx2_fp32_commons () { - batch_size_capacity=64 - precision=fp32 -} - -print_arguments_in_order () { - echo \ - $init_checkpoint \ - $data_dir \ - $vocab_file \ - $config_file \ - $out_dir \ - $task_name \ - $num_gpu \ - $batch_size \ - $gradient_accumulation_steps \ - $learning_rate \ - $warmup_proportion \ - $epochs \ - $max_steps \ - $precision -} - -########################################## -# DGXA100 # -########################################## - -########################## -# MRPC # -########################## - -# AMP - -mrpc_dgxa100_1gpu_fp16 () { - commons - mrpc_commons - dgxa100_fp16_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgxa100_2gpu_fp16 () { - commons - mrpc_commons - dgxa100_fp16_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgxa100_4gpu_fp16 () { - commons - mrpc_commons - dgxa100_fp16_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgxa100_8gpu_fp16 () { - commons - mrpc_commons - dgxa100_fp16_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -# TF32 - -mrpc_dgxa100_1gpu_tf32 () { - commons - mrpc_commons - dgxa100_tf32_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgxa100_2gpu_tf32 () { - commons - mrpc_commons - dgxa100_tf32_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order - -} - -mrpc_dgxa100_4gpu_tf32 () { - commons - mrpc_commons - dgxa100_tf32_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgxa100_8gpu_tf32 () { - commons - mrpc_commons - dgxa100_tf32_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -########################## -# SST-2 # -########################## - -# AMP - -sst-2_dgxa100_fp16_commons () { - global_batch_size=1024 - learning_rate=3e-5 -} - -sst-2_dgxa100_1gpu_fp16 () { - commons - sst-2_commons - dgxa100_fp16_commons - sst-2_dgxa100_fp16_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_2gpu_fp16 () { - commons - sst-2_commons - dgxa100_fp16_commons - sst-2_dgxa100_fp16_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_4gpu_fp16 () { - commons - sst-2_commons - dgxa100_fp16_commons - sst-2_dgxa100_fp16_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_8gpu_fp16 () { - commons - sst-2_commons - dgxa100_fp16_commons - sst-2_dgxa100_fp16_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -# TF32 - -sst-2_dgxa100_tf32_commons () { - global_batch_size=512 - learning_rate=2e-5 -} - -sst-2_dgxa100_1gpu_tf32 () { - commons - sst-2_commons - dgxa100_tf32_commons - sst-2_dgxa100_tf32_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_2gpu_tf32 () { - commons - sst-2_commons - dgxa100_tf32_commons - sst-2_dgxa100_tf32_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_4gpu_tf32 () { - commons - sst-2_commons - dgxa100_tf32_commons - sst-2_dgxa100_tf32_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgxa100_8gpu_tf32 () { - commons - sst-2_commons - dgxa100_tf32_commons - sst-2_dgxa100_tf32_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -########################################## -# DGX2 # -########################################## - -########################## -# MRPC # -########################## - -# AMP - -mrpc_dgx2_1gpu_fp16 () { - commons - mrpc_commons - dgx2_fp16_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_2gpu_fp16 () { - commons - mrpc_commons - dgx2_fp16_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_4gpu_fp16 () { - commons - mrpc_commons - dgx2_fp16_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_8gpu_fp16 () { - commons - mrpc_commons - dgx2_fp16_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_16gpu_fp16 () { - commons - mrpc_commons - dgx2_fp16_commons - num_gpu=16 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -# FP32. - -mrpc_dgx2_1gpu_fp32 () { - commons - mrpc_commons - dgx2_fp32_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_2gpu_fp32 () { - commons - mrpc_commons - dgx2_fp32_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_4gpu_fp32 () { - commons - mrpc_commons - dgx2_fp32_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_8gpu_fp32 () { - commons - mrpc_commons - dgx2_fp32_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -mrpc_dgx2_16gpu_fp32 () { - commons - mrpc_commons - dgx2_fp32_commons - num_gpu=16 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -########################## -# SST-2 # -########################## - -sst-2_dgx2_commons () { - global_batch_size=1024 - learning_rate=3e-5 -} - -# AMP - -sst-2_dgx2_1gpu_fp16 () { - commons - sst-2_commons - dgx2_fp16_commons - sst-2_dgx2_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_2gpu_fp16 () { - commons - sst-2_commons - dgx2_fp16_commons - sst-2_dgx2_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_4gpu_fp16 () { - commons - sst-2_commons - dgx2_fp16_commons - sst-2_dgx2_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_8gpu_fp16 () { - commons - sst-2_commons - dgx2_fp16_commons - sst-2_dgx2_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_16gpu_fp16 () { - commons - sst-2_commons - dgx2_fp16_commons - sst-2_dgx2_commons - num_gpu=16 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -# TF32 - -sst-2_dgx2_1gpu_fp32 () { - commons - sst-2_commons - dgx2_fp32_commons - sst-2_dgx2_commons - num_gpu=1 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_2gpu_fp32 () { - commons - sst-2_commons - dgx2_fp32_commons - sst-2_dgx2_commons - num_gpu=2 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_4gpu_fp32 () { - commons - sst-2_commons - dgx2_fp32_commons - sst-2_dgx2_commons - num_gpu=4 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_8gpu_fp32 () { - commons - sst-2_commons - dgx2_fp32_commons - sst-2_dgx2_commons - num_gpu=8 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} - -sst-2_dgx2_16gpu_fp32 () { - commons - sst-2_commons - dgx2_fp32_commons - sst-2_dgx2_commons - num_gpu=16 - batch_size_and_gradient_accumulation_steps - print_arguments_in_order -} diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/pretrain_config.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/pretrain_config.sh deleted file mode 100644 index 102fa128fd3f05bccc52e1f4cd1cee8a1b8a476f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/pretrain_config.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -dgxa100_8gpu_fp16 () -{ - train_batch_size="8192" - learning_rate="6e-3" - precision="fp16" - num_gpus=8 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=128 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="true" - train_batch_size_phase2=4096 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=256 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} - -dgxa100_8gpu_tf32 () -{ - train_batch_size="8192" - learning_rate="6e-3" - precision="tf32" - num_gpus=8 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=128 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="false" - train_batch_size_phase2=4096 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=512 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} - -# Full pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) - -dgx2_16gpu_fp16 () -{ - train_batch_size="4096" - learning_rate="6e-3" - precision="fp16" - num_gpus=16 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=64 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="true" - train_batch_size_phase2=2048 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=128 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} - -dgx2_16gpu_fp32 () -{ - train_batch_size="4096" - learning_rate="6e-3" - precision="fp32" - num_gpus=16 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=128 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="false" - train_batch_size_phase2=2048 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=256 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} - -# Full pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) - -dgx1_8gpu_fp16 () -{ - train_batch_size="8192" - learning_rate="6e-3" - precision="fp16" - num_gpus=8 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=512 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="true" - train_batch_size_phase2=4096 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=512 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} - -dgx1_8gpu_fp32 () -{ - train_batch_size="8192" - learning_rate="6e-3" - precision="fp32" - num_gpus=8 - warmup_proportion="0.2843" - train_steps=7038 - save_checkpoint_steps=200 - resume_training="false" - create_logfile="true" - accumulate_gradients="true" - gradient_accumulation_steps=1024 - seed=42 - job_name="bert_lamb_pretraining" - allreduce_post_accumulation="true" - allreduce_post_accumulation_fp16="false" - train_batch_size_phase2=4096 - learning_rate_phase2="4e-3" - warmup_proportion_phase2="0.128" - train_steps_phase2=1563 - gradient_accumulation_steps_phase2=1024 - DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE1="$BERT_PREP_WORKING_DIR/${DATASET}/" - BERT_CONFIG=bert_config.json - CODEDIR="/workspace/bert" - init_checkpoint="None" - DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets - DATA_DIR_PHASE2="$BERT_PREP_WORKING_DIR/${DATASET2}/" - echo $train_batch_size $learning_rate $precision $num_gpus \ - $warmup_proportion $train_steps $save_checkpoint_steps \ - $resume_training $create_logfile $accumulate_gradients \ - $gradient_accumulation_steps $seed $job_name $allreduce_post_accumulation \ - $allreduce_post_accumulation_fp16 $train_batch_size_phase2 $learning_rate_phase2 \ - $warmup_proportion_phase2 $train_steps_phase2 $gradient_accumulation_steps_phase2 \ - $DATA_DIR_PHASE1 $DATA_DIR_PHASE2 $CODEDIR - -} diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/squad_config.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/squad_config.sh deleted file mode 100644 index 8aece6d9a4d8b98f81b7019ed045cbc640b46b07..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/configs/squad_config.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -dgxa100_8gpu_fp16 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="32" - learning_rate="3e-5" - precision="fp16" - num_gpu="8" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} - -dgxa100_8gpu_tf32 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="16" - learning_rate="3e-5" - precision="tf32" - num_gpu="8" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} - -# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) - -dgx2_16gpu_fp16 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="16" - learning_rate="3e-5" - precision="fp16" - num_gpu="16" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} - -dgx2_16gpu_fp32 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="8" - learning_rate="3e-5" - precision="fp16" - num_gpu="16" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} - -# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) - -dgx1_8gpu_fp16 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="10" - learning_rate="3e-5" - precision="fp16" - num_gpu="8" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} - -dgx1_8gpu_fp32 () -{ - init_checkpoint="/workspace/bert/checkpoints/bert_uncased.pt" - epochs="2.0" - batch_size="4" - learning_rate="3e-5" - precision="fp32" - num_gpu="8" - seed="1" - squad_dir="$BERT_PREP_WORKING_DIR/download/squad/v1.1" - vocab_file="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt" - OUT_DIR="/workspace/bert/results/SQuAD" - echo $init_checkpoint $epochs $batch_size $learning_rate \ - $precision $num_gpu $seed $squad_dir $vocab_file \ - $OUT_DIR -} diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/data_download.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/data_download.sh deleted file mode 100644 index a66727e5783d48b3b524b47257185bf4cbe18e31..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/data_download.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -DATA_DIR=${1:-/workspace/bert/data} - -# Download vocab files from pretrained model -cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.* - -# Download SQUAD -cd $DATA_DIR/squad && . squad_download.sh - -# Download SWAG -git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag - -# Download GLUE -cd $DATA_DIR/glue && . download_mrpc.sh - -# WIKI Download -cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh - -# Bookcorpus Download -cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh - -cd $DATA_DIR -# Create HDF5 files for WIKI -bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \ - && rm -r ./wikipedia_corpus/final_* \ - -# Create HDF5 files for Bookcorpus -bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \ - && rm -r ./bookcorpus/final_* \ - -# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus -bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024 diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/build.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/build.sh deleted file mode 100644 index a795e0d10d65da899e11e07d281880edc865b1f9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/build.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -docker build --network=host . --rm --pull --no-cache -t bert diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/launch.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/launch.sh deleted file mode 100644 index 4bd8ebc95063732dab47606c24611f6414cf9ad0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/docker/launch.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -CMD=${1:-/bin/bash} -NV_VISIBLE_DEVICES=${2:-"all"} -DOCKER_BRIDGE=${3:-"host"} - -docker run -it --rm \ - --gpus device=$NV_VISIBLE_DEVICES \ - --net=$DOCKER_BRIDGE \ - --shm-size=1g \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ - -e LD_LIBRARY_PATH='/workspace/install/lib/' \ - -v $PWD:/workspace/bert \ - -v $PWD/results:/results \ - bert $CMD diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_glue.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_glue.sh deleted file mode 100644 index 1052b37e556a9bfe08402f2295199901520c5d6a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_glue.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"} -data_dir=${2:-"$BERT_PREP_WORKING_DIR/download/glue/MRPC/"} -vocab_file=${3:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"} -config_file=${4:-"/workspace/bert/bert_config.json"} -out_dir=${5:-"/workspace/bert/results/MRPC"} -task_name=${6:-"mrpc"} -num_gpu=${7:-"8"} -batch_size=${8:-"16"} -gradient_accumulation_steps=${9:-"1"} -learning_rate=${10:-"2.4e-5"} -warmup_proportion=${11:-"0.1"} -epochs=${12:-"3"} -max_steps=${13:-"-1.0"} -precision=${14:-"fp16"} -seed=${15:-"2"} -mode=${16:-"train eval"} - -mkdir -p $out_dir - -if [ "$mode" = "eval" ] ; then - num_gpu=1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16="--fp16" -fi - -if [ "$num_gpu" = "1" ] ; then - export CUDA_VISIBLE_DEVICES=0 - mpi_command="" -else - unset CUDA_VISIBLE_DEVICES - mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu" -fi - -CMD="python $mpi_command run_glue.py " -CMD+="--task_name ${task_name} " -if [[ $mode == *"train"* ]] ; then - CMD+="--do_train " - CMD+="--train_batch_size=$batch_size " -fi -if [[ $mode == *"eval"* ]] || [[ $mode == *"prediction"* ]]; then - if [[ $mode == *"eval"* ]] ; then - CMD+="--do_eval " - fi - if [[ $mode == *"prediction"* ]] ; then - CMD+="--do_predict " - fi - CMD+="--eval_batch_size=$batch_size " -fi - -CMD+="--gradient_accumulation_steps=$gradient_accumulation_steps " -CMD+="--do_lower_case " -CMD+="--data_dir $data_dir " -CMD+="--bert_model bert-large-uncased " -CMD+="--seed $seed " -CMD+="--init_checkpoint $init_checkpoint " -CMD+="--warmup_proportion $warmup_proportion " -CMD+="--max_seq_length 128 " -CMD+="--learning_rate $learning_rate " -CMD+="--num_train_epochs $epochs " -CMD+="--max_steps $max_steps " -CMD+="--vocab_file=$vocab_file " -CMD+="--config_file=$config_file " -CMD+="--output_dir $out_dir " -CMD+="$use_fp16" - -LOGFILE=$out_dir/logfile - -$CMD |& tee $LOGFILE diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining.sh deleted file mode 100644 index 397e84ae7cbaf83fe573c9591b91c79e6aabd880..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -echo "Container nvidia build = " $NVIDIA_BUILD_ID -train_batch_size=${1:-8192} -learning_rate=${2:-"6e-3"} -precision=${3:-"fp16"} -num_gpus=${4:-8} -warmup_proportion=${5:-"0.2843"} -train_steps=${6:-7038} -save_checkpoint_steps=${7:-200} -resume_training=${8:-"false"} -create_logfile=${9:-"true"} -accumulate_gradients=${10:-"true"} -gradient_accumulation_steps=${11:-128} -seed=${12:-12439} -job_name=${13:-"bert_lamb_pretraining"} -allreduce_post_accumulation=${14:-"true"} -allreduce_post_accumulation_fp16=${15:-"true"} -train_batch_size_phase2=${16:-4096} -learning_rate_phase2=${17:-"4e-3"} -warmup_proportion_phase2=${18:-"0.128"} -train_steps_phase2=${19:-1563} -gradient_accumulation_steps_phase2=${20:-512} -DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets -DATA_DIR_PHASE1=${21:-$BERT_PREP_WORKING_DIR/${DATASET}/} -BERT_CONFIG=bert_config.json -DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training # change this for other datasets -DATA_DIR_PHASE2=${22:-$BERT_PREP_WORKING_DIR/${DATASET2}/} -CODEDIR=${23:-"/workspace/bert"} -init_checkpoint=${24:-"None"} -RESULTS_DIR=$CODEDIR/results -CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints - -mkdir -p $CHECKPOINTS_DIR - - -if [ ! -d "$DATA_DIR_PHASE1" ] ; then - echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start" -fi -if [ ! -d "$RESULTS_DIR" ] ; then - echo "Error! $RESULTS_DIR directory missing." - exit -1 -fi -if [ ! -d "$CHECKPOINTS_DIR" ] ; then - echo "Warning! $CHECKPOINTS_DIR directory missing." - echo "Checkpoints will be written to $RESULTS_DIR instead." - CHECKPOINTS_DIR=$RESULTS_DIR -fi -if [ ! -f "$BERT_CONFIG" ] ; then - echo "Error! BERT large configuration file not found at $BERT_CONFIG" - exit -1 -fi - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps" -fi - -CHECKPOINT="" -if [ "$resume_training" == "true" ] ; then - CHECKPOINT="--resume_from_checkpoint" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -INIT_CHECKPOINT="" -if [ "$init_checkpoint" != "None" ] ; then - INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint" -fi - -echo $DATA_DIR_PHASE1 -INPUT_DIR=$DATA_DIR_PHASE1 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE1" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size" -CMD+=" --max_seq_length=128" -CMD+=" --max_predictions_per_seq=20" -CMD+=" --max_steps=$train_steps" -CMD+=" --warmup_proportion=$warmup_proportion" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" $INIT_CHECKPOINT" -CMD+=" --do_train" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD" - - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size \* $num_gpus) - printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - $CMD -else - ( - $CMD - ) |& tee $LOGFILE -fi - -set +x - -echo "finished pretraining" - -#Start Phase2 - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -echo $DATA_DIR_PHASE2 -INPUT_DIR=$DATA_DIR_PHASE2 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE2" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size_phase2" -CMD+=" --max_seq_length=512" -CMD+=" --max_predictions_per_seq=80" -CMD+=" --max_steps=$train_steps_phase2" -CMD+=" --warmup_proportion=$warmup_proportion_phase2" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate_phase2" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD" - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size_phase2 \* $num_gpus) - printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - $CMD -else - ( - $CMD - ) |& tee $LOGFILE -fi - -set +x - -echo "finished phase2" diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_1p.sh deleted file mode 100644 index 7e0e51ca214a77904988757c9381ed2c98603e25..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_1p.sh +++ /dev/null @@ -1,231 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -echo "Container nvidia build = " $NVIDIA_BUILD_ID -train_batch_size=${1:-8192} -learning_rate=${2:-"6e-3"} -precision=${3:-"fp16"} -num_npu=${4:-8} -warmup_proportion=${5:-"0.2843"} -train_steps=${6:-7038} -save_checkpoint_steps=${7:-200} -resume_training=${8:-"false"} -create_logfile=${9:-"true"} -accumulate_gradients=${10:-"true"} -gradient_accumulation_steps=${11:-128} -seed=${12:-12439} -job_name=${13:-"bert_lamb_pretraining"} -allreduce_post_accumulation=${14:-"true"} -allreduce_post_accumulation_fp16=${15:-"true"} -train_batch_size_phase2=${16:-4096} -learning_rate_phase2=${17:-"4e-3"} -warmup_proportion_phase2=${18:-"0.128"} -train_steps_phase2=${19:-1563} -gradient_accumulation_steps_phase2=${20:-512} -DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets -DATA_DIR_PHASE1=${21:-"./data/${DATASET}/"} # change this for your env -BERT_CONFIG=bert_config.json -DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets -DATA_DIR_PHASE2=${22:-"./data/${DATASET2}/"} # change this for your env -CODEDIR=${23:-"${PWD}"} -init_checkpoint=${24:-"None"} -RESULTS_DIR=$CODEDIR/results -CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints - -mkdir -p $CHECKPOINTS_DIR - - -if [ ! -d "$DATA_DIR_PHASE1" ] ; then - echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start" -fi -if [ ! -d "$RESULTS_DIR" ] ; then - echo "Error! $RESULTS_DIR directory missing." - exit -1 -fi -if [ ! -d "$CHECKPOINTS_DIR" ] ; then - echo "Warning! $CHECKPOINTS_DIR directory missing." - echo "Checkpoints will be written to $RESULTS_DIR instead." - CHECKPOINTS_DIR=$RESULTS_DIR -fi -if [ ! -f "$BERT_CONFIG" ] ; then - echo "Error! BERT large configuration file not found at $BERT_CONFIG" - exit -1 -fi - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps" -fi - -CHECKPOINT="" -if [ "$resume_training" == "true" ] ; then - CHECKPOINT="--resume_from_checkpoint" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -INIT_CHECKPOINT="" -if [ "$init_checkpoint" != "None" ] ; then - INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint" -fi - -echo $DATA_DIR_PHASE1 -INPUT_DIR=$DATA_DIR_PHASE1 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE1" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size" -CMD+=" --max_seq_length=128" -CMD+=" --max_predictions_per_seq=20" -CMD+=" --max_steps=$train_steps" -CMD+=" --warmup_proportion=$warmup_proportion" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" $INIT_CHECKPOINT" -CMD+=" --do_train" -CMD+=" --use_npu" -CMD+=" --npu_id=0" -CMD+=" --loss_scale=8192.0" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3.7 -u $CMD" - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size \* $num_npu) - printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - $CMD -else - ( - $CMD - ) |& tee $LOGFILE -fi - -set +x -wait -echo "finished pretraining" - -#Start Phase2 - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -echo $DATA_DIR_PHASE2 -INPUT_DIR=$DATA_DIR_PHASE2 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE2" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size_phase2" -CMD+=" --max_seq_length=512" -CMD+=" --max_predictions_per_seq=80" -CMD+=" --max_steps=$train_steps_phase2" -CMD+=" --warmup_proportion=$warmup_proportion_phase2" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate_phase2" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps" -CMD+=" --use_npu" -CMD+=" --npu_id=0" -CMD+=" --loss_scale=4096.0" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3.7 -u $CMD" - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size_phase2 \* $num_npu) - printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - $CMD -else - ( - $CMD - ) |& tee $LOGFILE -fi - -set +x - -echo "finished phase2" diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_8p.sh deleted file mode 100644 index dceeaafc63f117fa97100fb888cc17824b72b5ef..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_pretraining_npu_8p.sh +++ /dev/null @@ -1,260 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -echo "Container nvidia build = " $NVIDIA_BUILD_ID -train_batch_size=${1:-8192} -learning_rate=${2:-"6e-3"} -precision=${3:-"fp16"} -num_npu=${4:-8} -warmup_proportion=${5:-"0.2843"} -train_steps=${6:-7038} -save_checkpoint_steps=${7:-200} -resume_training=${8:-"false"} -create_logfile=${9:-"true"} -accumulate_gradients=${10:-"true"} -gradient_accumulation_steps=${11:-128} -seed=${12:-12439} -job_name=${13:-"bert_lamb_pretraining"} -allreduce_post_accumulation=${14:-"true"} -allreduce_post_accumulation_fp16=${15:-"true"} -train_batch_size_phase2=${16:-4096} -learning_rate_phase2=${17:-"4e-3"} -warmup_proportion_phase2=${18:-"0.128"} -train_steps_phase2=${19:-1563} -gradient_accumulation_steps_phase2=${20:-512} -DATASET=hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets -DATA_DIR_PHASE1=${21:-"./data/${DATASET}/"} # change this for your env -BERT_CONFIG=bert_config.json -DATASET2=hdf5_lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/wikicorpus_en # change this for other datasets -DATA_DIR_PHASE2=${22:-"./data/${DATASET2}/"} # change this for your env -CODEDIR=${23:-"${PWD}"} -init_checkpoint=${24:-"None"} -RESULTS_DIR=$CODEDIR/results -CHECKPOINTS_DIR=$RESULTS_DIR/checkpoints -RANK_SIZE=$num_npu - -mkdir -p $CHECKPOINTS_DIR - - -if [ ! -d "$DATA_DIR_PHASE1" ] ; then - echo "Warning! $DATA_DIR_PHASE1 directory missing. Training cannot start" -fi -if [ ! -d "$RESULTS_DIR" ] ; then - echo "Error! $RESULTS_DIR directory missing." - exit -1 -fi -if [ ! -d "$CHECKPOINTS_DIR" ] ; then - echo "Warning! $CHECKPOINTS_DIR directory missing." - echo "Checkpoints will be written to $RESULTS_DIR instead." - CHECKPOINTS_DIR=$RESULTS_DIR -fi -if [ ! -f "$BERT_CONFIG" ] ; then - echo "Error! BERT large configuration file not found at $BERT_CONFIG" - exit -1 -fi - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps" -fi - -CHECKPOINT="" -if [ "$resume_training" == "true" ] ; then - CHECKPOINT="--resume_from_checkpoint" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -INIT_CHECKPOINT="" -if [ "$init_checkpoint" != "None" ] ; then - INIT_CHECKPOINT="--init_checkpoint=$init_checkpoint" -fi - -echo $DATA_DIR_PHASE1 -INPUT_DIR=$DATA_DIR_PHASE1 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE1" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size" -CMD+=" --max_seq_length=128" -CMD+=" --max_predictions_per_seq=20" -CMD+=" --max_steps=$train_steps" -CMD+=" --warmup_proportion=$warmup_proportion" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" $INIT_CHECKPOINT" -CMD+=" --do_train" -CMD+=" --use_npu" -CMD+=" --loss_scale=8192.0" -CMD+=" --addr=127.0.0.1" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3.7 -u $CMD" - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size \* $num_npu) - printf -v TAG "pyt_bert_pretraining_phase1_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done -else - ( - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done - ) |& tee $LOGFILE -fi - -set +x -wait -echo "finished pretraining" - -#Start Phase2 - -PREC="" -if [ "$precision" = "fp16" ] ; then - PREC="--fp16" -elif [ "$precision" = "fp32" ] ; then - PREC="" -elif [ "$precision" = "tf32" ] ; then - PREC="" -else - echo "Unknown argument" - exit -2 -fi - -ACCUMULATE_GRADIENTS="" -if [ "$accumulate_gradients" == "true" ] ; then - ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2" -fi - -ALL_REDUCE_POST_ACCUMULATION="" -if [ "$allreduce_post_accumulation" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation" -fi - -ALL_REDUCE_POST_ACCUMULATION_FP16="" -if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then - ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16" -fi - -echo $DATA_DIR_PHASE2 -INPUT_DIR=$DATA_DIR_PHASE2 -CMD=" $CODEDIR/run_pretraining.py" -CMD+=" --input_dir=$DATA_DIR_PHASE2" -CMD+=" --output_dir=$CHECKPOINTS_DIR" -CMD+=" --config_file=$BERT_CONFIG" -CMD+=" --bert_model=bert-large-uncased" -CMD+=" --train_batch_size=$train_batch_size_phase2" -CMD+=" --max_seq_length=512" -CMD+=" --max_predictions_per_seq=80" -CMD+=" --max_steps=$train_steps_phase2" -CMD+=" --warmup_proportion=$warmup_proportion_phase2" -CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps" -CMD+=" --learning_rate=$learning_rate_phase2" -CMD+=" --seed=$seed" -CMD+=" $PREC" -CMD+=" $ACCUMULATE_GRADIENTS" -CMD+=" $CHECKPOINT" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION" -CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16" -CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps" -CMD+=" --use_npu" -CMD+=" --loss_scale=4096.0" -CMD+=" --addr=127.0.0.1" -CMD+=" --json-summary ${RESULTS_DIR}/dllogger.json " - -CMD="python3.7 -u $CMD" - -if [ "$create_logfile" = "true" ] ; then - export GBS=$(expr $train_batch_size_phase2 \* $num_npu) - printf -v TAG "pyt_bert_pretraining_phase2_%s_gbs%d" "$precision" $GBS - DATESTAMP=`date +'%y%m%d%H%M%S'` - LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log - printf "Logs written to %s\n" "$LOGFILE" -fi - -set -x -if [ -z "$LOGFILE" ] ; then - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done -else - ( - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done - ) |& tee $LOGFILE -fi - -set +x - -echo "finished phase2" diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad.sh deleted file mode 100644 index f8dff9fde013273c3ae944d200eabec5ab1f38f3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -init_checkpoint=${1:-"/workspace/bert/checkpoints/bert_uncased.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"4"} -learning_rate=${4:-"3e-5"} -precision=${5:-"fp16"} -num_gpu=${6:-"8"} -seed=${7:-"1"} -squad_dir=${8:-"$BERT_PREP_WORKING_DIR/download/squad/v1.1"} -vocab_file=${9:-"$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${10:-"/workspace/bert/results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"} -max_steps=${13:-"-1"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " -fi - -if [ "$num_gpu" = "1" ] ; then - export CUDA_VISIBLE_DEVICES=0 - mpi_command="" -else - unset CUDA_VISIBLE_DEVICES - mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu" -fi - -CMD="python $mpi_command run_squad.py " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" - -LOGFILE=$OUT_DIR/logfile.txt -echo "$CMD |& tee $LOGFILE" -time $CMD |& tee $LOGFILE diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_1p.sh deleted file mode 100644 index eb61d63d0b80d36911b046d46b2754f5a685359d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_1p.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -init_checkpoint=${1:-"checkpoints/bert_base_pretrained_amp.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"96"} -learning_rate=${4:-"6e-5"} -precision=${5:-"fp16"} -seed=${6:-"1"} -squad_dir=${7:-"data/squad/v1.1"} -vocab_file=${8:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${9:-"results/SQuAD"} -mode=${10:-"train eval"} -CONFIG_FILE=${11:-"bert_base_config.json"} -max_steps=${12:-"-1"} -npu_id=${13:-"0"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " -fi - -CMD="python3.7 run_squad.py " - -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --npu_id=$npu_id " -CMD+=" --loss_scale=4096 " - -$CMD - diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_8p.sh deleted file mode 100644 index c0f33fe8b20afe432e8986e4d389e9b1a42aeb99..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_base_npu_8p.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -init_checkpoint=${1:-"checkpoints/bert_base_pretrained_amp.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"96"} -learning_rate=${4:-"2e-4"} -precision=${5:-"fp16"} -num_npu=${6:-"8"} -seed=${7:-"1"} -squad_dir=${8:-"data/squad/v1.1"} -vocab_file=${9:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${10:-"results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"bert_base_config.json"} -max_steps=${13:-"-1"} -RANK_SIZE=$num_npu - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " -fi - -CMD="python3.7 run_squad.py " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --num_npu=$num_npu" -CMD+=" --loss_scale=4096" -CMD+=" --addr=127.0.0.1" - -if [ $(uname -m) = "aarch64" ] -then - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done -else - for i in $(seq 0 7) - do - $CMD --local_rank=$i & - done -fi -wait diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_1p.sh deleted file mode 100644 index 840dc84cd5409e6786d08691dd691ec8915de9b4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_1p.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -init_checkpoint=${1:-"checkpoints/bert_large_pretrained_amp.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"32"} -learning_rate=${4:-"6e-5"} -precision=${5:-"fp16"} -seed=${6:-"1"} -squad_dir=${7:-"data/squad/v1.1"} -vocab_file=${8:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${9:-"results/SQuAD"} -mode=${10:-"train eval"} -CONFIG_FILE=${11:-"bert_config.json"} -max_steps=${12:-"-1"} -npu_id=${13:-"0"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " -fi - -if [ $(uname -m) = "aarch64" ]; then - CMD="taskset -c 0-47 python3.7 run_squad.py " -else - CMD="python3.7 run_squad.py " -fi -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --npu_id=$npu_id " -CMD+=" --loss_scale=4096 " - -$CMD diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_8p.sh deleted file mode 100644 index db7db7d1327ecfc588746af0115d7e777c0e5b7e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_squad_npu_8p.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -source env_npu.sh - -init_checkpoint=${1:-"checkpoints/bert_large_pretrained_amp.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"32"} -learning_rate=${4:-"2e-4"} -precision=${5:-"fp16"} -num_npu=${6:-"8"} -seed=${7:-"1"} -squad_dir=${8:-"data/squad/v1.1"} -vocab_file=${9:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${10:-"results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"bert_config.json"} -max_steps=${13:-"-1"} -RANK_SIZE=$num_npu - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " -fi - -CMD="python3.7 run_squad.py " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -fi - -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-large-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --num_npu=$num_npu" -CMD+=" --loss_scale=4096" -CMD+=" --addr=127.0.0.1" - -if [ $(uname -m) = "aarch64" ] -then - for i in $(seq 0 7) - do - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=$i*${corenum}/${RANK_SIZE} - let b=$i+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i & - done -else - for i in $(seq 0 7) - do - $CMD --local_rank=$i & - done -fi -wait \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_swag.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_swag.sh deleted file mode 100644 index 377834eebb7ee9bfed44a978ec43ea2295eb5826..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/scripts/run_swag.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -SWAG_DIR=/workspace/bert/data/swag -OUT_DIR=/results/SWAG - -mkdir -p $OUT_DIR - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -init_checkpoint=${1} -mode=${2:-"train"} -max_steps=${3:-"-1.0"} # if < 0, has no effect -batch_size=${4:-"12"} -learning_rate=${5:-"5e-6"} -precision=${6:-"fp32"} -num_gpu=${7:-"8"} -epochs=${8:-"2"} - -if [ "$mode" != "train" ] ; then - num_gpu=1 -fi - -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16="--fp16" -fi - -if [ "$num_gpu" = "1" ] ; then - mpi_command="" -else - mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu" -fi - -CMD="python -m $mpi_command run_swag.py " -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_batch_size=$batch_size " -else - CMD+="--do_eval " - CMD+="--eval_batch_size=$batch_size " -fi -CMD+="--do_lower_case " -CMD+="--data_dir $SWAG_DIR/data/ " -CMD+="--bert_model bert-large-uncased " -CMD+="--max_seq_length 128 " -CMD+="--learning_rate $learning_rate " -CMD+="--num_train_epochs $epochs " -CMD+="--max_steps $max_steps " -CMD+="--output_dir $OUT_DIR " -CMD+="$use_fp16" - -LOGFILE=$OUT_DIR/logfile -$CMD |& tee $LOGFILE - -sed -r 's/ -|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit - -throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'` - -echo "throughput: $throughput" - diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env.sh deleted file mode 100644 index 20de2e7d6046edb9e9a8741036423bfd9d9f6be5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -cur_path=`pwd`/../ -export install_path=/usr/local/Ascend -export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH # 仅容器训练场景配置 -export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH -export LD_LIBRARY_PATH=${install_path}/fwkacllib/lib64:$LD_LIBRARY_PATH -export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:$PYTHONPATH -export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:${install_path}/tfplugin/python/site-packages:$PYTHONPATH -export ASCEND_OPP_PATH=${install_path}/opp -export ASCEND_AICPU_PATH=${install_path} -export PYTHONPATH=$cur_path/models/research:$cur_path/models/research/slim:$PYTHONPATH -export JOB_ID=10087 -export ASCEND_GLOBAL_LOG_LEVEL=3 -export ASCEND_DEVICE_ID=0 -export BMMV2_ENABLE=1 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_npu.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env_npu.sh similarity index 99% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_npu.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env_npu.sh index 6f43f54e2099e1da99bce5ef7b7bd9e7b4007f8c..e628cf0a5f2648576f05c8e6745e6b011e351680 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/env_npu.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/env_npu.sh @@ -27,8 +27,6 @@ else fi fi -source env_new.sh - export ASCEND_GLOBAL_LOG_LEVEL=3 export ASCEND_GLOBAL_EVENT_ENABLE=0 export ASCEND_SLOG_PRINT_TO_STDOUT=0 diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_8p.sh deleted file mode 100644 index 0bc29ea61d4170900482c88050143ccd31b66072..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_8p.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export BMMV2_ENABLE=1 -export RANK_SIZE=8 -export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path="" -ckpt_path="" - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="Bert-Squad_ID3078_for_PyTorch" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=32 -#训练step -train_steps= -#学习率 -learning_rate=2e-4 - - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - export ASCEND_DEVICE_ID=$RANK_ID - echo "Device ID: $ASCEND_DEVICE_ID" - export RANK_ID=$RANK_ID - - - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3.7 run_squad.py \ - --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \ - --bert_model bert-large-uncased \ - --do_train \ - --train_file ${data_path}/train-v1.1-min.json \ - --train_batch_size ${batch_size} \ - --do_predict \ - --predict_batch_size ${batch_size} \ - --predict_file ${data_path}/dev-v1.1.json \ - --learning_rate ${learning_rate} \ - --num_train_epochs ${train_epochs} \ - --seed 1 \ - --fp16 \ - --max_steps 100 \ - --use_npu \ - --loss_scale 4096 \ - --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ - --do_eval \ - --eval_script ${data_path}/evaluate-v1.1.py \ - --npu_id ${ASCEND_DEVICE_ID} \ - --do_lower_case \ - --output_dir ${cur_path}/../results \ - --config_file bert_config.json \ - --num_npu 8 \ - --graph_mode \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}' | tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` - -FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` - -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 -export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..5466044f5f584787ed5f9d5942ea8ddf678395d3 --- /dev/null +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_1p.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + +#集合通信参数,不需要修改 +export RANK_SIZE=1 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" +ckpt_path="" + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="Bert-Squad_ID0470_for_PyTorch" +#训练epoch +train_epochs=2 +#训练batch_size +batch_size=96 +#训练step +train_steps= +#学习率 +learning_rate=6e-5 + + +#维测参数,precision_mode需要模型审视修改 +precision_mode="allow_fp32_to_fp16" + + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done + +#校验是否传入ckpt_path,不需要修改 +if [[ $ckpt_path == "" ]];then + echo "[Error] para \"ckpt_path\" must be confing" + exit 1 +fi + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +mkdir -p results/SQUAD +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +#训练开始时间,不需要修改 +start_time=$(date +%s) +ASCEND_DEVICE_ID=0 +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/ +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + #创建DeviceID输出目录,不需要修改 + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + fi + + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup python3.7 run_squad.py \ + --init_checkpoint ${ckpt_path}/bert_base_pretrained_amp.pt \ + --bert_model bert-large-uncased \ + --do_train \ + --train_file ${data_path}/train-v1.1.json \ + --train_batch_size ${batch_size} \ + --do_predict \ + --predict_batch_size ${batch_size} \ + --predict_file ${data_path}/dev-v1.1.json \ + --learning_rate ${learning_rate} \ + --num_train_epochs ${train_epochs} \ + --seed 1 \ + --fp16 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --max_steps -1 \ + --use_npu \ + --loss_scale 4096 \ + --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ + --do_eval \ + --eval_script ${data_path}/evaluate-v1.1.py \ + --npu_id ${ASCEND_DEVICE_ID} \ + --do_lower_case \ + --output_dir results/SQUAD \ + --config_file bert_base_config.json > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` + +FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 +export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_8p.sh index 3dcfcd4c12f661707103a6f75e055ef19d1550a5..4ca582cb7d8ca55c5c0f69bbbbe73607ffe37c29 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_8p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_full_8p.sh @@ -1,130 +1,129 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` -cd $cur_path/../ +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + -source env_npu.sh #集合通信参数,不需要修改 export RANK_SIZE=8 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" +ckpt_path="" #基础参数,需要模型审视修改 #网络名称,同目录名称 -Network="Bert-Squad_ID0470for_PyTorch" +Network="Bert-Squad_ID0470_for_PyTorch" +#训练epoch +train_epochs=2 +#训练batch_size +batch_size=96 +#训练step +train_steps= +#学习率 +learning_rate=2e-4 #维测参数,precision_mode需要模型审视修改 precision_mode="allow_fp32_to_fp16" - -#进入训练脚本目录,需要模型审视修改 - -init_checkpoint=${1:-"checkpoints/bert_base_pretrained_amp.pt"} -epochs=${2:-"2.0"} -batch_size=${3:-"96"} -learning_rate=${4:-"2e-4"} -precision=${5:-"fp16"} -num_npu=${6:-"8"} -seed=${7:-"1"} -squad_dir=${8:-"data/squad/v1.1"} -vocab_file=${9:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${10:-"results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"bert_base_config.json"} -max_steps=${13:-"-1"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 +mkdir -p results/SQUAD + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done + +#校验是否传入ckpt_path,不需要修改 +if [[ $ckpt_path == "" ]];then + echo "[Error] para \"ckpt_path\" must be confing" + exit 1 fi -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 fi -CMD="python3.7 run_squad.py " - -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh fi -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-base-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --num_npu=$num_npu " -CMD+=" --loss_scale=4096 " -CMD+=" --addr=127.0.0.1" - #训练开始时间,不需要修改 start_time=$(date +%s) -if [ $(uname -m) = "aarch64" ] -then - for i in $(seq 0 ${RANK_SIZE}) - do - if [ -d ${cur_path}/output/${i} ];then - rm -rf ${cur_path}/output/${i} - mkdir -p ${cur_path}/output/$i - else - mkdir -p ${cur_path}/output/$i - fi - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=${i}*${corenum}/${RANK_SIZE} - let b=${i}+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & - done -else - for i in $(seq 0 ${RANK_SIZE}) - do - if [ -d ${cur_path}/output/${i} ];then - rm -rf ${cur_path}/output/${i} - mkdir -p ${cur_path}/output/$i - else - mkdir -p ${cur_path}/output/$i - fi - $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & - done -fi +#进入训练脚本目录,需要模型审视修改 +cd $cur_path +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + export ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + fi + + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup python3.7 run_squad.py \ + --init_checkpoint ${ckpt_path}/bert_base_pretrained_amp.pt \ + --bert_model bert-base-uncased \ + --do_train \ + --train_file ${data_path}/train-v1.1.json \ + --train_batch_size ${batch_size} \ + --do_predict \ + --predict_batch_size ${batch_size} \ + --predict_file ${data_path}/dev-v1.1.json \ + --learning_rate ${learning_rate} \ + --num_train_epochs ${train_epochs} \ + --seed 1 \ + --fp16 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --max_steps -1 \ + --use_npu \ + --addr 127.0.0.1 \ + --loss_scale 4096 \ + --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ + --do_eval \ + --eval_script ${data_path}/evaluate-v1.1.py \ + --npu_id ${ASCEND_DEVICE_ID} \ + --do_lower_case \ + --output_dir results/SQUAD \ + --config_file bert_base_config.json \ + --num_npu 8 \ + --local_rank $RANK_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done wait - - ASCEND_DEVICE_ID=0 #训练结束时间,不需要修改 end_time=$(date +%s) @@ -133,13 +132,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` -FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$step_time'*'$RANK_SIZE'}'` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` + +FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` + #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -148,7 +149,7 @@ echo "E2E Training Duration sec : $e2e_time" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' ##获取性能数据,不需要修改 #吞吐量 @@ -157,20 +158,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}">> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 +export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_1p.sh similarity index 46% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_1p.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_1p.sh index 94dc3e739ea94e91716e1b45ab75dc16a09315f8..87b09193b501be160ec123e2045d2a331cf85ea5 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_ID3078_Bert-Squad_performance_1p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_1p.sh @@ -1,12 +1,19 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #集合通信参数,不需要修改 -export BMMV2_ENABLE=1 export RANK_SIZE=1 -export JOB_ID=10087 RANK_ID_START=0 @@ -16,11 +23,11 @@ ckpt_path="" #基础参数,需要模型审视修改 #网络名称,同目录名称 -Network="Bert-Squad_ID3078_for_PyTorch" +Network="Bert-Squad_ID0470_for_PyTorch" #训练epoch train_epochs=1 #训练batch_size -batch_size=32 +batch_size=96 #训练step train_steps= #学习率 @@ -29,54 +36,24 @@ learning_rate=6e-5 #维测参数,precision_mode需要模型审视修改 precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi +mkdir -p results/SQUAD #参数校验,不需要修改 for para in $* do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then + if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` elif [[ $para == --ckpt_path* ]];then ckpt_path=`echo ${para#*=}` fi done +#校验是否传入ckpt_path,不需要修改 +if [[ $ckpt_path == "" ]];then + echo "[Error] para \"ckpt_path\" must be confing" + exit 1 +fi + #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -84,35 +61,39 @@ if [[ $data_path == "" ]];then fi +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi #训练开始时间,不需要修改 start_time=$(date +%s) - +ASCEND_DEVICE_ID=0 #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ +cd $cur_path/ for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 echo "Device ID: $ASCEND_DEVICE_ID" export RANK_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi - + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path nohup python3.7 run_squad.py \ - --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \ + --init_checkpoint ${ckpt_path}/bert_base_pretrained_amp.pt \ --bert_model bert-large-uncased \ --do_train \ - --train_file ${data_path}/train-v1.1-min.json \ + --train_file ${data_path}/train-v1.1.json \ --train_batch_size ${batch_size} \ --do_predict \ --predict_batch_size ${batch_size} \ @@ -121,19 +102,19 @@ do --num_train_epochs ${train_epochs} \ --seed 1 \ --fp16 \ - --max_steps 100 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --max_steps -1 \ --use_npu \ --loss_scale 4096 \ --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ --do_eval \ - --eval_script ${data_path}/evaluate-v1.1.py \ + --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ - --config_file bert_config.json \ - --graph_mode \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done + --output_dir results/SQUAD \ + --config_file bert_base_config.json > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done wait #训练结束时间,不需要修改 @@ -143,7 +124,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` @@ -151,7 +132,7 @@ FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -169,20 +150,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_8p.sh index 8f787d2e165e6baf874492bbb3acb186616f0de7..693b069a6232f214a80a8e2a902f4fbd9dec9dfe 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_8p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_base_performance_8p.sh @@ -1,130 +1,129 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` -cd $cur_path/../ +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + -source env_npu.sh #集合通信参数,不需要修改 export RANK_SIZE=8 +RANK_ID_START=0 + + +# 数据集路径,保持为空,不需要修改 +data_path="" +ckpt_path="" #基础参数,需要模型审视修改 #网络名称,同目录名称 -Network="Bert-Squad_ID0470for_PyTorch" +Network="Bert-Squad_ID0470_for_PyTorch" +#训练epoch +train_epochs=1 +#训练batch_size +batch_size=96 +#训练step +train_steps= +#学习率 +learning_rate=2e-4 #维测参数,precision_mode需要模型审视修改 precision_mode="allow_fp32_to_fp16" -#进入训练脚本目录,需要模型审视修改 - -init_checkpoint=${1:-"checkpoints/bert_base_pretrained_amp.pt"} -epochs=${2:-"1.0"} -batch_size=${3:-"96"} -learning_rate=${4:-"2e-4"} -precision=${5:-"fp16"} -num_npu=${6:-"8"} -seed=${7:-"1"} -squad_dir=${8:-"data/squad/v1.1"} -vocab_file=${9:-"data/uncased_L-24_H-1024_A-16/vocab.txt"} -OUT_DIR=${10:-"results/SQuAD"} -mode=${11:-"train eval"} -CONFIG_FILE=${12:-"bert_base_config.json"} -max_steps=${13:-"-100"} - -echo "out dir is $OUT_DIR" -mkdir -p $OUT_DIR -if [ ! -d "$OUT_DIR" ]; then - echo "ERROR: non existing $OUT_DIR" - exit 1 +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --ckpt_path* ]];then + ckpt_path=`echo ${para#*=}` + fi +done + +#校验是否传入ckpt_path,不需要修改 +if [[ $ckpt_path == "" ]];then + echo "[Error] para \"ckpt_path\" must be confing" + exit 1 fi -use_fp16="" -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16=" --fp16 " +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 fi -CMD="python3.7 run_squad.py " - -CMD+="--init_checkpoint=$init_checkpoint " -if [ "$mode" = "train" ] ; then - CMD+="--do_train " - CMD+="--train_file=$squad_dir/train-v1.1.json " - CMD+="--train_batch_size=$batch_size " -elif [ "$mode" = "eval" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " -elif [ "$mode" = "prediction" ] ; then - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " -else - CMD+=" --do_train " - CMD+=" --train_file=$squad_dir/train-v1.1.json " - CMD+=" --train_batch_size=$batch_size " - CMD+="--do_predict " - CMD+="--predict_file=$squad_dir/dev-v1.1.json " - CMD+="--predict_batch_size=$batch_size " - CMD+="--eval_script=$squad_dir/evaluate-v1.1.py " - CMD+="--do_eval " +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh fi -CMD+=" --do_lower_case " -CMD+=" --bert_model=bert-base-uncased " -CMD+=" --learning_rate=$learning_rate " -CMD+=" --seed=$seed " -CMD+=" --num_train_epochs=$epochs " -CMD+=" --max_seq_length=384 " -CMD+=" --doc_stride=128 " -CMD+=" --output_dir=$OUT_DIR " -CMD+=" --vocab_file=$vocab_file " -CMD+=" --config_file=$CONFIG_FILE " -CMD+=" --max_steps=$max_steps " -CMD+=" $use_fp16" -CMD+=" --use_npu" -CMD+=" --num_npu=$num_npu " -CMD+=" --loss_scale=4096 " -CMD+=" --addr=127.0.0.1" - +mkdir -p results/SQUAD #训练开始时间,不需要修改 start_time=$(date +%s) -if [ $(uname -m) = "aarch64" ] -then - for i in $(seq 0 ${RANK_SIZE}) - do - if [ -d ${cur_path}/output/${i} ];then - rm -rf ${cur_path}/output/${i} - mkdir -p ${cur_path}/output/$i - else - mkdir -p ${cur_path}/output/$i - fi - corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` - let a=${i}*${corenum}/${RANK_SIZE} - let b=${i}+1 - let c=b*${corenum}/${RANK_SIZE}-1 - taskset -c $a-$c $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & - done -else - for i in $(seq 0 ${RANK_SIZE}) - do - if [ -d ${cur_path}/output/${i} ];then - rm -rf ${cur_path}/output/${i} - mkdir -p ${cur_path}/output/$i - else - mkdir -p ${cur_path}/output/$i - fi - $CMD --local_rank=$i > ${cur_path}/output/${i}/train_${i}.log 2>&1 & - done -fi +#进入训练脚本目录,需要模型审视修改 +cd $cur_path +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + #设置环境变量,不需要修改 + export ASCEND_DEVICE_ID=$RANK_ID + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + + + + #创建DeviceID输出目录,不需要修改 + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt + fi + + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + nohup python3.7 run_squad.py \ + --init_checkpoint ${ckpt_path}/bert_base_pretrained_amp.pt \ + --bert_model bert-base-uncased \ + --do_train \ + --train_file ${data_path}/train-v1.1.json \ + --train_batch_size ${batch_size} \ + --do_predict \ + --predict_batch_size ${batch_size} \ + --predict_file ${data_path}/dev-v1.1.json \ + --learning_rate ${learning_rate} \ + --num_train_epochs ${train_epochs} \ + --seed 1 \ + --fp16 \ + --addr 127.0.0.1 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --max_steps -1 \ + --use_npu \ + --loss_scale 4096 \ + --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ + --do_eval \ + --eval_script ${data_path}/evaluate-v1.1.py \ + --npu_id ${ASCEND_DEVICE_ID} \ + --do_lower_case \ + --output_dir results/SQUAD \ + --config_file bert_base_config.json \ + --num_npu 8 \ + --local_rank $RANK_ID > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done wait - - ASCEND_DEVICE_ID=0 #训练结束时间,不需要修改 end_time=$(date +%s) @@ -133,13 +132,15 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` -FPS=`awk 'BEGIN{printf "%.2f\n",'$batch_size'/'$step_time'*'$RANK_SIZE'}'` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` + +FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` + #打印,不需要修改 echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -157,20 +158,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}">> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 +export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_1p.sh similarity index 66% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_1p.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_1p.sh index 13637a9a0bed697771662ff31c51731d7d83e2f6..a65b67a668d6dfc46354baee105005438c8e25f8 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_1p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_1p.sh @@ -1,7 +1,16 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #集合通信参数,不需要修改 @@ -83,25 +92,29 @@ if [[ $data_path == "" ]];then exit 1 fi +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + #训练开始时间,不需要修改 start_time=$(date +%s) - +ASCEND_DEVICE_ID=0 #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 echo "Device ID: $ASCEND_DEVICE_ID" export RANK_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi @@ -126,9 +139,9 @@ do --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ + --output_dir ${test_path_dir}/results \ --config_file bert_config.json \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --json-summary ${test_path_dir}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -139,7 +152,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` @@ -147,7 +160,7 @@ FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -165,20 +178,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}">> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}">> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_8p.sh similarity index 65% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_8p.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_8p.sh index 8ff876b4c0ce28c6a133fca6849d4c1fb7607ec6..9f0afd7ca695844272f171fff62c4f99e38aed88 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_full_8p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_full_8p.sh @@ -1,7 +1,16 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #集合通信参数,不需要修改 @@ -90,20 +99,27 @@ echo "Device ID: $ASCEND_DEVICE_ID" #创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt +fi + +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh fi #训练开始时间,不需要修改 start_time=$(date +%s) #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ +cd $cur_path/ for i in $(seq 0 7) do @@ -126,12 +142,12 @@ do --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ + --output_dir ${cur_path}/results \ --config_file bert_config.json \ --num_npu 8 \ --local_rank=$i \ --addr=127.0.0.1 \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --json-summary ${test_path_dir}/output/${ASCEND_DEVICE_ID}/dllogger.json > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -143,7 +159,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` @@ -151,7 +167,7 @@ FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -169,20 +185,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}">> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}">> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_1p.sh similarity index 66% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_1p.sh index b98710f5b5a737b6fe79b5c500343689795415f5..a8903d1448da3977b5a6cde9ac663885ac5901b1 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_1p.sh @@ -1,7 +1,16 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi #集合通信参数,不需要修改 export BMMV2_ENABLE=1 @@ -83,25 +92,29 @@ if [[ $data_path == "" ]];then exit 1 fi +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi #训练开始时间,不需要修改 start_time=$(date +%s) - +ASCEND_DEVICE_ID=0 #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ +cd $cur_path/ for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 echo "Device ID: $ASCEND_DEVICE_ID" export RANK_ID=$RANK_ID - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi @@ -111,7 +124,7 @@ do --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \ --bert_model bert-large-uncased \ --do_train \ - --train_file ${data_path}/train-v1.1-min.json \ + --train_file ${data_path}/train-v1.1.json \ --train_batch_size ${batch_size} \ --do_predict \ --predict_batch_size ${batch_size} \ @@ -128,9 +141,9 @@ do --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ + --output_dir ${cur_path}/results \ --config_file bert_config.json \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --json-summary ${test_path_dir}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -141,7 +154,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` @@ -149,7 +162,7 @@ FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -167,20 +180,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_8p.sh similarity index 66% rename from PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_8p.sh rename to PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_8p.sh index 93c3b755e4d869c1024bac32ab15c0c453083e91..36e4c1e661e544ec5dbd16a44c5d294c61933ddd 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_8p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_large_performance_8p.sh @@ -1,7 +1,17 @@ #!/bin/bash -#当前路径,不需要修改 +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + #集合通信参数,不需要修改 export BMMV2_ENABLE=1 @@ -83,11 +93,18 @@ if [[ $data_path == "" ]];then exit 1 fi +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + #训练开始时间,不需要修改 start_time=$(date +%s) - +ASCEND_DEVICE_ID=0 #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ +cd $cur_path for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do #设置环境变量,不需要修改 @@ -98,11 +115,11 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi @@ -112,7 +129,7 @@ do --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \ --bert_model bert-large-uncased \ --do_train \ - --train_file ${data_path}/train-v1.1-min.json \ + --train_file ${data_path}/train-v1.1.json \ --train_batch_size ${batch_size} \ --do_predict \ --predict_batch_size ${batch_size} \ @@ -126,13 +143,13 @@ do --loss_scale 4096 \ --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ --do_eval \ - --eval_script ${data_path}/evaluate-v1.1.py \ + --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ + --output_dir ${cur_path}/results \ --config_file bert_config.json \ --num_npu 8 \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --json-summary ${test_path_dir}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -143,7 +160,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` +step_time=`grep 'step_time : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk '{print$13}'| tail -n+3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` @@ -151,7 +168,7 @@ FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -169,20 +186,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +ActualLoss=`awk 'END {print}' $test_path_dir/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/$ASCEND_DEVICE_ID/${CaseName}.log rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh index ec2c8f254398694a9ca669fb061e7b52fce55ecb..90486f04ff72d589df855bf5743b6b7eebd8c94c 100644 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh +++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_16p.sh @@ -4,8 +4,14 @@ export LD_LIBRARY_PATH=/usr/local/hdf5/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=/usr/local/hdf5/lib:$LIBRARY_PATH export CPATH=/usr/local/hdf5/include:$CPATH export HDF5_DISABLE_VERSION_CHECK=1 -#当前路径,不需要修改 -cur_path=`pwd` + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi #集合通信参数,不需要修改 export BMMV2_ENABLE=1 @@ -13,6 +19,12 @@ export RANK_SIZE=8 export JOB_ID=10087 RANK_ID_START=0 +#非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi # 数据集路径,保持为空,不需要修改 data_path="" @@ -121,7 +133,7 @@ export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",'${device_num}'*'${linux_num}' #进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ +cd $cur_path rank=0 for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do @@ -133,11 +145,11 @@ do #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID/ckpt fi @@ -164,12 +176,12 @@ do --eval_script ${data_path}/evaluate-v1.1.py \ --npu_id ${ASCEND_DEVICE_ID} \ --do_lower_case \ - --output_dir ${cur_path}/../results \ + --output_dir ${cur_path}/results \ --config_file bert_config.json \ --num_npu 16 \ --local_rank=$RANK_ID \ --addr $one_node_ip \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + --json-summary ${test_path_dir}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & let rank++ done wait @@ -181,7 +193,7 @@ e2e_time=$(( $end_time - $start_time )) #结果打印,不需要修改 echo "------------------ Final result ------------------" #输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/0/train_0.log| awk '{print$13}'| tail -n +3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` +step_time=`grep 'step_time : ' $test_path_dir/output/0/train_0.log| awk '{print$13}'| tail -n +3 |awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` FPS=`awk 'BEGIN{printf "%.2f\n", '$batch_size'/'$step_time'*16}'` @@ -189,7 +201,7 @@ FPS=`awk 'BEGIN{printf "%.2f\n", '$batch_size'/'$step_time'*16}'` echo "Final Performance images/sec : $FPS" #输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/0/train_0.log|awk '{print $10}'` +train_accuracy=`grep 'F1 : ' $test_path_dir/output/0/train_0.log|awk '{print $10}'` #打印,不需要修改 echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" @@ -207,20 +219,20 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` #从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/0/train_0.log | awk '{print $19}' > $cur_path/output/0/train_${CaseName}_loss.txt +grep -r "step_loss :" $test_path_dir/output/0/train_0.log | awk '{print $19}' > $test_path_dir/output/0/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`awk 'END {print}' $cur_path/output/0/train_${CaseName}_loss.txt` #关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/0/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/0/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/0/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/0/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/0/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/0/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/0/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/0/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/0/${CaseName}.log +echo "Network = ${Network}" > $test_path_dir/output/0/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $test_path_dir/output/0/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $test_path_dir/output/0/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $test_path_dir/output/0/${CaseName}.log +echo "CaseName = ${CaseName}" >> $test_path_dir/output/0/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $test_path_dir/output/0/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $test_path_dir/output/0/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $test_path_dir/output/0/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $test_path_dir/output/0/${CaseName}.log rm -rf ${data_path}/train-v1.1-min.json_bert-large-uncased_384_128_64 export BMMV2_ENABLE=0 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_bert_8p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_bert_8p.sh deleted file mode 100644 index a1f45f3fec012f3377cc69bca90f059530f24933..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_bert_8p.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` - -#集合通信参数,不需要修改 -export BMMV2_ENABLE=1 -export RANK_SIZE=8 -export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path="" -ckpt_path="" - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="Bert-Squad_ID0470_for_PyTorch" -#训练epoch -train_epochs=1 -#训练batch_size -batch_size=32 -#训练step -train_steps= -#学习率 -learning_rate=2e-4 - - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_fp32_to_fp16" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - -h/--help show help message - " - exit 1 -fi - -#参数校验,不需要修改 -for para in $* -do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` - fi -done - -#校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be confing" - exit 1 -fi - -cp run_squad.py $cur_path/../ -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ -for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); -do - #设置环境变量,不需要修改 - export ASCEND_DEVICE_ID=$RANK_ID - echo "Device ID: $ASCEND_DEVICE_ID" - export RANK_ID=$RANK_ID - - - - #创建DeviceID输出目录,不需要修改 - if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - else - mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt - fi - - - #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path - nohup python3.7 run_squad.py \ - --init_checkpoint ${ckpt_path}/bert_large_pretrained_amp.pt \ - --bert_model bert-large-uncased \ - --do_train \ - --train_file ${data_path}/train-v1.1.json \ - --train_batch_size ${batch_size} \ - --do_predict \ - --predict_batch_size ${batch_size} \ - --predict_file ${data_path}/dev-v1.1.json \ - --learning_rate ${learning_rate} \ - --num_train_epochs ${train_epochs} \ - --seed 1 \ - --fp16 \ - --max_steps 100 \ - --use_npu \ - --loss_scale 4096 \ - --vocab_file ${data_path}/data/uncased_L-24_H-1024_A-16/vocab.txt \ - --do_eval \ - --eval_script ${data_path}/evaluate-v1.1.py \ - --npu_id ${ASCEND_DEVICE_ID} \ - --do_lower_case \ - --output_dir ${cur_path}/../results \ - --config_file bert_config.json \ - --num_npu 8 \ - --json-summary ${cur_path}/output/${ASCEND_DEVICE_ID}/dllogger.json> ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & -done -wait - -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -step_time=`grep 'step_time : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $13}'` - -FPS=`awk 'BEGIN{printf "%d\n", '$batch_size'/'$step_time'*'$RANK_SIZE'}'` - -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep 'F1 : ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $10}'` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p_bert'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -rm -rf ${data_path}/train-v1.1.json_bert-large-uncased_384_128_64 -export BMMV2_ENABLE=0 \ No newline at end of file