From 98ae26203ba211a9602945e6d5bcace0bfdb70de Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 19 May 2023 08:26:29 +0000 Subject: [PATCH 1/8] update /train_performance_squad1.1_base_8p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_squad1.1_base_8p.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh index e2f0108a0..4b99d0d1b 100644 --- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh +++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh @@ -15,13 +15,13 @@ RANK_ID_START=0 export RANK_SIZE=8 export RANK_TABLE_FILE=${cur_path}/configs/rank_table_8p.json #性能优化 -export NPU_LOOP_SIZE=100 +export NPU_LOOP_SIZE=25 #训练epoch,可选 train_epochs=1 #训练step -train_steps=300 +train_steps=400 #学习率 -learning_rate=8e-5 +learning_rate=64e-5 ckpt_path="" #参数配置 -- Gitee From bcee7d98dd16ec52d5ad0d9905c686bcc38b930a Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 19 May 2023 09:11:13 +0000 Subject: [PATCH 2/8] update train_performance_squad1.1_base_8p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_squad1.1_base_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh index 4b99d0d1b..0ca2f36de 100644 --- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh +++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh @@ -19,7 +19,7 @@ export NPU_LOOP_SIZE=25 #训练epoch,可选 train_epochs=1 #训练step -train_steps=400 +train_steps=600 #学习率 learning_rate=64e-5 ckpt_path="" -- Gitee From 2243d6a463d24b112113dffabe20bd333b13446b Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 19 May 2023 09:26:14 +0000 Subject: [PATCH 3/8] update train_performance_squad1.1_base_8p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_squad1.1_base_8p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh index 0ca2f36de..159577614 100644 --- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh +++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh @@ -134,7 +134,7 @@ do --learning_rate=${learning_rate} \ --num_train_epochs=${train_epochs} \ --model_dir=$cur_path/test/output/$ASCEND_DEVICE_ID/ckpt \ - --log_steps=100 \ + --log_steps=200 \ --steps_per_loop=${NPU_LOOP_SIZE} \ --train_steps=${train_steps} \ --num_gpus=1 \ -- Gitee From 677fe1c4649bf18682bc9a4acd309e9c16fa5486 Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 19 May 2023 09:42:04 +0000 Subject: [PATCH 4/8] update bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_squad1.1_base_8p.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh index 159577614..98643e7a6 100644 --- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh +++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh @@ -121,6 +121,18 @@ do mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID fi + #绑核,不需要绑核的模型删除,需要绑核的模型根据实际修改 + cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'` + cpustep=`expr $cpucount / 8` + echo "taskset c steps:" $cpustep + let a=RANK_ID*$cpustep + let b=RANK_ID+1 + let c=b*$cpustep-1 + + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + nohup python3 ./official/nlp/bert/run_squad.py \ --mode=${MODE} \ --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \ -- Gitee From 0d81dddcbd7e7206f6f5a87443b2273a1c50a25b Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 19 May 2023 10:15:44 +0000 Subject: [PATCH 5/8] update bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_squad1.1_base_8p.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh index 98643e7a6..c4378bebf 100644 --- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh +++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh @@ -19,7 +19,7 @@ export NPU_LOOP_SIZE=25 #训练epoch,可选 train_epochs=1 #训练step -train_steps=600 +#train_steps=600 #学习率 learning_rate=64e-5 ckpt_path="" @@ -148,7 +148,6 @@ do --model_dir=$cur_path/test/output/$ASCEND_DEVICE_ID/ckpt \ --log_steps=200 \ --steps_per_loop=${NPU_LOOP_SIZE} \ - --train_steps=${train_steps} \ --num_gpus=1 \ --distribution_strategy=one_device \ --sub_model_export_name=sub_model \ -- Gitee From dd20c9f24887393b56f138a9c9e0e5e048d48a77 Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Sat, 20 May 2023 08:27:07 +0000 Subject: [PATCH 6/8] update BertGoogle_Series_for_TensorFlow/run_squad.py. Signed-off-by: liuyihang <1905527319@qq.com> --- .../nlp/BertGoogle_Series_for_TensorFlow/run_squad.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py index 8c70b73c9..a15ea0299 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py @@ -781,8 +781,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, unique_id_to_result[result.unique_id] = result # process unique id issue - max_unique_id = all_results[-1].unique_id - print("max_unique_id=%d" % max_unique_id) + #max_unique_id = all_results[-1].unique_id + #print("max_unique_id=%d" % max_unique_id) _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", @@ -802,8 +802,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score for (feature_index, feature) in enumerate(features): - if feature.unique_id > max_unique_id: - continue + #if feature.unique_id > max_unique_id: + #continue result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) @@ -1336,7 +1336,7 @@ def main(_): input_file=eval_writer.filename, seq_length=FLAGS.max_seq_length, is_training=False, - drop_remainder=True) + drop_remainder=False) # If running eval on the TPU, you will need to specify the number of # steps. -- Gitee From 7e8f76ee0ec360fc6c5a8d864492c274b88b523b Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Sat, 20 May 2023 08:43:43 +0000 Subject: [PATCH 7/8] update /test/train_ID0495_Bert-Squad_full_1p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_ID0495_Bert-Squad_full_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh index 4c5aca93b..e4ea4691d 100644 --- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh +++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh @@ -100,7 +100,7 @@ do nohup python3.7 ${parent_path}/run_squad.py \ --vocab_file=$vocab_file \ --bert_config_file=$bert_config_file \ - --init_checkpoint=$init_checkpoint \\ + --init_checkpoint=$init_checkpoint \ --read_tf_record=True \ --train_file=$train_file \ --do_predict=True \ -- Gitee From f2a1f24bb63b71a566ca08349bbedd4753c71a3d Mon Sep 17 00:00:00 2001 From: liuyihang <1905527319@qq.com> Date: Fri, 26 May 2023 03:25:41 +0000 Subject: [PATCH 8/8] update Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh. Signed-off-by: liuyihang <1905527319@qq.com> --- .../test/train_performance_1p.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh index 6508764ec..00d89de53 100644 --- a/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh @@ -175,7 +175,8 @@ ActualFPS=`echo "scale=2;${fps} * ${batch_size}"|bc` temp1=`echo "1000 * ${batch_size}"|bc` TrainingTime=`echo "scale=2;${temp1} / ${ActualFPS}"|bc` -ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,` +ActualLoss=`grep "Loss for final step:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'Loss for final step:' '{print $2}'|awk 'END {print $1}'` +ActualLoss=${ActualLoss%.*} #关键信息打印到${CaseName}.log中,不需要修改 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -- Gitee