diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py
index 8c70b73c91bc29a7547406b6c652ca7d416f1a72..a15ea029928adebb1fe624e29cc393e16c4defc3 100644
--- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py
+++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/run_squad.py
@@ -781,8 +781,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
     unique_id_to_result[result.unique_id] = result
 
   # process unique id issue
-  max_unique_id = all_results[-1].unique_id
-  print("max_unique_id=%d" % max_unique_id)
+  #max_unique_id = all_results[-1].unique_id
+  #print("max_unique_id=%d" % max_unique_id)
 
   _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
       "PrelimPrediction",
@@ -802,8 +802,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
     null_start_logit = 0  # the start logit at the slice with min null score
     null_end_logit = 0  # the end logit at the slice with min null score
     for (feature_index, feature) in enumerate(features):
-      if feature.unique_id > max_unique_id:
-        continue
+      #if feature.unique_id > max_unique_id:
+      #continue
       result = unique_id_to_result[feature.unique_id]
       start_indexes = _get_best_indexes(result.start_logits, n_best_size)
       end_indexes = _get_best_indexes(result.end_logits, n_best_size)
@@ -1336,7 +1336,7 @@ def main(_):
         input_file=eval_writer.filename,
         seq_length=FLAGS.max_seq_length,
         is_training=False,
-        drop_remainder=True)
+        drop_remainder=False)
 
     # If running eval on the TPU, you will need to specify the number of
     # steps.
diff --git a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh
index 4c5aca93b9d8023c66da8d3dcd670ad13cb6a2e6..e4ea4691d37cdc47bd35a620adb6093e6b9043ab 100644
--- a/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh
+++ b/TensorFlow/built-in/nlp/BertGoogle_Series_for_TensorFlow/test/train_ID0495_Bert-Squad_full_1p.sh
@@ -100,7 +100,7 @@ do
     nohup python3.7 ${parent_path}/run_squad.py \
         --vocab_file=$vocab_file \
         --bert_config_file=$bert_config_file \
-        --init_checkpoint=$init_checkpoint \\
+        --init_checkpoint=$init_checkpoint \
         --read_tf_record=True \
         --train_file=$train_file \
         --do_predict=True \
diff --git a/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh
index 6508764ec9ef516ffd90f5f1dd194033678a2ab4..00d89de53456cb891aa83e96cfcf7874037db869 100644
--- a/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh
+++ b/TensorFlow/built-in/nlp/Bertsquad_ID0495_for_TensorFlow/test/train_performance_1p.sh
@@ -175,7 +175,8 @@
 ActualFPS=`echo "scale=2;${fps} * ${batch_size}"|bc`
 temp1=`echo "1000 * ${batch_size}"|bc`
 TrainingTime=`echo "scale=2;${temp1} / ${ActualFPS}"|bc`
-ActualLoss=`grep "loss =" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'loss =' '{print $2}'|awk 'END {print $1}'|tr -d ,`
+ActualLoss=`grep "Loss for final step:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk -F 'Loss for final step:' '{print $2}'|awk 'END {print $1}'`
+ActualLoss=${ActualLoss%.*}
 
 #Print the key results to ${CaseName}.log; no modification needed
 echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh
index e2f0108a087a8b1d387108546237a8e31d557ac4..c4378bebf7c4aa50b9caf4c7c133d6684ab06714 100644
--- a/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh
+++ b/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/test/train_performance_squad1.1_base_8p.sh
@@ -15,13 +15,13 @@ RANK_ID_START=0
 export RANK_SIZE=8
 export RANK_TABLE_FILE=${cur_path}/configs/rank_table_8p.json
 #Performance tuning
-export NPU_LOOP_SIZE=100
+export NPU_LOOP_SIZE=25
 #Training epochs (optional)
 train_epochs=1
 #Training steps
-train_steps=300
+#train_steps=600
 #Learning rate
-learning_rate=8e-5
+learning_rate=64e-5
 ckpt_path=""
 
 #Parameter configuration
@@ -121,6 +121,18 @@ do
         mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID
     fi
 
+    #Core binding: delete this block for models that do not need binding; adjust it to the actual hardware for models that do
+    cpucount=`lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}'`
+    cpustep=`expr $cpucount / 8`
+    echo "taskset c steps:" $cpustep
+    let a=RANK_ID*$cpustep
+    let b=RANK_ID+1
+    let c=b*$cpustep-1
+
+    if [ "x${bind_core}" != x ];then
+        bind_core="taskset -c $a-$c"
+    fi
+
     nohup python3 ./official/nlp/bert/run_squad.py \
         --mode=${MODE} \
         --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
@@ -134,9 +146,8 @@
         --learning_rate=${learning_rate} \
         --num_train_epochs=${train_epochs} \
         --model_dir=$cur_path/test/output/$ASCEND_DEVICE_ID/ckpt \
-        --log_steps=100 \
+        --log_steps=200 \
         --steps_per_loop=${NPU_LOOP_SIZE} \
-        --train_steps=${train_steps} \
         --num_gpus=1 \
         --distribution_strategy=one_device \
         --sub_model_export_name=sub_model \
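
Note on the core-binding block added to the 8p script: it splits the host's logical CPUs evenly across the eight ranks and builds a "taskset -c" prefix that pins each rank's training process to its own contiguous slice of cores, so the workers do not migrate onto the same cores. The point where ${bind_core} actually prefixes the launch command lies outside the hunk shown. Below is a minimal standalone sketch of the same arithmetic, assuming RANK_ID is set per worker in 0..7; "python3 train.py" is a placeholder, not the script's real command:

#!/bin/bash
# Sketch of the per-rank CPU binding; assumes 8 ranks and RANK_ID in 0..7.
cpucount=$(lscpu | grep "CPU(s):" | head -n 1 | awk '{print $2}')  # total logical CPUs
cpustep=$((cpucount / 8))              # cores available to each rank
a=$((RANK_ID * cpustep))               # first core of this rank's slice
c=$(((RANK_ID + 1) * cpustep - 1))     # last core of this rank's slice
echo "rank ${RANK_ID}: binding to cores ${a}-${c}"
taskset -c ${a}-${c} python3 train.py  # placeholder training command

With 96 logical CPUs, for example, cpustep is 12 and rank 2 is pinned to cores 24-35.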