From 5093486baa745e05c828c230f3ad735acb119620 Mon Sep 17 00:00:00 2001 From: yang-shenwu <1784196654@qq.com> Date: Wed, 30 Mar 2022 09:31:30 +0800 Subject: [PATCH] =?UTF-8?q?Bert-base=E7=BD=91=E7=BB=9C=E5=A2=9E=E5=8A=A0ov?= =?UTF-8?q?erdump=E7=9A=84=E9=80=89=E9=A1=B9=EF=BC=8C=E5=AF=B9=E5=8E=9F?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E4=B8=8D=E5=8F=97=E5=BD=B1=E5=93=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/pretrain/run_pretraining.py | 9 +++++++++ .../test/train_full_1p.sh | 2 ++ .../test/train_performance_1p.sh | 2 ++ 3 files changed, 13 insertions(+) diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py index 602da0127..b2ac9e6cc 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py @@ -128,6 +128,10 @@ flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic o flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +flags.DEFINE_bool("over_dump", False, "Whether to enable overflow.") + +flags.DEFINE_string("over_dump_path", None, "path to save overflow dump files.") + flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.") flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.") @@ -605,6 +609,10 @@ def main(_): raise ValueError("AMP and Manual Mixed Precision Training are both activated! 
Error") is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.over_dump: + dump_config = DumpConfig(enable_dump_debug = True, dump_path = FLAGS.over_dump_path, dump_debug_mode = "all") + else: + dump_config = DumpConfig(enable_dump_debug = False, dump_path = FLAGS.over_dump_path, dump_debug_mode = "all") config = tf.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) @@ -621,6 +629,7 @@ def main(_): #run_config = tf.estimator.RunConfig( run_config = NPURunConfig( + dump_config=dump_config, model_dir=FLAGS.output_dir, save_summary_steps=0, session_config=config, diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh index 8fb89982b..2cfc4af1a 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh @@ -138,6 +138,8 @@ do --npu_bert_clip_by_global_norm=False \ --distributed=False \ --npu_bert_loss_scale=0 \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh index 8fb89982b..2cfc4af1a 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh @@ -138,6 +138,8 @@ do --npu_bert_clip_by_global_norm=False \ --distributed=False \ --npu_bert_loss_scale=0 \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ 
--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait -- Gitee