From 5093486baa745e05c828c230f3ad735acb119620 Mon Sep 17 00:00:00 2001 From: yang-shenwu <1784196654@qq.com> Date: Wed, 30 Mar 2022 09:31:30 +0800 Subject: [PATCH] =?UTF-8?q?Bert-base=E7=BD=91=E7=BB=9C=E5=A2=9E=E5=8A=A0ov?= =?UTF-8?q?erdump=E7=9A=84=E9=80=89=E9=A1=B9=EF=BC=8C=E5=AF=B9=E5=8E=9F?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E4=B8=8D=E5=8F=97=E5=BD=B1=E5=93=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/pretrain/run_pretraining.py | 9 +++++++++ .../test/train_full_1p.sh | 2 ++ .../test/train_performance_1p.sh | 2 ++ 3 files changed, 13 insertions(+) diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py index 602da0127..b2ac9e6cc 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/src/pretrain/run_pretraining.py @@ -128,6 +128,10 @@ flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic o flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +flags.DEFINE_bool("over_dump", False, "Whether to enable overflow.") + +flags.DEFINE_string("over_dump_path", None, "path to save overflow dump files.") + flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.") flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.") @@ -605,6 +609,10 @@ def main(_): raise ValueError("AMP and Manual Mixed Precision Training are both activated! 
Error") is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.over_dump: + dump_config = DumpConfig(enable_dump_debug = True, dump_path = FLAGS.over_dump_path, dump_debug_mode = "all") + else: + dump_config = DumpConfig(enable_dump_debug = False, dump_path = FLAGS.over_dump_path, dump_debug_mode = "all") config = tf.ConfigProto() if FLAGS.horovod: config.gpu_options.visible_device_list = str(hvd.local_rank()) @@ -621,6 +629,7 @@ def main(_): #run_config = tf.estimator.RunConfig( run_config = NPURunConfig( + dump_config=dump_config, model_dir=FLAGS.output_dir, save_summary_steps=0, session_config=config, diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh index 8fb89982b..2cfc4af1a 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_full_1p.sh @@ -138,6 +138,8 @@ do --npu_bert_clip_by_global_norm=False \ --distributed=False \ --npu_bert_loss_scale=0 \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ --output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait diff --git a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh index 8fb89982b..2cfc4af1a 100644 --- a/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/nlp/Bert-base_ID0060_for_TensorFlow/test/train_performance_1p.sh @@ -138,6 +138,8 @@ do --npu_bert_clip_by_global_norm=False \ --distributed=False \ --npu_bert_loss_scale=0 \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ 
--output_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait -- Gitee