From 5e4aaf306902b327663660e4ea46dc670dae53b9 Mon Sep 17 00:00:00 2001
From: hxxhl88 <736544296@qq.com>
Date: Thu, 29 Sep 2022 15:33:57 +0800
Subject: [PATCH] update README.md
---
.../BertNV_Series_for_TensorFlow/README.md | 187 +++++++++++-------
.../src/utils/create_glue_data.py | 4 -
.../src/utils/create_squad_data.py | 6 +-
3 files changed, 115 insertions(+), 82 deletions(-)
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
index 810689cf9..61a89b830 100644
--- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/README.md
@@ -4,6 +4,7 @@
- [Quick Start](#快速上手.md)
- [Transfer Learning Guide](#迁移学习指导.md)
- [Advanced Reference](#高级参考.md)
+
Basic Information
**Publisher: Huawei**
@@ -26,7 +27,7 @@
**Categories: Benchmark**
-**Description: BERT-Base and downstream task code based on the TensorFlow framework**
+**Description: BERT pre-training and downstream task code based on the TensorFlow framework**
Overview
@@ -43,7 +44,7 @@
- Implementation adapted for the Ascend AI Processor:
- https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow
+ https://gitee.com/hxxhl88/ModelZoo-TensorFlow/tree/master/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow
- Obtain the code for the corresponding commit\_id via Git as follows:
@@ -57,35 +58,18 @@
## Default Configuration
-- Network structure
-
- Learning rate: 1e-5, with polynomial decay
-
- Optimizer: Adam
-
- Optimizer weight decay: 0.01
-
- Optimizer epsilon: 1e-4
-
- Single-card batch size: 128
-
- 32-card batch size: 128*32
-
- Total steps: 500000
-
- Warmup steps: 10000
-
-- Training dataset preprocessing (using wikipedia as an example, for reference only):
-
- In principle, users may define the sequence length themselves
-
- With the common setting of 128, 20 tokens are masked as targets for autoencoding recovery.
-
- Downstream task preprocessing is up to the user's needs.
-
-- Test dataset preprocessing (using wikipedia as an example, for reference only):
-
- Same as the training dataset preprocessing.
+- Network structure
+ - 24-layer, 1024-hidden, 16-heads, 340M parameters
+- Training hyperparameters (single card):
+ - Batch size: 24
+ - max_predictions_per_seq: 80
+ - max_seq_length: 512
+ - Learning rate (LR): 5e-5, polynomial decay
+ - Optimizer: Adam
+ - Weight decay: 0.01
+ - beta_1: 0.9
+ - beta_2: 0.999
+ - Train epoch: 1
## Supported Features
@@ -149,52 +133,100 @@
Quick Start
-- Click "Download Now" to download the source package.
+## Dataset Preparation
-- Dataset preparation
+1. Prepare the datasets yourself; this network covers BERT pre-training and fine-tuning tasks.
-The dataset is plain text with paragraphs separated by blank lines, e.g. wikipedia.
-Run the following command to convert the dataset to tfrecord format.
+2. The pre-training task uses the wikipedia-en dataset; fine-tuning uses MRPC, MNLI, CoLA, SQuAD1.1, and SQuAD2.0.
-```
- python src/create_pretraining_data.py \
- --input_file= \
- --output_file=/some_output_data.tfrecord \
- --vocab_file= \
- --do_lower_case=True \
- --max_seq_length=128 \
- --max_predictions_per_seq=20 \
- --masked_lm_prob=0.15 \
- --random_seed=12345 \
- --dupe_factor=5
+```shell
+# To improve end-to-end training efficiency, SQuAD1.1 and SQuAD2.0 are converted to tfrecord in advance, as follows:
+cd ${work_path}
+python3 ${work_path}/src/utils/create_squad_data.py --train_file=${data_path}/train-v1.1.json \
+ --predict_file=${data_path}/dev-v1.1.json \
+ --vocab_file=${model_path}/vocab.txt
```
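+
+The GLUE fine-tuning sets (MRPC, MNLI, CoLA) can be converted the same way with src/utils/create_glue_data.py. The sketch below assumes the usual GLUE-conversion flags (--task_name, --data_dir, --vocab_file, --output_dir); verify them against the script's own flag definitions before use:
+
+```shell
+# Sketch: convert MRPC to tfrecord; flag names are assumed, check create_glue_data.py
+cd ${work_path}
+python3 ${work_path}/src/utils/create_glue_data.py --task_name=MRPC \
+                                                   --data_dir=${data_path}/MRPC \
+                                                   --vocab_file=${model_path}/vocab.txt \
+                                                   --output_dir=${data_path}/MRPC
+```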
-- Model training
-- Before starting training, configure the environment variables required by the program.
+3. For the models and datasets used in BERT training, see "Overview -> Reference Implementation".
- For environment variable configuration, see:
+## Model Training
- [Ascend 910 training platform environment variable setup](https://gitee.com/ascend/ModelZoo-TensorFlow/wikis/01.%E8%AE%AD%E7%BB%83%E8%84%9A%E6%9C%AC%E8%BF%81%E7%A7%BB%E6%A1%88%E4%BE%8B/Ascend%20910%E8%AE%AD%E7%BB%83%E5%B9%B3%E5%8F%B0%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F%E8%AE%BE%E7%BD%AE)
+- Click "Download Now" and choose a suitable download method for the source package.
+- Start training.
- Configure the environment variables in test/train_*.sh
+ - Before starting training, configure the environment variables required by the program.
- - Single-card training
-
- 1. Set data_path in test/train_ID0060_BertBase_performance_1p.sh to the user's dataset path
-
- 2. For single-card training, run the following command in the test directory:
- ```
- bash train_ID0060_BertBase_performance_1p.sh
- ```
+ For environment variable configuration, see:
+
+ [Ascend 910 training platform environment variable setup](https://gitee.com/ascend/modelzoo/wikis/Ascend%20910%E8%AE%AD%E7%BB%83%E5%B9%B3%E5%8F%B0%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F%E8%AE%BE%E7%BD%AE?sort_id=3148819)
+
+ - Single-card training
+ The network covers 18 training tasks in total: 8 pre-training tasks and 10 fine-tuning tasks.
+
+ **Pre-training tasks**:
+
+ ```shell
+ #ID0060: num_hidden_layers=12 max_seq_length=128 optimizer=Adam
+ bash train_ID0060_BertBase_performance_1p.sh --data_path=/home
+
+ #ID3067: num_hidden_layers=24 max_seq_length=128 optimizer=Adam
+ bash train_ID3067_BertLarge-128_performance_1p.sh --data_path=/home
+
+ #ID3068: num_hidden_layers=24 max_seq_length=512 optimizer=lamb phase2
+ bash train_ID3068_BertLarge-512_performance_1p.sh --data_path=/home
+
+ #ID3069: num_hidden_layers=12 max_seq_length=512 optimizer=lamb phase2
+ bash train_ID3069_BertBase-512_performance_1p.sh --data_path=/home
+
+ #ID3206: num_hidden_layers=12 max_seq_length=512 optimizer=Adam
+ bash train_ID3206_BertBase-512_performance_1p.sh --data_path=/home
+
+ #ID3207: num_hidden_layers=24 max_seq_length=512 optimizer=Adam
+ bash train_ID3207_BertLarge-512_performance_1p.sh --data_path=/home
+
+ #ID3208: num_hidden_layers=12 max_seq_length=128 optimizer=lamb phase1
+ bash train_ID3208_BertBase-128_performance_1p.sh --data_path=/home
+
+ #ID3209: num_hidden_layers=24 max_seq_length=128 optimizer=lamb phase1
+ bash train_ID3209_BertLarge-128_performance_1p.sh --data_path=/home
+ ```
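+ Note: phase1/phase2 above follow the common two-phase BERT pre-training scheme, where phase1 trains at max_seq_length=128 and phase2 continues from a phase1 checkpoint at max_seq_length=512.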
+
+ **Fine-tuning tasks**:
+
+ ```shell
+ #ID1641: MRPC num_hidden_layers=24 max_seq_length=128 optimizer=Adam
+ bash train_ID1641_BertLarge-128_performance_1p.sh --data_path=/home
+
+ #ID3232: MRPC num_hidden_layers=12 max_seq_length=128 optimizer=Adam
+ bash train_ID3232_BertBase-128_performance_1p.sh --data_path=/home
+
+ #ID1642: MNLI num_hidden_layers=24 max_seq_length=128 optimizer=Adam
+ bash train_ID1642_BertLarge-128_performance_1p.sh --data_path=/home
+
+ #ID3233: MNLI num_hidden_layers=12 max_seq_length=128 optimizer=Adam
+ bash train_ID3233_BertBase-128_performance_1p.sh --data_path=/home
+
+ #ID1643: CoLA num_hidden_layers=24 max_seq_length=128 optimizer=Adam
+ bash train_ID1643_BertLarge-128_performance_1p.sh --data_path=/home
+
+ #ID3234: CoLA num_hidden_layers=12 max_seq_length=128 optimizer=Adam
+ bash train_ID3234_BertBase-128_performance_1p.sh --data_path=/home
+
+ #ID3217: SQuAD1.1 num_hidden_layers=12 max_seq_length=384 optimizer=Adam
+ bash train_ID3217_BertBase-Squad1.1_performance_1p.sh --data_path=/home
+
+ #ID3218: SQuAD1.1 num_hidden_layers=24 max_seq_length=384 optimizer=Adam
+ bash train_ID3218_BertLarge-Squad1.1_performance_1p.sh --data_path=/home
+
+ #ID3219: SQuAD2.0 num_hidden_layers=12 max_seq_length=384 optimizer=Adam
+ bash train_ID3219_BertBase-Squad2.0_performance_1p.sh --data_path=/home
+
+ #ID3220: SQuAD2.0 num_hidden_layers=24 max_seq_length=384 optimizer=Adam
+ bash train_ID3220_BertLarge-Squad2.0_performance_1p.sh --data_path=/home
+ ```
-- 8-card training
- 1. Set data_path in test/train_ID0060_BertBase_performance_8p.sh to the user's dataset path
- 2. For 8-card training, run the following command in the test directory:
- ```
- bash train_ID0060_BertBase_performance_8p.sh
- ```
Advanced Reference
@@ -205,17 +237,24 @@
│ ├──bert_large_config.json //bert large model configuration file
│ ├──bert_base_vocab.txt //bert base Chinese vocabulary
├── src
+ │ ├──utils
+ │ │ ├──create_pretraining_data.py //script to generate pre-training data
+ │ │ ├──create_glue_data.py //script to convert GLUE datasets to tfrecord
+ │ │ ├──create_squad_data.py //script to convert SQuAD datasets to tfrecord
+ │ │ ├──dllogger_class.py //DLLogger wrapper for training logs
+ │ │ ├──gpu_affinity.py //sets GPU affinity
+ │ │ ├──utils.py //common utility script
│ ├──gpu_environment.py //original gpu_environment settings
- │ ├──create_pretraining_data.py //script to generate pre-training data
│ ├──modeling.py //BERT model script
│ ├──optimization.py //optimizer script
│ ├──extract_features.py //feature extraction script
│ ├──fp16_utils.py //fp16 utils script
│ ├──fused_layer_norm.py //fused layer norm script
│ ├──run_pretraining.py //pre-training launch script
+ │ ├──run_classifier.py //downstream classification task script
+ │ ├──run_squad.py //downstream SQuAD task script
│ ├──tf_metrics.py //tf metrics script
│ ├──tokenization.py //tokenizer script
- │ ├──utils.py //utils script
├── CONTRIBUTING.md //CONTRIBUTING.md
├── LICENCE //LICENCE
├── NOTICE //NOTICE
@@ -226,13 +265,13 @@
```
- --train_batch_size=128 \ #batch size per NPU, default: 128
- --learning_rate=1e-4 \ #learning rate, default: 1e-4
- --num_warmup_steps=10000 \ # number of warmup steps, default: 10000
- --num_train_steps=500000 \ #number of training steps, default (1P): 500000
- --input_files_dir=/autotest/CI_daily/ModelZoo_BertBase_TF/data/wikipedia_128 \ #training dataset path
- --eval_files_dir=/autotest/CI_daily/ModelZoo_BertBase_TF/data/wikipedia_128 \ #evaluation dataset path
- --iterations_per_loop=100 \ #iterations sunk to the device per loop on NPU, default: 1000
+ --train_batch_size=128 \ # batch size per NPU
+ --learning_rate=1e-4 \ # learning rate
+ --num_warmup_steps=10000 \ # number of warmup steps
+ --num_train_steps=500000 \ # number of training steps
+ --input_files_dir=xxxx \ # training dataset path
+ --eval_files_dir=xxxx \ # evaluation dataset path
+ --iterations_per_loop=100 \ # iterations sunk to the device per loop on NPU
```
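+
+For reference, the parameters above correspond to the pre-training launch script, so a direct single-card launch might look as follows. Treat this as a sketch only: the config-file path is assumed from the repo layout, additional flags may be required, and the test/train_*.sh wrappers remain the supported entry points:
+
+```shell
+# Sketch of a direct launch; in practice use the test/train_*.sh scripts
+python3 src/run_pretraining.py \
+    --bert_config_file=configs/bert_base_config.json \
+    --train_batch_size=128 \
+    --learning_rate=1e-4 \
+    --num_warmup_steps=10000 \
+    --num_train_steps=500000 \
+    --input_files_dir=/path/to/wikipedia_128 \
+    --eval_files_dir=/path/to/wikipedia_128 \
+    --iterations_per_loop=100
+```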
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_glue_data.py b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_glue_data.py
index 99b70f2a7..c1ebe154f 100644
--- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_glue_data.py
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_glue_data.py
@@ -41,7 +41,6 @@ import optimization
import tokenization
import six
import tensorflow as tf
-#import horovod.tensorflow as hvd
import time
import csv
@@ -525,7 +524,4 @@ def main():
predict_file)
if __name__ == "__main__":
- (npu_sess, npu_shutdown) = init_resource()
main()
- shutdown_resource(npu_sess, npu_shutdown)
- close_session(npu_sess)
diff --git a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_squad_data.py b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_squad_data.py
index a65748c0e..710a75016 100644
--- a/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_squad_data.py
+++ b/TensorFlow/built-in/nlp/BertNV_Series_for_TensorFlow/src/utils/create_squad_data.py
@@ -41,8 +41,9 @@ import optimization
import tokenization
import six
import tensorflow as tf
-#import horovod.tensorflow as hvd
import time
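+# Make the parent src directory importable so sibling modules (tokenization,
+# optimization) resolve when running this script from src/utils; note this
+# assumes "os" is already imported at the top of the file.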
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
flags = tf.flags
FLAGS = None
@@ -575,7 +576,4 @@ def main():
eval_writer.close()
if __name__ == "__main__":
- (npu_sess, npu_shutdown) = init_resource()
main()
- shutdown_resource(npu_sess, npu_shutdown)
- close_session(npu_sess)
--
Gitee