From 41a72b926973bb8eb7f6fb8283f4b06d767eae64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:16:52 +0000 Subject: [PATCH 01/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20contrib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/.keep diff --git a/contrib/.keep b/contrib/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From a94764d342192e388d6f821315c530de918e1924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:17:22 +0000 Subject: [PATCH 02/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20TensorFlow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/.keep diff --git a/contrib/TensorFlow/.keep b/contrib/TensorFlow/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From b9381311f25b00dee5b232eac247ccd9a7888fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:17:41 +0000 Subject: [PATCH 03/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20Research?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/Research/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/.keep diff --git a/contrib/TensorFlow/Research/.keep b/contrib/TensorFlow/Research/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From f80bdc871c740e40be6f74cea23defb2c4fdd9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:17:55 +0000 Subject: [PATCH 04/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20nlp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/Research/nlp/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/nlp/.keep diff --git a/contrib/TensorFlow/Research/nlp/.keep b/contrib/TensorFlow/Research/nlp/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 081929166d721c2eb930e1e8ed47b5730e0cdd11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:18:06 +0000 Subject: [PATCH 05/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20enas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/Research/nlp/enas/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/nlp/enas/.keep diff --git a/contrib/TensorFlow/Research/nlp/enas/.keep b/contrib/TensorFlow/Research/nlp/enas/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From f1aefca0a56049f33dbf36045c63bcadb06d6021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:18:18 +0000 Subject: [PATCH 06/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20ENAS=5FID2053=5Ffor?= =?UTF-8?q?=5FTensorFlow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep 
b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 503900ce48efefcfc91143030b68ec0a8d4d9ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:19:36 +0000 Subject: [PATCH 07/27] add source code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 207 ++++++ .../ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py | 36 + .../enas/ENAS_ID2053_for_TensorFlow/bash.py | 4 + .../boot_modelarts.py | 73 ++ .../enas/ENAS_ID2053_for_TensorFlow/child.py | 440 ++++++++++++ .../ENAS_ID2053_for_TensorFlow/ckpt2pb.py | 81 +++ .../ENAS_ID2053_for_TensorFlow/controller.py | 250 +++++++ .../ENAS_ID2053_for_TensorFlow/data_utils.py | 125 ++++ .../enas/ENAS_ID2053_for_TensorFlow/fixed.py | 318 +++++++++ .../ENAS_ID2053_for_TensorFlow/fixed_lib.py | 652 ++++++++++++++++++ .../help_modelarts.py | 93 +++ .../enas/ENAS_ID2053_for_TensorFlow/lstm.py | 174 +++++ .../ENAS_ID2053_for_TensorFlow/lstm_lib.py | 458 ++++++++++++ .../ENAS_ID2053_for_TensorFlow/process.py | 72 ++ .../enas/ENAS_ID2053_for_TensorFlow/search.py | 288 ++++++++ .../enas/ENAS_ID2053_for_TensorFlow/search.sh | 36 + .../ENAS_ID2053_for_TensorFlow/test-npu.sh | 45 ++ .../enas/ENAS_ID2053_for_TensorFlow/utils.py | 67 ++ 18 files changed, 3419 insertions(+) create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md new file mode 100644 index 000000000..50d2547b0 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -0,0 +1,207 @@ +###基本信息 +####发布者(Publisher):Huawei +####应用领域(Application Domain):NLP 
+####修改时间(Modified) :2018. +####框架(Framework):TensorFlow 1.15.0 +####模型格式(Model Format):ckpt +####精度(Precision):Mixed +####处理器(Processor):昇腾910 +####应用级别(Categories):Research +####描述(Description): enas模型用于ptb数据集的神经网络结构搜索 + + +###概述 +enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. +- 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +- 参考代码:[enas](https://github.com/melodyguan/enas) + +###默认配置 +####数据预处理 + - #### 输入数据为文本 + - #### 文本输入格式: id [int] +#### 训练超参数 + - #### search + - #### controller baseline decay : 0.999 + - #### controller entropy weight : 1e-5 + - #### controller temperature : 5 + - #### controller learning rate : 5e-5 + - #### controller num layers : 9 + - #### controller hidden size : 64 + - #### controller num functions : 4 + - #### child batch size : 128 + - #### child bptt steps : 35 + - #### num train epochs : 600 + - ####test + - #### child grad bound : 0.25 + - #### child weight decay : 2e-6 + - #### child num train epochs :3000 + - #### child hidden size : 800 + - #### learning_rate : 20. + +###支持特性 + +| 特性列表 | 是否支持 | +|------|------| +| 混合精度 | 是 | + +###混合精度训练 +#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 + +###快速上手 +####模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 + +###代码结构文件 +#### +|— search.py 搜索模型代码\ +|— child.py 子图模型代码\ +|— fixed.py 架构验证模型代码\ +|— fixed_lib.py\ +|— data_utils.py 数据处理代码\ +|— controller.py 性能评估模型代码\ +|— boot_modelarts.py 模型运行代码\ +|— ... + +###脚本参数 +#### +- search:\ +--data_path\ +--output_dir\ +--obs_dir +- test:\ +--data_path\ +--output_dir\ +--fixed_arc\ +--ckp_path + + + +###训练过程 +在论文的参数设置下,GPU训练精度和速度可以达到要求; +NPU的训练精度和速度还未达标。 +- #### GPU +#### search +epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ +valid_ppl=1800.73\ +epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ +valid_ppl=892.87\ +epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ +valid_ppl=843.70\ +epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ +valid_ppl=898.45\ +epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ +valid_ppl=774.25\ +epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ +valid_ppl=622.82\ +epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ +valid_ppl=606.77\ +epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ +valid_ppl=579.69\ +epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ +valid_ppl=520.63\ +epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ +...\ +valid_ppl=162.39\ +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 +#### test +epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ +epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ +valid_ppl=463.03\ +epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ +epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ +valid_ppl=339.76\ +epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ +epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ +valid_ppl=271.71\ +epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ +epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ +valid_ppl=245.00\ +epoch=4 step=1800 ppl=321.47 lr=17.143 
|g|=0.238 avg=0 mins=2.87\ +epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ +valid_ppl=213.10\ +epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ +epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ +epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ +valid_ppl=209.90\ +epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ +epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ +valid_ppl=181.99\ +epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ +epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ +valid_ppl=176.79\ +epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ +epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ +valid_ppl=166.62\ +...\ +epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ +epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ +valid_ppl=61.17\ +epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ +epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ +epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ +valid_ppl=61.17\ +epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ +...\ +step=70000 test_ppl=59.15\ +step=71000 test_ppl=59.03\ +step=72000 test_ppl=59.06\ +step=73000 test_ppl=58.41\ +step=74000 test_ppl=58.24\ +step=75000 test_ppl=58.12\ +step=76000 test_ppl=58.15\ +step=77000 test_ppl=58.29\ +step=78000 test_ppl=58.36\ +step=79000 test_ppl=58.50\ +step=80000 test_ppl=58.43\ +step=81000 test_ppl=58.72\ +step=82000 test_ppl=58.52\ +step=82429 test_ppl=58.64 + +- #### NPU +#### test +epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ +epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ +valid_ppl=389.49\ +epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ +epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ +valid_ppl=298.25\ +epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ +epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ +epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ +valid_ppl=236.61\ +epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ +epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ +valid_ppl=252.75\ +epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ +epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ +valid_ppl=197.03\ +epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ +epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ +epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ +valid_ppl=191.64\ +epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ +epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ +valid_ppl=200.02\ +epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ +epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ +valid_ppl=201.46\ +epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 
mins=83.78-min/step=0.0243\ +epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ +epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ +valid_ppl=175.82\ +epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ +epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ +valid_ppl=209.94\ +epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ +epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ +epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ +valid_ppl=167.50\ +...\ +valid_ppl=112.40\ +epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ +epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ +valid_ppl=113.40\ +epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ +epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ +epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ +valid_ppl=113.22\ +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py new file mode 100644 index 000000000..4291e14e4 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py @@ -0,0 +1,36 @@ +from npu_bridge.npu_init import * +import os +import sys, getopt + + +def main(argv): + # print(argv) + # argv_ = ['-t', 'search'] + runType = "" + try: + opts, args = getopt.getopt(argv, "ht:", ["trun="]) + except getopt.GetoptError: + print("getopt.GetoptError!!") + print("useage: (sudo) python(3) pythonFileName.py -t ") + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print("useage: pythonFileName.py -t ") + sys.exit() + elif opt in ("-t", "--trun"): + runType = arg + if runType == "search": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/search --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + elif runType == "test-npu": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/test --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + # os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc = '0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=$(pwd)/output/test --data_path=$(pwd)/ptb/ptb.pkl") + # print("this part is writing...") + # pass + else: + print("This runType is invaild!!!") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py new 
file mode 100644 index 000000000..eaf741434 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py @@ -0,0 +1,4 @@ +from npu_bridge.npu_init import * +import os + +os.system("bash /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.sh") \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py new file mode 100644 index 000000000..c4532fa3f --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This is the boot file for ModelArts platform. +Firstly, the train datasets are copyed from obs to ModelArts. +Then, the string of train shell command is concated and using 'os.system()' to execute +""" +import os +import time +import numpy as np +import argparse +from help_modelarts import obs_data2modelarts +# import moxing as mox +print(os.system('env')) +print(os.system("python3 --version")) +#print(os.system("pip install dlib")) +print("===>>>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh") +os.environ['ASCEND_GLOBAL_LOG_LEVEL'] = '4' + +#data_dir = "/root/.keras/models/" +if __name__ == '__main__': + ## Note: the code dir is not the same as work dir on ModelArts Platform!!! + code_dir = '.' 
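+ # The remainder of this block parses --train_url / --data_url / --ckp_path,
+ # builds the command "bash test-npu.sh <code_dir> <data_url> <train_url> <ckp_path>"
+ # and runs it with os.system() at the bottom of this file, handing control to
+ # that shell script for the actual ModelArts run.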
+ work_dir = os.getcwd() + print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir)) + output_path = "./output/test/" + str(time.strftime('%Y%m%d_%H%M%S')) + parser = argparse.ArgumentParser() + parser.add_argument("--train_url", type=str, default=output_path) + parser.add_argument("--data_url", type=str, default="./ptb") + parser.add_argument("--ckp_path", type=str, default="./output/test/20220715_182127/") + # parser.add_argument("--ckp_path", type=str, default="obs://rstg/workplace_ENAS/lm-train/MA-new-enas-05-23-19-34/output/result/") + # parser.add_argument("--modelarts_data_dir", type=str, default="/cache/ptb-dataset") + # parser.add_argument("--modelarts_result_dir", type=str, default="/cache/result") + config = parser.parse_args() + #if not os.path.exists(data_dir): + # os.makedirs(data_dir) + # print("=nvvvvvvvvvvvvvfdsfdsfdvnn") + + #os.system("pip install -i http://repo.myhuaweicloud.com/repository/pypi/simple pexpect==4.2.1") + #os.system("pip install torch") + #os.system("pip install absl-py") + print("--------config---------hhhhhhhhhhhggggggggggggggggkkkkkkkkkkkkkkkkkkkkkkkkkgg-") + for k in list(vars(config).keys()): + print("key:{}: value:{}".format(k, vars(config)[k])) + print("--------config----------") + + ## copy dataset from obs to modelarts + # obs_data2modelarts(config) + # ret = mox.file.exists('obs://rstg/MA-new-p/') + # retm = mox.file.make_dirs('obs://rstg/MA-new-p/') + # print("bbbbbbbbbbbbbbbbbbbbbbbbb ",retm) + # print("config.modelarts_result_dir ", config.modelarts_result_dir) + ## start to train on Modelarts platform + # if not os.path.exists(config.modelarts_result_dir): + # os.makedirs(config.modelarts_result_dir) + # print("6666666666666666666666666666666666666666 ", config.modelarts_result_dir) + bash_header = os.path.join(code_dir, 'test-npu.sh') + # bash_header = os.path.join(code_dir, 'search.sh') + arg_url = '%s %s %s %s' % (code_dir, config.data_url, config.train_url, config.ckp_path) + bash_command = 'bash %s %s' % (bash_header, arg_url) + print("bash command:", bash_command) + os.system(bash_command) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py new file mode 100644 index 000000000..09b6d878d --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py @@ -0,0 +1,440 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
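+# child.py: the weight-sharing child language model used during architecture
+# search.  The controller's sampled architecture (sample_arc) decides, for each
+# layer built in _rnn_fn, which earlier layer it reads from and which
+# activation it applies.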
+ +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + + +import numpy as np +import tensorflow.compat.v1 as tf +import tensorflow.keras as keras + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + + +flags = tf.app.flags +FLAGS = flags.FLAGS + + +flags.DEFINE_integer('child_batch_size', 128, '') +flags.DEFINE_integer('child_bptt_steps', 35, '') +flags.DEFINE_integer('num_train_epochs', 600, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(sample_arc, x, prev_s, w_prev, w_skip, input_mask, layer_mask, + params): + """Multi-layer LSTM. + + Args: + sample_arc: [num_layers * 2], sequence of tokens representing architecture. + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: `[batch_size, hidden_size]`. + layer_mask: `[batch_size, hidden_size]`. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. + """ + batch_size = params.batch_size + # num_steps = 35 + num_steps = tf.shape(x)[1] + print("num_steps:{}/{}".format(num_steps, num_steps)) + + num_layers = len(sample_arc) // 2 + set_shape = x.get_shape().as_list() + print("set_shape:{}".format(set_shape)) + # 修改点 + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=True) + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + # extract the relevant variables, so that you only do L2-reg on them. 
+ u_skip = [] + start_idx = 0 + + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + u_skip.append(w_skip[layer_id][func_idx, prev_idx]) + start_idx += 2 + w_skip = u_skip + var_s = [w_prev] + w_skip[1:] + + def _select_function(h, function_id): + h = tf.stack([tf.tanh(h), tf.nn.relu(h), tf.sigmoid(h), h], axis=0) + h = h[function_id] + return h + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s): + """Body function.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + + # important change: first input uses a tanh() + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul(tf.concat([inp * input_mask, prev_s * layer_mask], + axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("ht:{}".format(ht)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + # print("layer:{}".format(layers)) + + start_idx = 0 + used = [] + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + # print("layer_id/[prev_idx, func_idx]:{}/[{}, {}]".format(layer_id, prev_idx, func_idx)) + used.append(tf.one_hot(prev_idx, depth=num_layers, dtype=tf.int32)) + prev_s = tf.stack(layers, axis=0)[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("s before set_shape:{}".format(s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("step:{}\nnext_s:{}".format(step, next_s)) + # all_s = all_s.write(step, next_s) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s = tf.concat([all_s, t], 0) + # print("step:{}-all_s:{}".format(step, all_s)) + # all_s_my[step] = next_s + + return step + 1, next_s, all_s + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # all_s_my.set_shape([_, batch_size, params.hidden_size]) + # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s, var_s + + +def _set_default_params(params): + """Set default hyper-parameters.""" + params.add_hparam('alpha', 0.0) # activation L2 reg + params.add_hparam('beta', 1.) 
# activation slowness reg + params.add_hparam('best_valid_ppl_threshold', 5) + + params.add_hparam('batch_size', FLAGS.child_batch_size) + params.add_hparam('bptt_steps', FLAGS.child_bptt_steps) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.20) # embeddings + params.add_hparam('drop_x', 0.75) # input to RNN cells + params.add_hparam('drop_l', 0.25) # between layers + params.add_hparam('drop_o', 0.75) # output + params.add_hparam('drop_w', 0.00) # weight + + params.add_hparam('grad_bound', 0.1) + params.add_hparam('hidden_size', 200) + params.add_hparam('init_range', 0.04) + params.add_hparam('learning_rate', 20.) + params.add_hparam('num_train_epochs', FLAGS.num_train_epochs) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 8e-7) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, controller, x_train, x_valid, name='child'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.controller = controller + self.sample_arc = tf.unstack(controller.sample_arc) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer(x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam('num_train_steps', self.num_train_batches * params.num_train_epochs) + # self.x_train.set_shape([params.batch_size, self.base_bptt]) + # print("self.x_train:{}".format(self.x_train.get_shape().as_list())) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer(x_valid, params.batch_size, params.bptt_steps) + # with tf.control_dependencies([self.base_bptt]): + self._build_params() + self._build_train() + self._build_valid() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + num_functions = self.params.controller_num_functions + num_layers = self.params.controller_num_layers + hidden_size = self.params.hidden_size + # >>> add code >>> + with tf.variable_scope(self.name, initializer=initializer): + # >>> add code >>> + with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): + w_emb = tf.get_variable('w', [self.params.vocab_size, hidden_size]) + # >>> add code >>> + # 修改点 + dropped_w_emb = npu_ops.dropout(w_emb, 1-self.params.drop_e) + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + with tf.variable_scope('rnn_cell', reuse=tf.AUTO_REUSE): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + w_skip, dropped_w_skip = [], [] + for layer_id in range(1, num_layers + 1): + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable( + 'w', [num_functions, layer_id, hidden_size, 2 * hidden_size]) + mask = _gen_mask([1, 1, hidden_size, 2 * hidden_size], + self.params.drop_w) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + with tf.variable_scope('init_states', reuse=tf.AUTO_REUSE): + with 
tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + + self.num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) # .value + print('All children have {} params'.format(self.num_params)) + + num_params_per_child = 0 + for v in tf.trainable_variables(): + if v.name.startswith(self.name): + if 'rnn_cell' in v.name: + num_params_per_child += v.shape[-2] * v.shape[-1] + else: + num_params_per_child += np.prod([d for d in v.shape]) + print('Each child has {0} params'.format(num_params_per_child)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + # bug点 + # + print("before [embedding_lookup], x={}".format(x)) + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + sample_arc = self.sample_arc + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + # >>> add code >>> + # 修改点 + # emb = tf.layers.dropout( + # emb, self.params.drop_i, [batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s, var_s = _rnn_fn(sample_arc, emb, prev_s, w_prev, w_skip, + input_mask, layer_mask, params=self.params) + + top_s = all_s + if is_training: + # >>> add code >>> + # 修改点 + + top_s = npu_ops.dropout(top_s, 1-self.params.drop_o) # ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + # top_s = tf.layers.dropout( + # top_s, self.params.drop_o, + # [self.params.batch_size, 1, self.params.hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + top_s_shape = top_s.get_shape().as_list() + # print("top_s_shape:{}".format(top_s_shape)) + # print("w_soft:{}".format(w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # logits = tf.matmul(top_s, tf.transpose(w_soft)) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + # print("loss:{}".format(loss)) + loss = tf.reduce_mean(loss) + # print("_forward/loss:{}".format(loss)) + reg_loss = loss # `loss + regularization_terms` is for training only + if is_training: + # L2 weight reg + self.l2_reg_loss = tf.add_n([tf.nn.l2_loss(w ** 2) for w in var_s]) + reg_loss += self.params.weight_decay * self.l2_reg_loss + + # activation L2 reg + 
reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + # activation slowness reg + reg_loss += self.params.beta * tf.reduce_mean( + (all_s[:, 1:, :] - all_s[:, :-1, :]) ** 2) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = [v for v in tf.trainable_variables() + if v.name.startswith(self.name)] + # print("reg_loss:{}".format(reg_loss)) + # print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + if self.params.grad_bound: + # grads = tf.gradients(reg_loss, tf_vars) + # clipped_grads, _ = tf.clip_by_global_norm(grads, self.params.grad_bound) + # clipped_grads, grad_norm = tf.clip_by_global_norm(grads, self.params.grad_bound) + # print("clipped_grads:{}".format(clipped_grads)) + + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + # print("optimizer:{}".format(optimizer)) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + # print("train_op:{}".format(train_op)) + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + self.rl_loss = loss + + def eval_valid(self, sess): + + """Eval 1 round on valid set.""" + total_loss = 0 + for _ in range(self.num_valid_batches): + sess.run(self.batch_init_states['reset']) + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + + return valid_ppl diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py new file mode 100644 index 000000000..2367ea1f0 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- +import tensorflow.compat.v1 as tf + +# from create_tf_record import * +from tensorflow.python.framework import graph_util +from tensorflow.python.tools import freeze_graph + +from npu_bridge.npu_init import * + +def freeze_graph(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "output" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) # 恢复图并得到数据 + output_graph_def = 
graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=sess.graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in sess.graph.get_operations(): + # print(op.name, op.values()) + + +def freeze_graph2(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "InceptionV3/Logits/SpatialSqueeze" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + graph = tf.get_default_graph() # 获得默认的图 + input_graph_def = graph.as_graph_def() # 返回一个序列化的图代表当前的图 + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) # 恢复图并得到数据 + output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=input_graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in graph.get_operations(): + # print(op.name, op.values()) + + +if __name__ == '__main__': + # 输入ckpt模型路径 + input_checkpoint = './output/test/20220709_185707/model.ckpt-181200' + # 输出pb模型的路径 + out_pb_path = "models_pb/enas-lm-infer2.pb" + # 调用freeze_graph将ckpt转为pb + freeze_graph(input_checkpoint, out_pb_path) + print("Done pb!") + + # 测试pb模型 + image_path = 'test_image/animal.jpg' + # freeze_graph_test(pb_path=out_pb_path, image_path=image_path) + + diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py new file mode 100644 index 000000000..cb13d49ab --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
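+# The controller is a one-layer LSTM sampler: for each of the
+# controller_num_layers child layers it first samples a skip index (which
+# earlier layer to connect to, scored with the attention parameters) and then
+# an activation id from w_emb.  build_trainer() wires it to a child model and
+# trains it with REINFORCE, using reward = 80 / valid_ppl plus an entropy
+# bonus against an exponentially-decayed baseline.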
+ +"""ENAS controller.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_float('controller_baseline_dec', 0.999, '') +flags.DEFINE_float('controller_entropy_weight', 1e-5, '') +flags.DEFINE_float('controller_temperature', 5., '') +flags.DEFINE_float('controller_tanh_constant', 2.25, '') +flags.DEFINE_float('controller_learning_rate', 5e-5, '') +flags.DEFINE_integer('controller_num_layers', 9, '') + +REWARD_CONSTANT = 80.0 + + +def _build_train_op(loss, tf_vars, learning_rate, train_step, num_aggregate): + """Build training ops from `loss` tensor.""" + optim = tf.train.AdamOptimizer(learning_rate) + optim = tf.train.SyncReplicasOptimizer( + optim, replicas_to_aggregate=num_aggregate, total_num_replicas=1, use_locking=True) + grads = tf.gradients(loss, tf_vars) + train_op = optim.apply_gradients(zip(grads, tf_vars), global_step=train_step) + grad_norm = tf.global_norm(grads) + return train_op, optim, grad_norm + + +def _lstm(x, prev_c, prev_h, w_lstm): + """LSTM subgraph.""" + ifog = tf.matmul(tf.concat([x, prev_h], axis=1), w_lstm) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + next_c = i * g + f * prev_c + next_h = o * tf.tanh(next_c) + return next_c, next_h + + +def _set_default_params(params): + """Add controller's default params.""" + params.add_hparam('controller_hidden_size', 64) + params.add_hparam('controller_num_layers', FLAGS.controller_num_layers) + params.add_hparam('controller_num_functions', 4) # tanh, relu, sigmoid, iden + + params.add_hparam('controller_baseline_dec', FLAGS.controller_baseline_dec) + params.add_hparam('controller_entropy_weight', + FLAGS.controller_entropy_weight) + params.add_hparam('controller_temperature', FLAGS.controller_temperature) + params.add_hparam('controller_tanh_constant', FLAGS.controller_tanh_constant) + params.add_hparam('controller_learning_rate', FLAGS.controller_learning_rate) + params.add_hparam('controller_num_aggregate', 10) + params.add_hparam('controller_num_train_steps', 25) + + return params + + +class Controller(object): + """ENAS controller. 
Samples architectures and creates training ops.""" + + def __init__(self, params, name='controller'): + print('-' * 80) + print('Create a controller') + self.params = _set_default_params(params) + self.name = name + self._build_params() + self._build_sampler() + + def _build_params(self): + """Create TF parameters.""" + initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) + num_funcs = self.params.controller_num_functions # 4 + hidden_size = self.params.controller_hidden_size # 64 + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('lstm'): + self.w_lstm = tf.get_variable('w', [2 * hidden_size, 4 * hidden_size]) + + with tf.variable_scope('embedding'): + self.g_emb = tf.get_variable('g', [1, hidden_size]) + self.w_emb = tf.get_variable('w', [num_funcs, hidden_size]) + + with tf.variable_scope('attention'): + self.attn_w_1 = tf.get_variable('w_1', [hidden_size, hidden_size]) + self.attn_w_2 = tf.get_variable('w_2', [hidden_size, hidden_size]) + self.attn_v = tf.get_variable('v', [hidden_size, 1]) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) + print('Controller has {0} params'.format(num_params)) + + def _build_sampler(self): + """Build the sampler ops and the log_prob ops.""" + hidden_size = self.params.controller_hidden_size + num_layers = self.params.controller_num_layers + + arc_seq = [] + sample_log_probs = [] + sample_entropy = [] + all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)] + all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)] + + # sampler ops + inputs = self.g_emb # ??? + prev_c = tf.zeros([1, hidden_size], dtype=tf.float32) + prev_h = tf.zeros([1, hidden_size], dtype=tf.float32) + + inputs = self.g_emb + for layer_id in range(1, num_layers + 1): + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + all_h.append(next_h) + all_h_w.append(tf.matmul(next_h, self.attn_w_1)) + + query = tf.matmul(next_h, self.attn_w_2) + query = query + tf.concat(all_h_w[:-1], axis=0) + query = tf.tanh(query) + logits = tf.matmul(query, self.attn_v) + logits = tf.reshape(logits, [1, layer_id]) + + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + diff = tf.cast(layer_id - tf.range(0, layer_id), tf.float32) ** 2 + logits -= tf.reshape(diff, [1, layer_id]) / 6.0 + skip_index = tf.random.categorical(logits, 1) + skip_index = tf.cast(skip_index, tf.int32) + skip_index = tf.reshape(skip_index, [1]) + arc_seq.append(skip_index) + + log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=skip_index) + sample_log_probs.append(log_prob) + + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + + inputs = tf.nn.embedding_lookup( + tf.concat(all_h[:-1], axis=0), skip_index) + inputs /= (0.1 + tf.to_float(layer_id - skip_index)) + + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + logits = tf.matmul(next_h, self.w_emb, transpose_b=True) + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + func = tf.multinomial(logits, 1) + func = tf.to_int32(func) + func = tf.reshape(func, [1]) + arc_seq.append(func) + log_prob = 
tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=func) + sample_log_probs.append(log_prob) + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + inputs = tf.nn.embedding_lookup(self.w_emb, func) + + arc_seq = tf.concat(arc_seq, axis=0) + self.sample_arc = arc_seq + + self.sample_log_probs = tf.concat(sample_log_probs, axis=0) + self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs)) + + sample_entropy = tf.concat(sample_entropy, axis=0) + self.sample_entropy = tf.reduce_sum(sample_entropy) + + self.all_h = all_h + + def build_trainer(self, child_model): + """Build the train ops by connecting Controller with a Child.""" + # actor + self.valid_loss = tf.to_float(child_model.rl_loss) + self.valid_loss = tf.stop_gradient(self.valid_loss) + self.valid_ppl = tf.exp(self.valid_loss) + self.reward = REWARD_CONSTANT / self.valid_ppl + + if self.params.controller_entropy_weight: + self.reward += self.params.controller_entropy_weight * self.sample_entropy + + # or baseline + self.sample_log_probs = tf.reduce_sum(self.sample_log_probs) + self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) + baseline_update = tf.assign_sub(self.baseline, + ((1 - self.params.controller_baseline_dec) * + (self.baseline - self.reward))) + + with tf.control_dependencies([baseline_update]): + self.reward = tf.identity(self.reward) + self.loss = self.sample_log_probs * (self.reward - self.baseline) + + self.train_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name='train_step') + tf_vars = [var for var in tf.trainable_variables() + if var.name.startswith(self.name)] + + self.train_op, self.optimizer, self.grad_norm = _build_train_op( + loss=self.loss, + tf_vars=tf_vars, + learning_rate=self.params.controller_learning_rate, + train_step=self.train_step, + num_aggregate=self.params.controller_num_aggregate) + + def train(self, sess, reset_op, log_every=10): + """Train the controller for `num_steps`.""" + print('-' * 80) + print('Training controller') + num_steps = (self.params.controller_num_aggregate * + self.params.controller_num_train_steps) + run_ops = [self.sample_arc, + self.sample_entropy, + self.reward, + self.baseline, + self.train_op] + + for step in range(num_steps): + arc, ent, reward, baseline, _ = sess.run(run_ops) + sess.run(reset_op) + if step % log_every == 0: + log_string = 'step={0:<5d}'.format(step) + log_string += ' ent={0:<7.3f}'.format(ent) + log_string += ' ppl={0:<7.2f}'.format(REWARD_CONSTANT / reward) + log_string += ' rw={0:<7.4f}'.format(reward) + log_string += ' bl={0:<7.4f}'.format(baseline) + log_string += ' arc=[{0}]'.format(' '.join([str(v) for v in arc])) + print(log_string) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py new file mode 100644 index 000000000..6d767073c --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Load picked Penn Treebank data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + + +def input_producer(raw_data, batch_size, num_steps, shuffle=False, + randomize=False, random_len=False): + """Produces graph-based input for Penn Treebank. + + Args: + raw_data: np tensor of size [num_words]. + batch_size: self-explained. + num_steps: number of BPTT steps. + shuffle: whether to shuffle sentences. + randomize: use random segments instead of the continuous corpus. + random_len: random sequence len. + + Returns: + If `random_len` is set, return op that represents whether we have reached + the end of a sequence. + Otherwise, return number of batches in an epoch. + """ + print("raw_data_size:{}".format(np.size(raw_data))) + print("num_steps:{}".format(num_steps)) + batch_len = np.size(raw_data) // batch_size + num_batches_per_epoch = ((np.size(raw_data) // batch_size) - 1) // num_steps + print("num_batches_per_epoch:{}".format(num_batches_per_epoch)) + raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32) + + # data_len = tf.size(raw_data) + + + print("batch_len:{}".format(batch_len)) + data = tf.reshape(raw_data[0: batch_size * batch_len], + [batch_size, batch_len]) + + epoch_size = (batch_len - 1) // num_steps + with tf.device('/cpu:0'): + epoch_size = tf.identity(epoch_size, name='epoch_size') + + if random_len: + start_idx = tf.Variable(0, name='start_idx', dtype=tf.int32,trainable=False) + # start_idx = 0 + base_bptt = tf.cond( + tf.random_uniform(shape=(), minval=0., maxval=1.) < 0.95, + lambda: tf.cast(num_steps, dtype=tf.float32), + lambda: tf.cast(num_steps, dtype=tf.float32) / 2.) 
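+ # Variable-length BPTT: with probability 0.95 base_bptt is the full
+ # num_steps, otherwise half of it; seq_len is then drawn from a truncated
+ # normal centred on base_bptt (stddev 5) and clipped so it never exceeds
+ # num_steps + 20 nor runs past the end of the batch.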
+ # base_bptt = int(tf.cond( + # tf.greater_equal(0.95, np.random.uniform(100)/100), + # lambda:num_steps / 1., + # lambda:num_steps / 2.).item()) + # base_bptt = 35 + seq_len = tf.random.truncated_normal(shape=(), mean=base_bptt, stddev=5., + dtype=tf.float32) + # seq_len = int(np.random.normal(num_steps, 5)) + # seq_len = 35 + seq_len = tf.cast(seq_len, dtype=tf.int32) + seq_len = tf.minimum(seq_len, num_steps + 20) # seq_len <= bptt + 40 + seq_len = tf.minimum(seq_len, batch_len - start_idx - 1) + + # seq_len = tf.cond(tf.greater_equal(seq_len, num_steps + 20), lambda: num_steps + 20, lambda: seq_len).item() + # seq_len = tf.cond(tf.greater_equal(seq_len, int(batch_len - start_idx - 1)), lambda: int(batch_len - start_idx - 1), lambda: seq_len).item() + # seq_len = min(seq_len, num_steps + 20, batch_len - start_idx - 1) + print("seq_len:{}, type:{}".format(seq_len, type(seq_len))) + + end_idx = start_idx + seq_len + + x = data[:, start_idx: end_idx] + # x = tf.reshape(x, [batch_size, seq_len]) + # print("xshape:{}".format(x.get_shape().as_list())) + y = data[:, start_idx + 1: end_idx + 1] + # y = tf.reshape(y, [batch_size, seq_len]) + # print("yshape:{}".format(y.get_shape().as_list())) + + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx, end_idx)]): + should_reset = tf.greater_equal(end_idx, batch_len - 3) + reset_start_idx = tf.assign(start_idx, 0) + # reset_start_idx = tf.assign(tf.Variable(start_idx, name='reset_start_idx', dtype=tf.int32, trainable=False), 0) + return (x, y, num_batches_per_epoch, reset_start_idx, should_reset, + base_bptt, seq_len / batch_len) + + if randomize: + i = tf.random_uniform([1], minval=0, maxval=batch_len - num_steps,dtype=tf.int32)[0] + x = tf.strided_slice(data, [0, i], [batch_size, i + num_steps]) + y = tf.strided_slice(data, [0, i + 1], [batch_size, i + num_steps + 1]) + else: + # """ + # 修改点 + start_idx_eval = tf.Variable(0, name='start_idx', dtype=tf.int32, + trainable=False) + seq_len = num_steps + seq_len = tf.cast(seq_len, dtype=tf.int32) + end_idx = start_idx_eval + seq_len + x = data[:, start_idx_eval: end_idx] + y = data[:, start_idx_eval + 1: end_idx + 1] + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx_eval, end_idx)]): + should_reset_eval = tf.greater_equal(end_idx, batch_len - num_steps - 3) + reset_start_idx_eval = tf.assign(start_idx_eval, 0) + x.set_shape([batch_size, num_steps]) + y.set_shape([batch_size, num_steps]) + + return x, y, num_batches_per_epoch, reset_start_idx_eval, should_reset_eval diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py new file mode 100644 index 000000000..51ed715f5 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
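+# fixed.py re-trains and evaluates a single architecture produced by the
+# search.  The architecture string is supplied via the --fixed_arc flag used
+# in RUN_SEARCH.py and the README, and the graph itself is built by
+# fixed_lib.LM in get_ops() below.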
+ +"""Entry point for AWD ENAS with a fixed architecture.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from tensorflow.python.tools import freeze_graph + +import os +import pickle +import sys + +# TODO:change path +# sys.path.append("/home/test_user06/AscendZhongzhi_NJU/") +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +import fixed_lib +import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +## Required parameters +subfolder = str(time.strftime('%Y%m%d_%H%M%S')) +flags.DEFINE_string('output_dir', "./output/infer0/" + subfolder, '') +flags.DEFINE_string('data_path', './ptb/ptb.pkl', '') +flags.DEFINE_string("ckp_path", '', "checkpoint path") + +## Other parametersresult +flags.DEFINE_boolean('reload_model', True, '') +flags.DEFINE_boolean('reset_output_dir', True, '') +flags.DEFINE_boolean('is_training', False, '') +flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") + +flags.DEFINE_integer('log_every', 100, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + lm = fixed_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.do_infer, + 'bptt_rate': lm.bptt_rate, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + 'end_moving_avg': lm.end_moving_avg_op, + 'reset_avg': lm.restart_avg, + 'set_lr_decay': lm.set_lr_decay, + 'reset_start_idx_eval': lm.reset_start_idx_eval, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def load_ckpt_model(sess, save_path): + print("reload model from:{}".format(save_path)) + checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 + input_checkpoint = checkpoint.model_checkpoint_path + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 + saver.restore(sess, input_checkpoint) # 使用最新模型 + sess.run(tf.global_variables_initializer())# 初始化所有变量 + + +def train(params, is_training=True): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + tf.random.set_random_seed(2126) + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=2) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + + # >>> add code >> + # 
Create the session; the block below configures the NpuOptimizer so training runs on the Ascend AI processor.
+    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
+    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True  # run training on the Ascend AI processor
+    custom_op.parameter_map["mix_compile_mode"].b = False  # mixed compile mode; off by default, enable only if needed
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")  # enable mixed precision
+    custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg")
+    # # custom_op.parameter_map["enable_data_pre_proc"].b = True  # sinking the getnext op is a prerequisite for loop sinking
+    # # custom_op.parameter_map[
+    # #     "iterations_per_loop"].i = 10  # keep consistent with the value passed to set_iteration_per_loop; used to decide whether training iterations are sunk to the device
+    # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump/")
+    # custom_op.parameter_map["enable_dump_debug"].b = True
+    # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
+    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # must be disabled explicitly
+    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF  # must be disabled explicitly
+    # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir)
+    # >>> add code >>
+
+
+    # config = tf.ConfigProto()
+    # config.gpu_options.allow_growth = True
+    sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks,
+                                             checkpoint_dir=params.output_dir)
+    # reload model
+    if params.ckp_path != "" and FLAGS.reload_model:
+      last_checkpoint = tf.train.latest_checkpoint(params.ckp_path)
+      print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+      saver.restore(sess, last_checkpoint)
+
+    accum_loss = 0.
+    accum_step = 0
+    epoch = sess.run(ops['global_step']) // params.num_train_batches
+    best_valid_ppl = []
+    accum_rate = 0.
+    start_time = time.time()
+    last_min = (time.time() - start_time) / 60
+    cleaned = True
+    print('Starting moving_avg')
+    sess.run(ops['start_moving_avg'])
+    avg_flag = "no_null"
+    while True and is_training:
+      try:
+        loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops)
+        # bptt_rate = sess.run(ops['bptt_rate'])
+        # accum_rate += bptt_rate
+
+        accum_loss += loss
+        accum_step += 1
+        step = sess.run(ops['global_step'])
+        if step % params.log_every == 0:
+          # epoch = step // params.num_train_batches
+          train_ppl = np.exp(accum_loss / accum_step)
+          mins_so_far = (time.time() - start_time) / 60.
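+          # Wall-clock minutes elapsed since the previous log line; used below to report minutes per step.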
+          min_pices = mins_so_far - last_min
+          last_min = mins_so_far
+          log_string = 'epoch={0:<5d}'.format(epoch)
+          log_string += ' step={0}/{1:<6d}'.format(step, params.num_train_steps)
+          log_string += ' ppl={0:<10.2f}'.format(train_ppl)
+          log_string += ' lr={0:<6.3f}'.format(lr)
+          log_string += ' |g|={0:<6.3f}'.format(gn)
+          log_string += ' avg={0:<2d}'.format(moving_avg_started)
+          log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, min_pices/params.log_every)
+          # log_string += ' accum_rate(rate of a epoch)={0:<4.6f}'.format(accum_rate)
+          # log_string += ' should_reset:{}'.format(should_reset)
+          print(log_string)
+
+        if moving_avg_started:
+          if avg_flag == "":
+            sess.run(ops['end_moving_avg'])
+            sess.run(ops['reset_avg'])
+            avg_flag = "restart_avg"
+          else:
+            sess.run(ops['update_moving_avg'])
+        # ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+
+
+        if step <= (300 * params.num_train_batches):
+          if step % (10 * params.num_train_batches) == 0:
+            print('Start learning decay ...')
+            sess.run(ops['set_lr_decay'])
+        if moving_avg_started and (step + 5) % (10 * params.num_train_batches) == 0 and len(best_valid_ppl) > params.best_valid_ppl_threshold and valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold]):
+          print('Start learning decay ...')
+          sess.run(ops['set_lr_decay'])
+        if should_reset:
+          accum_rate = 0.
+          print("should_reset:{}".format(should_reset))
+          sess.run(ops['reset_batch_states'])
+          epoch += 1
+          accum_loss = 0
+          accum_step = 0
+          valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+          # reset the validation-set start index
+          sess.run(ops['reset_start_idx_eval'])
+          # reset the training-set batch state and start index
+          sess.run([ops['reset_batch_states'], ops['reset_start_idx']])
+          # Note: once the current ppl is no longer among the best `best_valid_ppl_threshold` values,
+          # switch to the moving-average (shadow) weights.
+          if (not moving_avg_started and
+              len(best_valid_ppl) > params.best_valid_ppl_threshold and
+              valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])
+          ):
+            print('Starting moving_avg')
+            sess.run(ops['start_moving_avg'])
+            # print('Start learning decay ...')
+            # sess.run(ops['set_lr_decay'])
+
+          if valid_ppl > 15.:
+            best_valid_ppl.append(valid_ppl)
+          if not cleaned:
+            best_valid_ppl = [p for p in best_valid_ppl if p < 40.]
+            cleaned = True
+          # ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+          if step % (1 * params.num_train_batches) == 0:
+            test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+            print("test_ppl:{}".format(test_ppl))
+            sess.run(ops['reset_start_idx_eval'])
+        if step >= params.num_train_steps:
+          # inference
+          test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+          print("final_test_ppl:{}".format(test_ppl))
+          break
+      except tf.errors.InvalidArgumentError:
+        last_checkpoint = tf.train.latest_checkpoint(params.output_dir)
+        print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+        saver.restore(sess, last_checkpoint)
+        accum_loss, accum_step = 0., 0
+    if not is_training:
+      moving_avg_started = sess.run(ops['moving_avg_started'])
+      test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+      sess.close()
+      # infer_loss = ops['inference']()
+      with tf.Session() as sess:
+        print("test_ppl:{}".format(test_ppl))
+        # Save the graph definition as ./models_pb/model3.pb;
+        # it is passed as input_graph to the freeze_graph call below.
+        tf.train.write_graph(sess.graph_def, './models_pb', 'model3.pb')  # generate the graph file with write_graph
+        freeze_graph.freeze_graph(
+          input_graph='./models_pb/model3.pb',  # graph file produced by write_graph above
+          input_saver='',
+          input_binary=False,
+          input_checkpoint=params.ckp_path+'model.ckpt-906',  # checkpoint produced by training
+          output_node_names='output',  # must match the output node defined in the inference graph
+          restore_op_name='save/restore_all',
+          filename_tensor_name='save/Const:0',
+          output_graph='./models_pb/enas_lm3.pb',  # name of the frozen inference graph to generate
+          clear_devices=False,
+          initializer_nodes='')
+        print("done pb!")
+    else:
+      sess.close()
+    """
+    if not is_training:
+        return infer_loss
+    else:
+        return -1
+    """
+
+def main(unused_args):
+  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.logging.info("**********")
+  print("===>>>data_path:{}".format(FLAGS.data_path))
+  print("===>>>output_dir:{}".format(FLAGS.output_dir))
+  print("===>>>ckp_path:{}".format(FLAGS.ckp_path))
+
+  print('-' * 80)
+  output_dir = FLAGS.output_dir
+
+  print('-' * 80)
+  if not gfile.IsDirectory(output_dir):
+    print('Path {} does not exist. Creating'.format(output_dir))
+    gfile.MakeDirs(output_dir)
+  elif FLAGS.reset_output_dir:
+    print('Path {} exists. Resetting'.format(output_dir))
+    gfile.DeleteRecursively(output_dir)
+    gfile.MakeDirs(output_dir)
+
+  print('-' * 80)
+  log_file = os.path.join(output_dir, 'stdout')
+  print('Logging to {}'.format(log_file))
+  sys.stdout = utils.Logger(log_file)
+
+  params = contrib_training.HParams(
+      data_path=FLAGS.data_path,
+      log_every=FLAGS.log_every,
+      output_dir=FLAGS.output_dir,
+      ckp_path=FLAGS.ckp_path,
+  )
+
+  train(params, is_training=FLAGS.is_training)
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py
new file mode 100644
index 000000000..49659f706
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py
@@ -0,0 +1,652 @@
+# coding=utf-8
+# Copyright 2021 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +import data_utils +import utils + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string('fixed_arc', '0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0', '') +flags.DEFINE_float('child_alpha', 0.7, 'activation L2 reg') +flags.DEFINE_float('child_drop_e', 0.125, 'drop rate words') +flags.DEFINE_float('child_drop_i', 0.175, 'drop rate embeddings') +flags.DEFINE_float('child_drop_l', 0.225, 'drop rate between layers') +flags.DEFINE_float('child_drop_o', 0.75, 'drop rate output') +flags.DEFINE_float('child_drop_w', 0.00, 'drop rate weight') +flags.DEFINE_float('child_drop_x', 0.725, 'drop rate at input of RNN cells') +flags.DEFINE_float('child_init_range', 0.05, '') +flags.DEFINE_float('child_grad_bound', 0.25, '') +flags.DEFINE_float('child_weight_decay', 2e-6, '') +flags.DEFINE_integer('child_num_train_epochs', 2, '') +flags.DEFINE_integer('child_hidden_size', 800, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, minval=0., maxval=1., dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(x, prev_s, w_prev, w_skip, input_mask, layer_mask, params): + """Multi-layer LSTM. + + Args: + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: [batch_size, hidden_size]. + layer_mask: [batch_size, hidden_size]. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. 
+ """ + batch_size = x.get_shape()[0].value + print("batch_size:{}".format(batch_size)) + # batch_size = params.batch_size + num_steps = tf.shape(x)[1] + fixed_arc = params.fixed_arc + num_layers = len(fixed_arc) // 2 + set_shape = x.get_shape().as_list() + print("x.set_shape:{}".format(set_shape)) + + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + # all_s_my = [] + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s_my): + """Body fn for `tf.while_loop`.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul( + tf.concat([inp * input_mask, prev_s * layer_mask], axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("w_prev:{}".format(w_prev)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + + def _select_function(h, function_id): + if function_id == 0: + return tf.tanh(h) + elif function_id == 1: + return tf.nn.relu(h) + elif function_id == 2: + return tf.sigmoid(h) + elif function_id == 3: + return h + raise ValueError('Unknown func_idx {0}'.format(function_id)) + + start_idx = 0 + for layer_id in range(num_layers): + prev_idx = fixed_arc[start_idx] + func_idx = fixed_arc[start_idx + 1] + prev_s = layers[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("layers_id:{}\ns before set_shape:{}".format(layer_id, s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("next_s:{}".format(next_s)) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s_my = tf.concat([all_s_my, t], 0) + # print("all_s_my:{}".format(all_s_my)) + # all_s.append(next_s) + return step + 1, next_s, all_s_my + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + # >>> add code >>> + # all_s_my = tf.reshape(all_s_my, [set_shape[1]+1, set_shape[0], params.hidden_size]) + # print("all_s_my(list):{}".format(all_s_my)) + # tmp = all_s_my[1:, :, :] + # # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(tmp)) + # all_s = tf.transpose(tmp, perm=[1, 0, 2]) + # # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # all_s = tf.reshape(all_s, [set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s + + +def _set_default_params(params): + """Set default values for the hparams.""" + params.add_hparam('alpha', FLAGS.child_alpha) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 
10) + + params.add_hparam('batch_size', 64) + params.add_hparam('bptt_steps', 32) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', FLAGS.child_drop_e) # word + params.add_hparam('drop_i', FLAGS.child_drop_i) # embeddings + params.add_hparam('drop_l', FLAGS.child_drop_l) # between RNN nodes + params.add_hparam('drop_o', FLAGS.child_drop_o) # output + params.add_hparam('drop_w', FLAGS.child_drop_w) # weight + params.add_hparam('drop_x', FLAGS.child_drop_x) # input to RNN layers + + assert FLAGS.fixed_arc is not None + print(FLAGS.fixed_arc) + L_arc = FLAGS.fixed_arc.split(' ') + print("L_arc:{}".format(L_arc)) + params.add_hparam('fixed_arc', [int(d) for d in L_arc]) + + params.add_hparam('grad_bound', FLAGS.child_grad_bound) + params.add_hparam('hidden_size', FLAGS.child_hidden_size) + params.add_hparam('init_range', FLAGS.child_init_range) + params.add_hparam('learning_rate', 40.) + params.add_hparam('num_train_epochs', FLAGS.child_num_train_epochs) + params.add_hparam('num_warmup_epochs', 0.0) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', FLAGS.child_weight_decay) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('num_warmup_steps', + params.num_warmup_epochs * self.num_train_batches) + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + self._build_infer() + self._build_avg_infer() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('embedding'): + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.hidden_size], + initializer=initializer) + # >>> add code >>> + dropped_w_emb = npu_ops.dropout(w_emb, 1 - self.params.drop_e) + # >>> add code >>> + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + + hidden_size = self.params.hidden_size + fixed_arc = self.params.fixed_arc + num_layers = len(fixed_arc) // 2 + with tf.variable_scope('rnn_cell'): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + + w_skip, dropped_w_skip = [], [] + for layer_id in range(num_layers): + mask = _gen_mask([hidden_size, 2 * 
hidden_size], self.params.drop_w) + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable('w', [hidden_size, 2 * hidden_size]) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + + with tf.variable_scope('init_states'): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + with tf.variable_scope('test'): + init_shape = [1, hidden_size] + test_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = tf.zeros(init_shape, dtype=tf.float32) + test_reset = tf.assign(test_prev_s, zeros) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.test_init_states = { + 's': test_prev_s, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + + # >>> add code >>> + # emb = tf.layers.dropout( + # emb, self.params.drop_i, + # [self.params.batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s = _rnn_fn(emb, prev_s, w_prev, w_skip, input_mask, layer_mask, + self.params) + top_s = all_s + if is_training: + # >>> add code >>> + top_s = npu_ops.dropout(top_s, + 1 - self.params.drop_o)# ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + + # top_s = tf.layers.dropout(top_s, self.params.drop_o, + # [batch_size, 1, hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + # print("top_s:{}\nw_soft:{}".format(top_s, w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) + + reg_loss = loss # loss + regularization_terms, for training only + # print("_forward/loss:{}".format(loss)) + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + with 
tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss:{}\nloss:{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + # print("reg_loss:{}".format(reg_loss)) + print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + with tf.variable_scope('HParam'): + lr_decay = tf.get_variable('learning_rate_decay', [], initializer=tf.constant_initializer(1.), dtype=tf.float32, trainable=False) + self.set_lr_decay = tf.assign_sub(lr_decay, 0.02*lr_decay) + learning_rate = utils.get_lr(global_step, self.params, lr_decay) * lr_scale + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + # def _EMA(self): + # """Build moving average ops.""" + # print('Creating moving average ops') + # + # with tf.variable_scope('moving_avg_flag'): + # self.moving_avg_started = tf.get_variable( + # 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + # trainable=False) + # self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + # self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + # all_vars = tf.trainable_variables() + # + # ema = tf.train.ExponentialMovingAverage(0.99) + # + # average_op = ema.apply(all_vars) + # back_up_v = tf.identity(all_vars) + # use_average_op = tf.assign(all_vars, ema.average(all_vars)) + # ema.average_name() + # reverse_average_op = tf.assign(all_vars, back_up_v) + + + + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + + all_vars = tf.trainable_variables() + print('all_vars:{}'.format(all_vars)) + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + # 原作者手动实现的Moving Average ::当eval_valid_ppl退化到一定阈值(退步10名)后启动 + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], initializer=tf.constant_initializer(0.), dtype=tf.float32, trainable=False) + tmp1 = [] + 
tmp2 = [] + tmp3 = [] + self.restart_avg = tf.assign(avg_step, 0.) + with tf.control_dependencies([tf.assign_add(avg_step, 1.)]): + average_op = [] + for v, avg_v in average_pairs: + # v_curr = tf.Variable(tf.cast(tf.identity(v), tf.float32), dtype=tf.float32, trainable=False) + # avg_v_curr = tf.Variable(tf.cast(tf.identity(avg_v), tf.float32), dtype=tf.float32, trainable=False) + # mu = 1. / avg_step + mu = tf.cond(tf.cast(0.999 < (1. + avg_step) / (10. + avg_step), tf.bool), + lambda: tf.cast(tf.constant(0.99), dtype=tf.float32), + lambda: tf.cast((1. + avg_step) / (10. + avg_step), dtype=tf.float32)) + + new_avg = mu * tf.cast(avg_v, tf.float32) + (1. - mu) * tf.cast(v, tf.float32) + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, tf.cast(new_avg, avg_v.dtype))) + # 追踪变量 + tmp1.append(v) + tmp2.append(new_avg) + tmp3.append([avg_step, mu, tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(new_avg ** 2)]) + + self.p1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp1]) + self.p2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp2]) + self.p3 = tmp3 + # # 使用官方API + # with tf.variable_scope('avg_step'): + # avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + # + # ema = tf.train.ExponentialMovingAverage(0.99, avg_step) + # with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + # average_op = [] + # for v, avg_v in average_pairs: + # v = tf.Variable(tf.cast(v, tf.float32), dtype=tf.float32, trainable=False) + # avg_v = tf.Variable(tf.cast(avg_v, tf.float32), dtype=tf.float32, trainable=False) + # print('v:{}'.format(v)) + # ema.apply([v]) + # new_avg = ema.average(v) + # print('new_avg:{}'.format(new_avg)) + # with tf.control_dependencies([new_avg]): + # print('avg_v:'.format(avg_v)) + # average_op.append(tf.assign(avg_v, new_avg)) + # # average_op = tf.group(*average_op) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + + new_tmp1 = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + new_tmp1.append([tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(backup_v ** 2)]) + use_average_op.append(tf.assign(v, avg_v)) + self.p4 = new_tmp1 + + use_average_op = tf.group(*use_average_op) + # with tf.control_dependencies([use_average_op]): + self.p3_1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + reverse_average_op = [] + new_tmp2 = [] + for v, backup_v in backup_pairs: + # with tf.control_dependencies([use_average_op]): + new_tmp2.append([tf.reduce_sum(v ** 2), tf.reduce_sum(backup_v ** 2)]) + reverse_average_op.append(tf.assign(v, backup_v)) + self.p5 = new_tmp2 + reverse_average_op = tf.group(*reverse_average_op) + # with tf.control_dependencies([reverse_average_op]): + self.p3_2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + return average_op, use_average_op, reverse_average_op + + def _eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + print('v:{}'.format(tf.trainable_variables())) + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + print('v_avg:{}'.format(tf.trainable_variables())) + for step in range(int(self.num_test_batches)): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = 
np.exp(total_loss / (step + 1)) + log_string = 'step={0:<6d}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if sess.run(self.should_reset_eval): + break + # test_ppl = np.exp(total_loss / self.num_test_batches) + + # log_string = 'step={0:<6d}'.format(self.num_test_batches) + # log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + # print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + # test_ppl = tf.math.exp(total_loss/ self.num_test_batches, name='output') + # print("test_ppl:{}".format(test_ppl)) + # loss_assign_op = tf.assign(self.tt_loss, tf.Variable(total_loss, name='total_loss', dtype=tf.float32,trainable=False)) + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def _build_infer(self): + print("Building infer graph") + tt_loss = tf.Variable(0, name="total_loss", dtype=tf.float32, trainable=False) + def _condition(step, *unused_args): + return tf.less(step, self.num_test_batches-3) + def _body(step, tt_loss): + with tf.control_dependencies([self.test_loss]): + tt_loss += self.test_loss + return step+1, tt_loss + loop_inps = [tf.constant(0, dtype=tf.int32), tt_loss] + _, tt_loss = tf.while_loop(_condition, _body, loop_inps) + test_ppl = tf.math.exp(tt_loss/ self.num_test_batches, name='test_ppl') + print("test_ppl:{}".format(test_ppl)) + self.infer_ppl = test_ppl + + def _build_avg_infer(self): + print("Build avg_infer graph") + def _fp(): + with tf.control_dependencies([self.use_moving_avg_vars, self.test_init_states['reset']]): + avg_infer_ppl = self.infer_ppl + with tf.control_dependencies([avg_infer_ppl, self.restore_normal_vars]): + return avg_infer_ppl + def _fn(): + return self.infer_ppl + + with tf.control_dependencies([self.moving_avg_started]): + avg_infer_ppl = tf.cond(tf.greater_equal(self.moving_avg_started, 1), _fp, _fn) + self.avg_infer_ppl = tf.identity(avg_infer_ppl, name="output") + print("self.avg_infer_ppl:{}".format(self.avg_infer_ppl)) + + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + + if use_moving_avg: + # print('sum_v:{}'.format(sess.run(self.p1))) + # print('new_sum_v:{}'.format(sess.run(self.p2))) + # print('[[step, mu, v, v_avg, new_v_avg]]={}'.format(sess.run(self.p3))) + # self.use_moving_avg_vars ===>影子权重暂时替代当前权重 + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + # print('v_avg:{}\n[[v, avg_v, backup_v]]={}'.format(sess.run(self.p3_1), sess.run(self.p4))) + + valid_loss = [] + for _ in range(self.num_valid_batches): + loss = sess.run(self.valid_loss) + total_loss += loss + valid_loss.append(loss) + if sess.run(self.should_reset_eval): + break + print("valid_loss={}, self.num_valid_batches={}".format(valid_loss, self.num_valid_batches)) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + # print('v:{}\n[[v, backup_v]]={} \n============================================================'.format( + # sess.run(self.p3_2), sess.run(self.p5))) + + return valid_ppl + + def do_infer(self, sess, use_moving_avg=False): + # self._eval_test(sess, use_moving_avg) + return 
sess.run(self.avg_infer_ppl) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py new file mode 100644 index 000000000..d8bdec500 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py @@ -0,0 +1,93 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import datetime +# import moxing as mox +import tensorflow.compat.v1 as tf +gfile = tf.gfile + +def obs_data2modelarts(config): + """ + Copy train data from obs to modelarts by using moxing api. + """ + start = datetime.datetime.now() + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.data_url, config.modelarts_data_dir)) + mox.file.copy_parallel(src_url=config.data_url, dst_url=config.modelarts_data_dir) + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.ckp_path, config.modelarts_result_dir)) + output_dir = config.modelarts_result_dir + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + mox.file.copy_parallel(src_url=config.ckp_path, dst_url=config.modelarts_result_dir) + end = datetime.datetime.now() + files = os.listdir(config.modelarts_data_dir) + print("===>>>Files:", files) + files2 = os.listdir(config.modelarts_result_dir) + print("===>>>Files2:", files2) + + +def modelarts_result2obs(FLAGS): + """ + Copy debug data from modelarts to obs. + According to the swich flags, the debug data may contains auto tune repository, + dump data for precision comparision, even the computation graph and profiling data. + """ + work_dir = os.getcwd() + print("start op: modelarts_result2obs..........") + + ## copy result from modelarts to obs + obs_result_dir = os.path.join(FLAGS.obs_dir, 'result') + if not mox.file.exists(obs_result_dir): + mox.file.make_dirs(obs_result_dir) + else: + mox.file.remove(obs_result_dir, recursive=True) + mox.file.make_dirs(obs_result_dir) + mox.file.copy_parallel(src_url=FLAGS.output_dir, dst_url=obs_result_dir) + print("===>>>Copy Event or Checkpoint from modelarts dir:{} to obs:{}".format(FLAGS.output_dir, obs_result_dir)) + + ## Copy auto tune repository. Comment this snippets if npu_auto_tune is off. + # if FLAGS.npu_auto_tune: + # modelarts_auto_tune_dir = os.path.join(work_dir, "npu_auto_tune") + # obs_auto_tune_dir = os.path.join(FLAGS.obs_dir, 'npu_auto_tune') + # if not mox.file.exists(obs_auto_tune_dir): + # mox.file.make_dirs(obs_auto_tune_dir) + # mox.file.copy_parallel(modelarts_auto_tune_dir, obs_auto_tune_dir) + # print("===>>>Auto tune:{} on OBS dir:{}".format(mox.file.list_directory(obs_auto_tune_dir), obs_auto_tune_dir)) + # + # ## Copy dump data. Comment this snippets if npu_dump_data is off. 
+ # if FLAGS.npu_dump_data: + # modelarts_dump_data_dir = os.path.join(work_dir, "npu_dump_data") + # obs_dump_data_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_data') + # if not mox.file.exists(obs_dump_data_dir): + # mox.file.make_dirs(obs_dump_data_dir) + # mox.file.copy_parallel(modelarts_dump_data_dir, obs_dump_data_dir) + # print("===>>>Dumped graph:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_data_dir), obs_dump_data_dir)) + # + # ## Copy compute graph. Comment this snippets if npu_dump_graph is off. + # if FLAGS.npu_dump_graph: + # modelarts_dump_graph_dir = os.path.join(work_dir, "npu_dump_graph") + # obs_dump_graph_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_graph') + # if not mox.file.exists(obs_dump_graph_dir): + # mox.file.make_dirs(obs_dump_graph_dir) + # mox.file.copy_parallel(modelarts_dump_graph_dir, obs_dump_graph_dir) + # print("===>>>Dumped data:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_graph_dir), obs_dump_graph_dir)) + # + # ## Copy profiling data. Comment this snippets if npu_profiling is off. + # if FLAGS.npu_profiling: + # modelarts_profiling_dir = os.path.join(work_dir, "npu_profiling") + # obs_profiling_dir = os.path.join(FLAGS.obs_dir, 'npu_profiling') + # if not mox.file.exists(obs_profiling_dir): + # mox.file.make_dirs(obs_profiling_dir) + # mox.file.copy_parallel(modelarts_profiling_dir, obs_profiling_dir) + # print("===>>>Profiling data:{} on OBS dir:{}".format(mox.file.list_directory(obs_profiling_dir), obs_profiling_dir)) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py new file mode 100644 index 000000000..2a1816ac3 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
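+# NOTE: unlike fixed.py above, this baseline entry point still creates a plain session
+# (config.gpu_options.allow_growth) and has not been switched to the NpuOptimizer configuration.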
+ +"""Entry point for AWD LSTM.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import os +import pickle +import sys +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import lstm_lib +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +flags.DEFINE_boolean('reset_output_dir', False, '') +flags.DEFINE_string('output_dir', None, '') +flags.DEFINE_string('data_path', None, '') + +flags.DEFINE_integer('log_every', 200, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + + lm = lstm_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.eval_test, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def train(params): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=5) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + accum_loss = 0 + accum_step = 0 + epoch = 0 + best_valid_ppl = [] + start_time = time.time() + while True: + sess.run(ops['reset_batch_states']) + loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) + accum_loss += loss + accum_step += 1 + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. 
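+        # Report smoothed training perplexity together with the learning rate, gradient norm, and moving-average flag.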
+ log_string = 'epoch={0:<5d}'.format(epoch) + log_string += ' step={0:<7d}'.format(step) + log_string += ' ppl={0:<9.2f}'.format(train_ppl) + log_string += ' lr={0:<10.7f}'.format(lr) + log_string += ' |g|={0:<5.2f}'.format(gn) + log_string += ' avg={0:<2d}'.format(moving_avg_started) + log_string += ' mins={0:<.2f}'.format(mins_so_far) + print(log_string) + + if moving_avg_started: + sess.run(ops['update_moving_avg']) + + # if step % params.num_train_batches == 0: + if should_reset: + epoch += 1 + accum_loss = 0 + accum_step = 0 + valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started) + sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) + if (not moving_avg_started and + len(best_valid_ppl) > params.best_valid_ppl_threshold and + valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])): + print('Starting moving_avg') + sess.run(ops['start_moving_avg']) + best_valid_ppl.append(valid_ppl) + + if step >= params.num_train_steps: + ops['eval_test'](sess, use_moving_avg=moving_avg_started) + break + sess.close() + + +def main(unused_args): + output_dir = FLAGS.output_dir + print('-' * 80) + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. Reseting'.format(output_dir)) + gfile.DeleteRecursively(output_dir) + gfile.MakeDirs(output_dir) + + print('-' * 80) + log_file = os.path.join(output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ) + + train(params) + + +if __name__ == '__main__': + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py new file mode 100644 index 000000000..576b6f2e2 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py @@ -0,0 +1,458 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD LSTM model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + +MOVING_AVERAGE_DECAY = 0.9995 + +MOVING_AVERAGE_DECAY = 0.9995 + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _lstm(x, prev_c, prev_h, w_lstm, layer_masks): + """Multi-layer LSTM. 
+ + Args: + x: [batch_size, num_steps, hidden_size]. + prev_c: [[batch_size, hidden_size] * num_layers]. + prev_h: [[batch_size, hidden_size] * num_layers]. + w_lstm: [[2 * hidden_size, 4 * hidden_size] * num_layers]. + layer_masks: [([hidden_size, hidden_size] or None)* num_layers]. + + Returns: + next_c: [[batch_size, hidden_size] * num_layers]. + next_h: [[batch_size, hidden_size] * num_layers]. + all_h: [batch_size, num_steps, hidden_size]. + """ + _, num_steps, _ = tf.unstack(tf.shape(x)) + num_layers = len(w_lstm) + + all_h = [tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + for _ in range(num_layers)] + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, pprev_c, pprev_h, all_h): + """Apply LSTM at each step.""" + next_c, next_h = [], [] + for layer_id, (p_c, p_h, w, m) in enumerate(zip( + pprev_c, pprev_h, w_lstm, layer_masks)): + inp = x[:, step, :] if layer_id == 0 else next_h[-1] + if m is not None: + inp *= m + ifog = tf.matmul(tf.concat([inp, p_h], axis=1), w) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + c = i * g + f * p_c + h = o * tf.tanh(c) + all_h[layer_id] = all_h[layer_id].write(step, h) + next_c.append(c) + next_h.append(h) + return step + 1, next_c, next_h, all_h + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_c, prev_h, all_h] + _, next_c, next_h, all_h = tf.while_loop(_condition, _body, loop_inps, + parallel_iterations=1) + all_h = [tf.transpose(h.stack(), [1, 0, 2]) + for h in all_h] + + return next_c, next_h, all_h + + +def _set_default_params(params): + """Set default parameters.""" + params.add_hparam('alpha', 2.) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 7) + params.add_hparam('beta', 1.) # activation slowness reg + + params.add_hparam('batch_size', 12) + params.add_hparam('bptt_steps', 70) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.65) # embeddings + params.add_hparam('drop_l', 0.30) # between layers + params.add_hparam('drop_o', 0.40) # output + params.add_hparam('drop_w', 0.50) # weight + + params.add_hparam('emb_size', 400) + params.add_hparam('start_decay_epoch', 14) + params.add_hparam('decay_every_epoch', 1) + params.add_hparam('decay_rate', 0.98) + params.add_hparam('grad_bound', 0.25) + params.add_hparam('hidden_size', 1100) + params.add_hparam('init_range', 0.1) + params.add_hparam('learning_rate', 20.) 
+ params.add_hparam('num_layers', 3) + params.add_hparam('num_train_epochs', 500) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 1.2e-6) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, self.base_bptt) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('start_decay_step', + params.start_decay_epoch * self.num_train_batches) + params.add_hparam('decay_every_step', + params.decay_every_epoch * self.num_train_batches) + + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + + def _build_params(self): + """Create and count model parameters.""" + print('-' * 80) + print('Building model params') + with tf.variable_scope(self.name): + with tf.variable_scope('embedding'): + initializer = tf.initializers.random_uniform( + -self.params.init_range, self.params.init_range) + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.emb_size], + initializer=initializer) + dropped_w_emb = tf.layers.dropout( + w_emb, self.params.drop_e, [self.params.vocab_size, 1], + training=True) + + w_lstm = [] + dropped_w_lstm = [] + with tf.variable_scope('lstm'): + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + init_range = 1.0 / np.sqrt(hid_size) + initializer = tf.initializers.random_uniform(-init_range, init_range) + with tf.variable_scope('layer_{0}'.format(i)): + w = tf.get_variable('w', [inp_size + hid_size, 4 * hid_size], + initializer=initializer) + i_mask = tf.ones([inp_size, 4 * hid_size], dtype=tf.float32) + h_mask = _gen_mask([hid_size, 4 * hid_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w = w * mask + w_lstm.append(w) + dropped_w_lstm.append(dropped_w) + + with tf.variable_scope('init_states'): + batch_prev_c, batch_prev_h, batch_reset = [], [], [] + test_prev_c, test_prev_h, test_reset = [], [], [] + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + + with tf.variable_scope('layer_{0}'.format(i)): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hid_size] + batch_prev_c.append(tf.get_variable( + 'c', init_shape, dtype=tf.float32, trainable=False)) + batch_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset.append(tf.assign(batch_prev_c[-1], zeros)) + batch_reset.append(tf.assign(batch_prev_h[-1], zeros)) + with tf.variable_scope('test'): + init_shape = [1, hid_size] + test_prev_c.append(tf.get_variable( + 'c', 
init_shape, dtype=tf.float32, trainable=False)) + test_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + test_reset.append(tf.assign(test_prev_c[-1], zeros)) + test_reset.append(tf.assign(test_prev_h[-1], zeros)) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 'c': batch_prev_c, + 'h': batch_prev_h, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_lstm': dropped_w_lstm, + 'w_soft': w_emb, + } + self.test_init_states = { + 'c': test_prev_c, + 'h': test_prev_h, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_lstm': w_lstm, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_lstm = model_params['w_lstm'] + w_soft = model_params['w_soft'] + prev_c = init_states['c'] + prev_h = init_states['h'] + + emb = tf.nn.embedding_lookup(w_emb, x) + if is_training: + emb = tf.layers.dropout( + emb, self.params.drop_i, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + layer_masks = [None] + for _ in range(1, self.params.num_layers - 1): + mask = _gen_mask([self.params.batch_size, self.params.hidden_size], + self.params.drop_l) + layer_masks.append(mask) + layer_masks.append(None) + else: + layer_masks = [None] * self.params.num_layers + + out_c, out_h, all_h = _lstm(emb, prev_c, prev_h, w_lstm, layer_masks) + top_h = all_h[-1] + if is_training: + top_h = tf.layers.dropout( + top_h, self.params.drop_o, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + carry_on = [] + for var, val in zip(prev_c + prev_h, out_c + out_h): + carry_on.append(tf.assign(var, val)) + + logits = tf.einsum('bnh,vh->bnv', top_h, w_soft) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) # TODO(hyhieu): watch for num_steps + + reg_loss = loss # loss + regularization_terms, for training only + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(w ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.add_n( + [tf.reduce_mean(h ** 2) for h in all_h[:-1]]) + + # activation slowness L2 reg + reg_loss += self.params.beta * tf.add_n( + [tf.reduce_mean((h[:, 1:, :] - h[:, :-1, :]) ** 2) + for h in all_h[:-1]]) + + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + # learning_rate = tf.Print( + # 
learning_rate, + # [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)], + # message='lr: ', summarize=3) + grads = tf.gradients(reg_loss, tf_vars) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + + all_vars = tf.trainable_variables() + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + + with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + average_op = [] + for v, avg_v in average_pairs: + mu = 1 / avg_step + new_avg = mu * v + (1 - mu) * avg_v + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, new_avg)) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + use_average_op.append(tf.assign(v, avg_v)) + use_average_op = tf.group(*use_average_op) + + reverse_average_op = [] + for v, backup_v in backup_pairs: + reverse_average_op.append(tf.assign(v, backup_v)) + reverse_average_op = tf.group(*reverse_average_op) + + return average_op, use_average_op, reverse_average_op + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + for _ in range(self.num_valid_batches): + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return valid_ppl + + def eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + for step in 
range(self.num_test_batches): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = np.exp(total_loss / (step + 1)) + log_string = 'step={0}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + test_ppl = np.exp(total_loss / self.num_valid_batches) + log_string = 'step={0}'.format(self.num_test_batches) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return test_ppl diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py new file mode 100644 index 000000000..9a8804313 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Preprocess Penn-Treebank dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import pickle +import numpy as np +import os + + +def main(): + dataFolder = "/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/dataset/Penn_Treebank_dataset" + dataList = os.listdir(dataFolder) + dataPath = {} + for dataName in dataList: + dataPath[dataName] = os.path.join(dataFolder, dataName) + + with open(dataPath.get("ptb.train.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + vocab, index = {}, {} + for word in sorted(words): + if word not in vocab: + index[len(vocab)] = word + vocab[word] = len(vocab) + print('vocab size: {}'.format(len(vocab))) + + x_train = [vocab[word] for word in words] + [vocab['']] + x_train = np.array(x_train, dtype=np.int32) + + with open(dataPath.get('ptb.valid.txt')) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_valid = [vocab[word] for word in words] + [vocab['']] + x_valid = np.array(x_valid, dtype=np.int32) + + with open(dataPath.get("ptb.test.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_test = [vocab[word] for word in words] + [vocab['']] + x_test = np.array(x_test, dtype=np.int32) + + print('train size: {}'.format(np.size(x_train))) + print('valid size: {}'.format(np.size(x_valid))) + print('test size: {}'.format(np.size(x_test))) + + with open('ptb/ptb.pkl', 'wb') as fout: + pickle.dump((x_train, x_valid, x_test, vocab, index), fout, protocol=2) + + +if __name__ == '__main__': + main() diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py new file mode 100644 index 000000000..4d73e2b37 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Copyright 2021 The 
Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Entry point for AWD ENAS search process.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig + +import os +import pickle +import sys +import time + +sys.path.append("/home/ma-user/modelarts/user-job-dir/") + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import child +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import controller +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils +from tensorflow.contrib import training as contrib_training + + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string('output_dir', None, '') +flags.DEFINE_string('data_path', None, '') +flags.DEFINE_string("obs_dir", "obs://rstg/log", "obs result path, not need on gpu and apulis platform") + + +## Other parametersresult +flags.DEFINE_boolean('reset_output_dir', False, '') +flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") + +flags.DEFINE_integer('log_every', 20, '') + + + +def get_ops(params, x_train, x_valid): + """Build [train, valid, test] graphs.""" + + ct = controller.Controller(params=params) + lm = child.LM(params, ct, x_train, x_valid) + ct.build_trainer(lm) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'l2_reg_loss': lm.l2_reg_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'bptt_rate': lm.bptt_rate, + + 'controller_train_op': ct.train_op, + 'controller_grad_norm': ct.train_op, + 'controller_sample_arc': ct.sample_arc, + 'controller_entropy': ct.sample_entropy, + 'controller_reward': ct.reward, + 'controller_baseline': ct.baseline, + 'controller_optimizer': ct.optimizer, + 'controller_train_fn': ct.train, + + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + +def load_ckpt_model(sess, save_path): + print("reload model from:{}".format(save_path)) + checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 + input_checkpoint = checkpoint.model_checkpoint_path + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 + saver.restore(sess, input_checkpoint) # 使用最新模型 + sess.run(tf.global_variables_initializer())# 初始化所有变量 + +def train(params): + """Entry train function.""" + print("data_path:{}".format(params.data_path)) + 
print("output_dir:{}".format(params.output_dir)) + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, _, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + + + g = tf.Graph() + with g.as_default(): + tf.random.set_random_seed(2126) + ops = get_ops(params, x_train, x_valid) + run_ops = [ + ops['train_loss'], + ops['l2_reg_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=5) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + hooks.append(ops['controller_optimizer'].make_session_run_hook(True)) + + # >>> add code >> + # 创建session + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + custom_op = config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["use_off_line"].b = True # 在昇腾AI处理器执行训练 + custom_op.parameter_map["mix_compile_mode"].b = False # 关闭混合计算,根据实际情况配置,默认关闭 + # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # 设置混合精度 + custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fusion_switch.cfg") + # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("/home/ma-user/modelarts/inputs/data_url_0") + # + # custom_op.parameter_map["enable_dump_debug"].b = True + # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") + # # custom_op.parameter_map["enable_data_pre_proc"].b = True # getnext算子下沉是迭代循环下沉的必要条件 + # # custom_op.parameter_map[ + # # "iterations_per_loop"].i = 10 # 此处设置的值和set_iteration_per_loop设置的iterations_per_loop值保持一致,用于判断是否进行训练迭代下沉 + # + config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # 必须显式关闭 + config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF # 必须显式关闭 + # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir) + # >>> add code >> + + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + # reload model + if len(gfile.ListDirectory(params.output_dir)): + last_checkpoint = tf.train.latest_checkpoint(params.output_dir) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + + accum_loss = 0 + accum_step = 0 + epoch = sess.run(ops['global_step']) // params.num_train_batches + best_valid_ppl = [] + start_time = time.time() + last_mins = (time.time() - start_time) / 60 + accum_rate = 0. 
+ # sess.run(tf.global_variables_initializer()) + while True: + try: + # run_ops = [ + # ops['train_loss'], + # ops['l2_reg_loss'], + # ops['grad_norm'], + # ops['learning_rate'], + # ops['should_reset'], + # ops['train_op'], + # ] + # 修改点 + # loss, l2_reg, gn, lr, should_reset, _ = sess.run(run_ops) + loss = sess.run(ops['train_loss']) + # print("loss_OK:loss:{}".format(loss)) + l2_reg = sess.run(ops['l2_reg_loss']) + # print("l2_reg_OK:l2_reg:{}".format(l2_reg)) + gn = sess.run(ops['grad_norm']) + # gn = -111111 + # print("gn_OK:gn:{}".format(gn)) + lr = sess.run(ops['learning_rate']) + # print("lr_OK:le:{}".format(lr)) + should_reset = sess.run(ops['should_reset']) + _ = sess.run(ops["train_op"]) + + bptt_rate = sess.run(ops['bptt_rate']) + # print("should_reset_OK:should_reset:{}".format(should_reset)) + # if not should_not_train : + # _ = sess.run(ops["train_op"]) + + accum_loss += loss + accum_step += 1 + accum_rate += bptt_rate + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. + mins_pices = mins_so_far - last_mins + last_mins = mins_so_far + log_string = 'epoch={0:<5d}'.format(epoch) + log_string += ' step={0:<7d}/{1:<6d}'.format(step, params.num_train_steps) + log_string += ' ppl={0:<9.2f}'.format(train_ppl) + log_string += ' lr={0:<7.2f}'.format(lr) + log_string += ' |w|={0:<6.2f}'.format(l2_reg) + log_string += ' |g|={0:<6.2f}'.format(gn) + log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, mins_pices/params.log_every) + # log_string += ' accum_rate(rate of a epoch)={0:<4.4f}'.format(accum_rate) + # log_string += ' should_reset:{}'.format(should_reset) + print(log_string) + + if should_reset: + accum_rate=0. + print("should_reset:{}".format(should_reset)) + ops['controller_train_fn'](sess, ops['reset_batch_states']) + epoch += 1 + accum_loss = 0 + accum_step = 0 + valid_ppl = ops['eval_valid'](sess) + sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) + best_valid_ppl.append(valid_ppl) + + if step % (params.num_train_batches * 10) == 0: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + if step >= params.num_train_steps: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + break + except tf.errors.InvalidArgumentError: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + last_checkpoint = tf.train.latest_checkpoint(params.output_dir) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + sess.close() + + +def main(unused_args): + + tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.info("**********") + print("===>>>data_path:{}".format(FLAGS.data_path)) + print("===>>>output_dir:{}".format(FLAGS.output_dir)) + print("===>>>obs_dir:{}".format(FLAGS.obs_dir)) + print("===>>>train_step:{}".format(FLAGS.num_train_epochs)) + + np.set_printoptions(precision=3, suppress=True, threshold=int(1e9), + linewidth=80) + + print('-' * 80) + if not gfile.IsDirectory(FLAGS.output_dir): + print('Path {} does not exist. Creating'.format(FLAGS.output_dir)) + gfile.MakeDirs(FLAGS.output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. 
Reseting'.format(FLAGS.output_dir)) + gfile.DeleteRecursively(FLAGS.output_dir) + gfile.MakeDirs(FLAGS.output_dir) + + print('-' * 80) + log_file = os.path.join(FLAGS.output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ) + train(params) + + + +if __name__ == '__main__': + flags.mark_flag_as_required("data_path") + flags.mark_flag_as_required("output_dir") + flags.mark_flag_as_required("obs_dir") + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh new file mode 100644 index 000000000..b5df7f14d --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh @@ -0,0 +1,36 @@ +#!/bin/bash +### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. +### Modelarts Platform command for train + +#export ASCEND_GLOBAL_LOG_LEVEL=1 # 日志级别设置 debug级别为0;info 级别为1;warning级别为 2;error级别为4 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 # plog日志是否打屏 +#export ASCEND_GLOBAL_EVENT_ENABLE=0 # 设置事件级别 不开启Event日志级别为0;开启Event日志级别为1 + +export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config +#export ENABLE_FORCE_V2_CONTROL=1 + +code_dir=${1} +data_path=${2} +output_dir=${3} +obs_url=${4} + +current_time=`date "+%Y-%m-%d-%H-%M-%S"` + +python ${code_dir}/search.py \ + --data_path=${data_path}/ptb.pkl \ + --output_dir=${output_dir} \ + --obs_dir=${obs_url} \ + --platform='modelarts' \ + 2>&1 | tee ${output_dir}/${current_time}_train_npu.log + + +#BASE_PATH='/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' +# +#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/search' +# +#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' +# +#args="--output_dir=$OUTPUT_DIR --data_path=$DATA_PATH" +# +##run search +#python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py $args diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh new file mode 100644 index 000000000..b3ed57170 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh @@ -0,0 +1,45 @@ +#!/bin/bash +### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. 
+### Modelarts Platform command for train + +#export ASCEND_GLOBAL_LOG_LEVEL=4 # 日志级别设置 debug级别为0;info 级别为1;warning级别为 2;error级别为3;null级别为4 +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 # plog日志是否打屏 +#export ASCEND_HOST_LOG_FILE_NUM=1000 +#export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=0 +#export ASCEND_GLOBAL_EVENT_ENABLE=0 # 设置事件级别 不开启Event日志级别为0;开启Event日志级别为1 +#export ASCEND_GLOBAL_TRACE_ENABLE=0 +#export PROFILING_MODE=false +#export PROFILING_OPTIONS='{"output":"/tmp/profiling","training_trace":"off","task_trace":"off","aicpu":"on","fp_point":"resnet_model/conv2d/Conv2Dresnet_model/batch_normalization/FusedBatchNormV3_Reduce","bp_point":"gradients/AddN_70","aic_metrics":"PipeUtilization"}' + +export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config +#export ENABLE_FORCE_V2_CONTROL=1 + +code_dir=${1} +data_path=${2} +output_dir=${3} +ckp_path=${4} + +current_time=`date "+%Y-%m-%d-%H-%M-%S"` +FIXED_ARC='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' + +nohup python3 ${code_dir}/fixed.py \ + --data_path=${data_path}/ptb.pkl \ + --output_dir=${output_dir} \ + --fixed_arc='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' \ + --ckp_path=${ckp_path} \ + --platform='modelarts' \ + > nohup1.out 2>&1 & + + +#FIXED_ARC='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' +# +#BASE_PATH = '/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' +# +#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/test' +# +#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' +# +#args ='--fixed_arc=FIXED_ARC --output_dir=$OUTPUT_DIR --data_path=$DATA_PATH' +# +##run test +#python3 /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py $args diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py new file mode 100644 index 000000000..7b59aec44 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Common utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import re +import sys +import tensorflow.compat.v1 as tf + +gfile = tf.gfile + + +class Logger(object): + """Prints to both STDOUT and a file.""" + + def __init__(self, filepath): + self.terminal = sys.stdout + self.log = gfile.GFile(filepath, 'a+') + + def write(self, message): + self.terminal.write(message) + self.terminal.flush() + self.log.write(message) + self.log.flush() + + def flush(self): + self.terminal.flush() + self.log.flush() + + +def get_lr(curr_step, params, lr_decay_rate): + """Compute learning rate at step depends on `params`.""" + lr = tf.constant(params.learning_rate, dtype=tf.float32) + if 'num_warmup_steps' in params and params.num_warmup_steps > 0: + num_warmup_steps = tf.cast(params.num_warmup_steps, dtype=tf.float32) + step = tf.cast(curr_step, dtype=tf.float32) + warmup_lr = params.learning_rate * step / num_warmup_steps + lr = tf.cond(tf.less(step, num_warmup_steps), lambda: warmup_lr, lambda: lr) + return lr * lr_decay_rate + + +def strip_var_name(var_name): + """Strips variable name of sub-strings blocking variable name matching.""" + # Strip trailing number, e.g. convert + # 'lstm/W_0:0' to 'lstm/W_0'. + var_name = re.sub(r':\d+$', '', var_name) + # Strip partitioning info, e.g. convert + # 'W_0/part_3/Adagrad' to 'W_0/Adagrad'. + var_name = re.sub(r'/part_\d+', '', var_name) + return var_name -- Gitee From 15215627a9ca4da90ea1a0fe9eaad6ed499c3745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:24:49 +0000 Subject: [PATCH 08/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index 50d2547b0..1fa6b1e33 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -11,9 +11,9 @@ ###概述 -enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. -- 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) -- 参考代码:[enas](https://github.com/melodyguan/enas) +####enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. 
+- ####参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +- ####参考代码:[enas](https://github.com/melodyguan/enas) ###默认配置 ####数据预处理 -- Gitee From 2aba93dff3d8a9107b4bf8566d880efd80fdf484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:25:02 +0000 Subject: [PATCH 09/27] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?ntrib/TensorFlow/Research/nlp/enas/ENAS=5FID2053=5Ffor=5FTensor?= =?UTF-8?q?Flow/README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 207 ------------------ 1 file changed, 207 deletions(-) delete mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md deleted file mode 100644 index 1fa6b1e33..000000000 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ /dev/null @@ -1,207 +0,0 @@ -###基本信息 -####发布者(Publisher):Huawei -####应用领域(Application Domain):NLP -####修改时间(Modified) :2018. -####框架(Framework):TensorFlow 1.15.0 -####模型格式(Model Format):ckpt -####精度(Precision):Mixed -####处理器(Processor):昇腾910 -####应用级别(Categories):Research -####描述(Description): enas模型用于ptb数据集的神经网络结构搜索 - - -###概述 -####enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. -- ####参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) -- ####参考代码:[enas](https://github.com/melodyguan/enas) - -###默认配置 -####数据预处理 - - #### 输入数据为文本 - - #### 文本输入格式: id [int] -#### 训练超参数 - - #### search - - #### controller baseline decay : 0.999 - - #### controller entropy weight : 1e-5 - - #### controller temperature : 5 - - #### controller learning rate : 5e-5 - - #### controller num layers : 9 - - #### controller hidden size : 64 - - #### controller num functions : 4 - - #### child batch size : 128 - - #### child bptt steps : 35 - - #### num train epochs : 600 - - ####test - - #### child grad bound : 0.25 - - #### child weight decay : 2e-6 - - #### child num train epochs :3000 - - #### child hidden size : 800 - - #### learning_rate : 20. - -###支持特性 - -| 特性列表 | 是否支持 | -|------|------| -| 混合精度 | 是 | - -###混合精度训练 -#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 - -###快速上手 -####模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 - -###代码结构文件 -#### -|— search.py 搜索模型代码\ -|— child.py 子图模型代码\ -|— fixed.py 架构验证模型代码\ -|— fixed_lib.py\ -|— data_utils.py 数据处理代码\ -|— controller.py 性能评估模型代码\ -|— boot_modelarts.py 模型运行代码\ -|— ... 
- -###脚本参数 -#### -- search:\ ---data_path\ ---output_dir\ ---obs_dir -- test:\ ---data_path\ ---output_dir\ ---fixed_arc\ ---ckp_path - - - -###训练过程 -在论文的参数设置下,GPU训练精度和速度可以达到要求; -NPU的训练精度和速度还未达标。 -- #### GPU -#### search -epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ -valid_ppl=1800.73\ -epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ -valid_ppl=892.87\ -epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ -valid_ppl=843.70\ -epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ -valid_ppl=898.45\ -epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ -valid_ppl=774.25\ -epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ -valid_ppl=622.82\ -epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ -valid_ppl=606.77\ -epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ -valid_ppl=579.69\ -epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ -valid_ppl=520.63\ -epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ -...\ -valid_ppl=162.39\ -epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 -#### test -epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ -epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ -valid_ppl=463.03\ -epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ -epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ -valid_ppl=339.76\ -epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ -epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ -valid_ppl=271.71\ -epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ -epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ -valid_ppl=245.00\ -epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ -epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ -valid_ppl=213.10\ -epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ -epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ -epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ -valid_ppl=209.90\ -epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ -epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ -valid_ppl=181.99\ -epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ -epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ -valid_ppl=176.79\ -epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ -epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ -valid_ppl=166.62\ -...\ -epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ -epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ -valid_ppl=61.17\ -epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ -epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ -epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ -valid_ppl=61.17\ -epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ -...\ -step=70000 test_ppl=59.15\ -step=71000 test_ppl=59.03\ -step=72000 test_ppl=59.06\ -step=73000 test_ppl=58.41\ -step=74000 test_ppl=58.24\ -step=75000 test_ppl=58.12\ -step=76000 test_ppl=58.15\ -step=77000 test_ppl=58.29\ -step=78000 test_ppl=58.36\ -step=79000 test_ppl=58.50\ -step=80000 test_ppl=58.43\ -step=81000 test_ppl=58.72\ 
-step=82000 test_ppl=58.52\ -step=82429 test_ppl=58.64 - -- #### NPU -#### test -epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ -epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ -valid_ppl=389.49\ -epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ -epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ -valid_ppl=298.25\ -epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ -epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ -epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ -valid_ppl=236.61\ -epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ -epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ -valid_ppl=252.75\ -epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ -epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ -valid_ppl=197.03\ -epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ -epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ -epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ -valid_ppl=191.64\ -epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ -epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ -valid_ppl=200.02\ -epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ -epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ -valid_ppl=201.46\ -epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ -epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ -epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ -valid_ppl=175.82\ -epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ -epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ -valid_ppl=209.94\ -epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ -epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ -epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ -valid_ppl=167.50\ -...\ -valid_ppl=112.40\ -epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ -epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ -valid_ppl=113.40\ -epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ -epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ -epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ -valid_ppl=113.22\ -epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file -- Gitee From ab5fa2fd60688d2d4882ab04b8991880f29c9a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:25:50 +0000 Subject: [PATCH 10/27] motify README.md MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md new file mode 100644 index 000000000..1fa6b1e33 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -0,0 +1,207 @@ +###基本信息 +####发布者(Publisher):Huawei +####应用领域(Application Domain):NLP +####修改时间(Modified) :2018. +####框架(Framework):TensorFlow 1.15.0 +####模型格式(Model Format):ckpt +####精度(Precision):Mixed +####处理器(Processor):昇腾910 +####应用级别(Categories):Research +####描述(Description): enas模型用于ptb数据集的神经网络结构搜索 + + +###概述 +####enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. +- ####参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +- ####参考代码:[enas](https://github.com/melodyguan/enas) + +###默认配置 +####数据预处理 + - #### 输入数据为文本 + - #### 文本输入格式: id [int] +#### 训练超参数 + - #### search + - #### controller baseline decay : 0.999 + - #### controller entropy weight : 1e-5 + - #### controller temperature : 5 + - #### controller learning rate : 5e-5 + - #### controller num layers : 9 + - #### controller hidden size : 64 + - #### controller num functions : 4 + - #### child batch size : 128 + - #### child bptt steps : 35 + - #### num train epochs : 600 + - ####test + - #### child grad bound : 0.25 + - #### child weight decay : 2e-6 + - #### child num train epochs :3000 + - #### child hidden size : 800 + - #### learning_rate : 20. + +###支持特性 + +| 特性列表 | 是否支持 | +|------|------| +| 混合精度 | 是 | + +###混合精度训练 +#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 + +###快速上手 +####模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 + +###代码结构文件 +#### +|— search.py 搜索模型代码\ +|— child.py 子图模型代码\ +|— fixed.py 架构验证模型代码\ +|— fixed_lib.py\ +|— data_utils.py 数据处理代码\ +|— controller.py 性能评估模型代码\ +|— boot_modelarts.py 模型运行代码\ +|— ... 
+ +###脚本参数 +#### +- search:\ +--data_path\ +--output_dir\ +--obs_dir +- test:\ +--data_path\ +--output_dir\ +--fixed_arc\ +--ckp_path + + + +###训练过程 +在论文的参数设置下,GPU训练精度和速度可以达到要求; +NPU的训练精度和速度还未达标。 +- #### GPU +#### search +epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ +valid_ppl=1800.73\ +epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ +valid_ppl=892.87\ +epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ +valid_ppl=843.70\ +epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ +valid_ppl=898.45\ +epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ +valid_ppl=774.25\ +epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ +valid_ppl=622.82\ +epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ +valid_ppl=606.77\ +epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ +valid_ppl=579.69\ +epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ +valid_ppl=520.63\ +epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ +...\ +valid_ppl=162.39\ +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 +#### test +epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ +epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ +valid_ppl=463.03\ +epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ +epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ +valid_ppl=339.76\ +epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ +epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ +valid_ppl=271.71\ +epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ +epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ +valid_ppl=245.00\ +epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ +epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ +valid_ppl=213.10\ +epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ +epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ +epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ +valid_ppl=209.90\ +epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ +epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ +valid_ppl=181.99\ +epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ +epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ +valid_ppl=176.79\ +epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ +epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ +valid_ppl=166.62\ +...\ +epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ +epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ +valid_ppl=61.17\ +epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ +epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ +epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ +valid_ppl=61.17\ +epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ +...\ +step=70000 test_ppl=59.15\ +step=71000 test_ppl=59.03\ +step=72000 test_ppl=59.06\ +step=73000 test_ppl=58.41\ +step=74000 test_ppl=58.24\ +step=75000 test_ppl=58.12\ +step=76000 test_ppl=58.15\ +step=77000 test_ppl=58.29\ +step=78000 test_ppl=58.36\ +step=79000 test_ppl=58.50\ +step=80000 test_ppl=58.43\ +step=81000 test_ppl=58.72\ 
+step=82000 test_ppl=58.52\ +step=82429 test_ppl=58.64 + +- #### NPU +#### test +epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ +epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ +valid_ppl=389.49\ +epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ +epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ +valid_ppl=298.25\ +epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ +epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ +epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ +valid_ppl=236.61\ +epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ +epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ +valid_ppl=252.75\ +epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ +epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ +valid_ppl=197.03\ +epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ +epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ +epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ +valid_ppl=191.64\ +epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ +epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ +valid_ppl=200.02\ +epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ +epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ +valid_ppl=201.46\ +epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ +epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ +epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ +valid_ppl=175.82\ +epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ +epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ +valid_ppl=209.94\ +epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ +epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ +epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ +valid_ppl=167.50\ +...\ +valid_ppl=112.40\ +epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ +epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ +valid_ppl=113.40\ +epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ +epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ +epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ +valid_ppl=113.22\ +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file -- Gitee From 0d27c6da88bf8246eb96a904cd266ddb5e3b6c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:30:30 +0000 Subject: [PATCH 11/27] 
=?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?ntrib/TensorFlow/Research/nlp/enas/ENAS=5FID2053=5Ffor=5FTensor?= =?UTF-8?q?Flow/README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 207 ------------------ 1 file changed, 207 deletions(-) delete mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md deleted file mode 100644 index 1fa6b1e33..000000000 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ /dev/null @@ -1,207 +0,0 @@ -###基本信息 -####发布者(Publisher):Huawei -####应用领域(Application Domain):NLP -####修改时间(Modified) :2018. -####框架(Framework):TensorFlow 1.15.0 -####模型格式(Model Format):ckpt -####精度(Precision):Mixed -####处理器(Processor):昇腾910 -####应用级别(Categories):Research -####描述(Description): enas模型用于ptb数据集的神经网络结构搜索 - - -###概述 -####enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. -- ####参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) -- ####参考代码:[enas](https://github.com/melodyguan/enas) - -###默认配置 -####数据预处理 - - #### 输入数据为文本 - - #### 文本输入格式: id [int] -#### 训练超参数 - - #### search - - #### controller baseline decay : 0.999 - - #### controller entropy weight : 1e-5 - - #### controller temperature : 5 - - #### controller learning rate : 5e-5 - - #### controller num layers : 9 - - #### controller hidden size : 64 - - #### controller num functions : 4 - - #### child batch size : 128 - - #### child bptt steps : 35 - - #### num train epochs : 600 - - ####test - - #### child grad bound : 0.25 - - #### child weight decay : 2e-6 - - #### child num train epochs :3000 - - #### child hidden size : 800 - - #### learning_rate : 20. - -###支持特性 - -| 特性列表 | 是否支持 | -|------|------| -| 混合精度 | 是 | - -###混合精度训练 -#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 - -###快速上手 -####模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 - -###代码结构文件 -#### -|— search.py 搜索模型代码\ -|— child.py 子图模型代码\ -|— fixed.py 架构验证模型代码\ -|— fixed_lib.py\ -|— data_utils.py 数据处理代码\ -|— controller.py 性能评估模型代码\ -|— boot_modelarts.py 模型运行代码\ -|— ... 
- -###脚本参数 -#### -- search:\ ---data_path\ ---output_dir\ ---obs_dir -- test:\ ---data_path\ ---output_dir\ ---fixed_arc\ ---ckp_path - - - -###训练过程 -在论文的参数设置下,GPU训练精度和速度可以达到要求; -NPU的训练精度和速度还未达标。 -- #### GPU -#### search -epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ -valid_ppl=1800.73\ -epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ -valid_ppl=892.87\ -epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ -valid_ppl=843.70\ -epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ -valid_ppl=898.45\ -epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ -valid_ppl=774.25\ -epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ -valid_ppl=622.82\ -epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ -valid_ppl=606.77\ -epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ -valid_ppl=579.69\ -epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ -valid_ppl=520.63\ -epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ -...\ -valid_ppl=162.39\ -epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 -#### test -epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ -epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ -valid_ppl=463.03\ -epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ -epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ -valid_ppl=339.76\ -epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ -epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ -valid_ppl=271.71\ -epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ -epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ -valid_ppl=245.00\ -epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ -epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ -valid_ppl=213.10\ -epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ -epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ -epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ -valid_ppl=209.90\ -epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ -epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ -valid_ppl=181.99\ -epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ -epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ -valid_ppl=176.79\ -epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ -epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ -valid_ppl=166.62\ -...\ -epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ -epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ -valid_ppl=61.17\ -epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ -epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ -epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ -valid_ppl=61.17\ -epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ -...\ -step=70000 test_ppl=59.15\ -step=71000 test_ppl=59.03\ -step=72000 test_ppl=59.06\ -step=73000 test_ppl=58.41\ -step=74000 test_ppl=58.24\ -step=75000 test_ppl=58.12\ -step=76000 test_ppl=58.15\ -step=77000 test_ppl=58.29\ -step=78000 test_ppl=58.36\ -step=79000 test_ppl=58.50\ -step=80000 test_ppl=58.43\ -step=81000 test_ppl=58.72\ 
-step=82000 test_ppl=58.52\ -step=82429 test_ppl=58.64 - -- #### NPU -#### test -epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ -epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ -valid_ppl=389.49\ -epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ -epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ -valid_ppl=298.25\ -epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ -epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ -epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ -valid_ppl=236.61\ -epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ -epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ -valid_ppl=252.75\ -epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ -epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ -valid_ppl=197.03\ -epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ -epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ -epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ -valid_ppl=191.64\ -epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ -epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ -valid_ppl=200.02\ -epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ -epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ -valid_ppl=201.46\ -epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ -epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ -epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ -valid_ppl=175.82\ -epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ -epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ -valid_ppl=209.94\ -epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ -epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ -epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ -valid_ppl=167.50\ -...\ -valid_ppl=112.40\ -epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ -epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ -valid_ppl=113.40\ -epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ -epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ -epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ -valid_ppl=113.22\ -epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file -- Gitee From ba7b67176ac2e6e28b20344f20cf479474fb4901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 25 Aug 2022 01:31:33 +0000 Subject: [PATCH 12/27] motify README.md MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md new file mode 100644 index 000000000..f4d35a09c --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -0,0 +1,212 @@ +### 基本信息 +#### 发布者(Publisher):Huawei +#### 应用领域(Application Domain):NLP +#### 修改时间(Modified) :2018. +#### 框架(Framework):TensorFlow 1.15.0 +#### 模型格式(Model Format):ckpt +#### 精度(Precision):Mixed +#### 处理器(Processor):昇腾910 +#### 应用级别(Categories):Research +#### 描述(Description): enas模型用于ptb数据集的神经网络结构搜索 + +### 概述 +#### enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. + +- #### 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +- #### 参考代码:[enas](https://github.com/melodyguan/enas) + +### 默认配置 +#### 数据预处理 + + - #### 输入数据为文本 + - #### 文本输入格式: id [int] +#### 训练超参数 + - #### search + - #### controller baseline decay : 0.999 + - #### controller entropy weight : 1e-5 + - #### controller temperature : 5 + - #### controller learning rate : 5e-5 + - #### controller num layers : 9 + - #### controller hidden size : 64 + - #### controller num functions : 4 + - #### child batch size : 128 + - #### child bptt steps : 35 + - #### num train epochs : 600 + - #### test + + - #### child grad bound : 0.25 + - #### child weight decay : 2e-6 + - #### child num train epochs :3000 + - #### child hidden size : 800 + - #### learning_rate : 20. + +### 支持特性 + +| 特性列表 | 是否支持 | +|------|------| +| 混合精度 | 是 | + +### 混合精度训练 + +#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 + +### 快速上手 +#### 模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 + +### 代码结构文件 + +|— search.py 搜索模型代码\ +|— child.py 子图模型代码\ +|— fixed.py 架构验证模型代码\ +|— fixed_lib.py\ +|— data_utils.py 数据处理代码\ +|— controller.py 性能评估模型代码\ +|— boot_modelarts.py 模型运行代码\ +|— ... 
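
The snippet below is an editorial sketch, not part of the patch set: it shows how the `ptb/ptb.pkl` file written by `process.py` can be inspected before launching `search.py` or `fixed.py`. The relative path is an assumption; substitute the location you pass via `--data_path`.

```python
# Sketch: load the pickle produced by process.py and report basic statistics.
# process.py dumps the tuple (x_train, x_valid, x_test, vocab, index), and
# search.py unpacks it the same way before building the graphs.
import pickle
import numpy as np

with open('ptb/ptb.pkl', 'rb') as f:  # hypothetical path; use your --data_path
    x_train, x_valid, x_test, vocab, index = pickle.load(f)

print('train tokens: {}'.format(np.size(x_train)))  # np.int32 token ids
print('valid tokens: {}'.format(np.size(x_valid)))
print('test tokens:  {}'.format(np.size(x_test)))
print('vocab size:   {}'.format(len(vocab)))  # vocab: word -> id, index: id -> word
```
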
+ +### 脚本参数 +#### + +- search:\ +--data_path\ +--output_dir\ +--obs_dir +- test:\ +--data_path\ +--output_dir\ +--fixed_arc\ +--ckp_path + + + +### 训练过程 +在论文的参数设置下,GPU训练精度和速度可以达到要求; +NPU的训练精度和速度还未达标。 + +- #### GPU +#### search +epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ +valid_ppl=1800.73\ +epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ +valid_ppl=892.87\ +epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ +valid_ppl=843.70\ +epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ +valid_ppl=898.45\ +epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ +valid_ppl=774.25\ +epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ +valid_ppl=622.82\ +epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ +valid_ppl=606.77\ +epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ +valid_ppl=579.69\ +epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ +valid_ppl=520.63\ +epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ +...\ +valid_ppl=162.39\ +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 +#### test +epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ +epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ +valid_ppl=463.03\ +epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ +epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ +valid_ppl=339.76\ +epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ +epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ +valid_ppl=271.71\ +epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ +epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ +valid_ppl=245.00\ +epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ +epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ +valid_ppl=213.10\ +epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ +epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ +epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ +valid_ppl=209.90\ +epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ +epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ +valid_ppl=181.99\ +epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ +epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ +valid_ppl=176.79\ +epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ +epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ +valid_ppl=166.62\ +...\ +epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ +epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ +valid_ppl=61.17\ +epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ +epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ +epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ +valid_ppl=61.17\ +epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ +...\ +step=70000 test_ppl=59.15\ +step=71000 test_ppl=59.03\ +step=72000 test_ppl=59.06\ +step=73000 test_ppl=58.41\ +step=74000 test_ppl=58.24\ +step=75000 test_ppl=58.12\ +step=76000 test_ppl=58.15\ +step=77000 test_ppl=58.29\ +step=78000 test_ppl=58.36\ +step=79000 test_ppl=58.50\ +step=80000 test_ppl=58.43\ +step=81000 
test_ppl=58.72\ +step=82000 test_ppl=58.52\ +step=82429 test_ppl=58.64 + +- #### NPU +#### test +epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ +epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ +valid_ppl=389.49\ +epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ +epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ +valid_ppl=298.25\ +epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ +epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ +epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ +valid_ppl=236.61\ +epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ +epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ +valid_ppl=252.75\ +epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ +epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ +valid_ppl=197.03\ +epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ +epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ +epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ +valid_ppl=191.64\ +epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ +epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ +valid_ppl=200.02\ +epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ +epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ +valid_ppl=201.46\ +epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ +epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ +epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ +valid_ppl=175.82\ +epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ +epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ +valid_ppl=209.94\ +epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ +epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ +epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ +valid_ppl=167.50\ +...\ +valid_ppl=112.40\ +epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ +epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ +valid_ppl=113.40\ +epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ +epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ +epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ +valid_ppl=113.22\ +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file -- Gitee From f01e6eb5b252dcf5c9f3424e0052c8cacd4fa914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:21:50 +0000 Subject: [PATCH 13/27] update README.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index f4d35a09c..0fb741ac8 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -209,4 +209,14 @@ epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2 epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ valid_ppl=113.22\ -epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 + +### 执行结果打屏信息 +#### GPU +Final Test output : test_ppl=58.64 +#### NPU +Final Test output : test_ppl:82.80 +### 数据集OBS链接 +obs://rstg/Dataset/ptb +### 执行任务OBS链接 +obs://rstg/workplace_ENAS/lm-test/MA-new-enas-06-15-23-29_test/ \ No newline at end of file -- Gitee From 4e54675af400685bd29379a7f2f7f3ff92bdb187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:25:47 +0000 Subject: [PATCH 14/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../enas/ENAS_ID2053_for_TensorFlow/README.md | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index 0fb741ac8..8de8c6607 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -10,35 +10,35 @@ #### 描述(Description): enas模型用于ptb数据集的神经网络结构搜索 ### 概述 -#### enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. +enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. 
-- #### 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) -- #### 参考代码:[enas](https://github.com/melodyguan/enas) +#### 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +#### 参考代码:[enas](https://github.com/melodyguan/enas) ### 默认配置 #### 数据预处理 - - #### 输入数据为文本 - - #### 文本输入格式: id [int] + - 输入数据为文本 + - 文本输入格式: id [int] #### 训练超参数 - #### search - - #### controller baseline decay : 0.999 - - #### controller entropy weight : 1e-5 - - #### controller temperature : 5 - - #### controller learning rate : 5e-5 - - #### controller num layers : 9 - - #### controller hidden size : 64 - - #### controller num functions : 4 - - #### child batch size : 128 - - #### child bptt steps : 35 - - #### num train epochs : 600 + - controller baseline decay : 0.999 + - controller entropy weight : 1e-5 + - controller temperature : 5 + - controller learning rate : 5e-5 + - controller num layers : 9 + - controller hidden size : 64 + - controller num functions : 4 + - child batch size : 128 + - child bptt steps : 35 + - num train epochs : 600 - #### test - - #### child grad bound : 0.25 - - #### child weight decay : 2e-6 - - #### child num train epochs :3000 - - #### child hidden size : 800 - - #### learning_rate : 20. + - child grad bound : 0.25 + - child weight decay : 2e-6 + - child num train epochs :3000 + - child hidden size : 800 + - learning_rate : 20. ### 支持特性 @@ -48,10 +48,10 @@ ### 混合精度训练 -#### 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 +昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 ### 快速上手 -#### 模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 +模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 ### 代码结构文件 @@ -65,7 +65,6 @@ |— ... ### 脚本参数 -#### - search:\ --data_path\ @@ -84,7 +83,7 @@ NPU的训练精度和速度还未达标。 - #### GPU -#### search + - #### search epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ valid_ppl=1800.73\ epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ @@ -107,7 +106,7 @@ epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 ...\ valid_ppl=162.39\ epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 -#### test + - #### test epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ valid_ppl=463.03\ @@ -162,7 +161,7 @@ step=82000 test_ppl=58.52\ step=82429 test_ppl=58.64 - #### NPU -#### test + - #### test epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ valid_ppl=389.49\ -- Gitee From 5df16635ed460fc9adf5868f2f1d7c9d21cdc2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:29:46 +0000 Subject: [PATCH 15/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index 8de8c6607..787c42fd5 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -21,7 +21,7 @@ enas是一个快速高效的神经网络架构搜索的方法,使用了子图 - 输入数据为文本 - 文本输入格式: id [int] #### 训练超参数 - - #### search + - ##### search - controller baseline decay : 0.999 - controller entropy weight : 1e-5 - controller temperature : 5 @@ -32,7 +32,7 @@ enas是一个快速高效的神经网络架构搜索的方法,使用了子图 - child batch size : 128 - child bptt steps : 35 - num train epochs : 600 - - #### test + - ##### test - child grad bound : 0.25 - child weight decay : 2e-6 @@ -83,7 +83,7 @@ enas是一个快速高效的神经网络架构搜索的方法,使用了子图 NPU的训练精度和速度还未达标。 - #### GPU - - #### search +- ##### search epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ valid_ppl=1800.73\ epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ @@ -105,8 +105,8 @@ valid_ppl=520.63\ epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ ...\ valid_ppl=162.39\ -epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 - - #### test +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70\ +- ##### test epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ valid_ppl=463.03\ @@ -161,7 +161,7 @@ step=82000 test_ppl=58.52\ step=82429 test_ppl=58.64 - #### NPU - - #### test + - ##### test epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ valid_ppl=389.49\ -- Gitee From b06aa512529ca9bf1f87430d45d62db1b317156a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:31:50 +0000 Subject: [PATCH 16/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index 787c42fd5..95c8c0bff 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -83,7 +83,7 @@ enas是一个快速高效的神经网络架构搜索的方法,使用了子图 NPU的训练精度和速度还未达标。 - #### GPU -- ##### search + - ##### search epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ valid_ppl=1800.73\ epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ @@ -106,7 +106,7 @@ epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 ...\ valid_ppl=162.39\ epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70\ -- ##### test + - ##### test epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ valid_ppl=463.03\ -- Gitee From a3806213c30866de2f13ffa36cf29af17f61c0d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:33:19 +0000 Subject: [PATCH 17/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index 95c8c0bff..b767d1ed9 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -84,6 +84,7 @@ NPU的训练精度和速度还未达标。 - #### GPU - ##### search +###### epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ valid_ppl=1800.73\ epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ @@ -105,7 +106,7 @@ valid_ppl=520.63\ epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ ...\ valid_ppl=162.39\ -epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70\ +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 - ##### test epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ -- Gitee From 2e8aab1d409967d39279670bf8ab22f3fd2a99d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Mon, 29 Aug 2022 13:34:37 +0000 Subject: [PATCH 18/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index b767d1ed9..aff6aea1f 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -84,7 +84,6 @@ NPU的训练精度和速度还未达标。 - #### GPU - ##### search -###### epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ valid_ppl=1800.73\ epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ @@ -107,7 +106,7 @@ epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 ...\ valid_ppl=162.39\ epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 - - ##### test +- ##### test epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ valid_ppl=463.03\ -- Gitee From f9e6a0af7dcc7edac523bc17405df8e7e88aa11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 08:54:33 +0000 Subject: [PATCH 19/27] update contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md index aff6aea1f..051fc7c8f 100644 --- a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ b/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md @@ -208,14 +208,4 @@ epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2 epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ valid_ppl=113.22\ -epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 - -### 执行结果打屏信息 -#### GPU -Final Test output : test_ppl=58.64 -#### NPU -Final Test output : test_ppl:82.80 -### 数据集OBS链接 -obs://rstg/Dataset/ptb -### 执行任务OBS链接 -obs://rstg/workplace_ENAS/lm-test/MA-new-enas-06-15-23-29_test/ \ No newline at end of file +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file -- Gitee From 7653fd69ffbb69035264a62f5dba9001f77df0cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 09:16:23 +0000 Subject: [PATCH 20/27] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=20contrib=20?= =?UTF-8?q?=E4=B8=BA=20contrib=5Fold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- {contrib => contrib_old}/.keep | 0 {contrib => contrib_old}/TensorFlow/.keep | 0 {contrib => contrib_old}/TensorFlow/Research/.keep | 0 {contrib => contrib_old}/TensorFlow/Research/nlp/.keep | 0 {contrib => contrib_old}/TensorFlow/Research/nlp/enas/.keep | 0 .../TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep | 0 
.../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py | 0 .../nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py | 0 .../nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh | 0 .../Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py | 0 24 files changed, 0 insertions(+), 0 deletions(-) rename {contrib => contrib_old}/.keep (100%) rename {contrib => contrib_old}/TensorFlow/.keep (100%) rename {contrib => contrib_old}/TensorFlow/Research/.keep (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/.keep (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/.keep (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh (100%) rename {contrib => contrib_old}/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py (100%) diff --git a/contrib/.keep 
b/contrib_old/.keep similarity index 100% rename from contrib/.keep rename to contrib_old/.keep diff --git a/contrib/TensorFlow/.keep b/contrib_old/TensorFlow/.keep similarity index 100% rename from contrib/TensorFlow/.keep rename to contrib_old/TensorFlow/.keep diff --git a/contrib/TensorFlow/Research/.keep b/contrib_old/TensorFlow/Research/.keep similarity index 100% rename from contrib/TensorFlow/Research/.keep rename to contrib_old/TensorFlow/Research/.keep diff --git a/contrib/TensorFlow/Research/nlp/.keep b/contrib_old/TensorFlow/Research/nlp/.keep similarity index 100% rename from contrib/TensorFlow/Research/nlp/.keep rename to contrib_old/TensorFlow/Research/nlp/.keep diff --git a/contrib/TensorFlow/Research/nlp/enas/.keep b/contrib_old/TensorFlow/Research/nlp/enas/.keep similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/.keep rename to contrib_old/TensorFlow/Research/nlp/enas/.keep diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py diff --git 
a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh similarity index 100% rename from 
contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh diff --git a/contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py similarity index 100% rename from contrib/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py rename to contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py -- Gitee From 0260db4c15fbb82c79616459a999fc0e35a9fd32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 09:16:34 +0000 Subject: [PATCH 21/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20contrib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/.keep diff --git a/contrib/.keep b/contrib/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 7192a1d8593e837039bba5b4eca8233c2614cfa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 09:59:50 +0000 Subject: [PATCH 22/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20TensorFlow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/.keep diff --git a/contrib/TensorFlow/.keep b/contrib/TensorFlow/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From 280bc54ea360d3f4996327e891800600a339221a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 10:00:40 +0000 Subject: [PATCH 23/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20Research?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/Research/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/.keep diff --git a/contrib/TensorFlow/Research/.keep b/contrib/TensorFlow/Research/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From ce58dcfb3a451f8792e4505de677323cb81fed91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 10:00:51 +0000 Subject: [PATCH 24/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20nlp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib/TensorFlow/Research/nlp/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/nlp/.keep diff --git a/contrib/TensorFlow/Research/nlp/.keep b/contrib/TensorFlow/Research/nlp/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From cbf004fe614e365a6f66cbb300abd34f2a51ad93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 10:03:14 +0000 Subject: [PATCH 25/27] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20ENAS=5FID2053=5Ffor?= =?UTF-8?q?=5FTensorFlow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From d24b5428a5a5bb8ac676cb365eed4d68c2bcdcbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 10:18:53 +0000 Subject: [PATCH 26/27] add source code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 王培亮 --- .../nlp/ENAS_ID2053_for_TensorFlow/README.md | 211 ++++++ .../ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py | 36 + .../nlp/ENAS_ID2053_for_TensorFlow/bash.py | 4 + .../boot_modelarts.py | 73 ++ .../nlp/ENAS_ID2053_for_TensorFlow/child.py | 440 ++++++++++++ .../nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py | 81 +++ .../ENAS_ID2053_for_TensorFlow/controller.py | 250 +++++++ .../ENAS_ID2053_for_TensorFlow/data_utils.py | 125 ++++ .../nlp/ENAS_ID2053_for_TensorFlow/fixed.py | 318 +++++++++ .../ENAS_ID2053_for_TensorFlow/fixed_lib.py | 652 ++++++++++++++++++ .../help_modelarts.py | 93 +++ .../nlp/ENAS_ID2053_for_TensorFlow/lstm.py | 174 +++++ .../ENAS_ID2053_for_TensorFlow/lstm_lib.py | 458 ++++++++++++ .../nlp/ENAS_ID2053_for_TensorFlow/process.py | 72 ++ .../nlp/ENAS_ID2053_for_TensorFlow/search.py | 288 ++++++++ .../nlp/ENAS_ID2053_for_TensorFlow/search.sh | 36 + .../ENAS_ID2053_for_TensorFlow/test-npu.sh | 45 ++ .../nlp/ENAS_ID2053_for_TensorFlow/utils.py | 67 ++ 18 files changed, 3423 insertions(+) create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh create mode 100644 contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md new file mode 100644 index 000000000..051fc7c8f --- /dev/null +++ 
b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md @@ -0,0 +1,211 @@ +### 基本信息 +#### 发布者(Publisher):Huawei +#### 应用领域(Application Domain):NLP +#### 修改时间(Modified) :2018. +#### 框架(Framework):TensorFlow 1.15.0 +#### 模型格式(Model Format):ckpt +#### 精度(Precision):Mixed +#### 处理器(Processor):昇腾910 +#### 应用级别(Categories):Research +#### 描述(Description): enas模型用于ptb数据集的神经网络结构搜索 + +### 概述 +enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. + +#### 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) +#### 参考代码:[enas](https://github.com/melodyguan/enas) + +### 默认配置 +#### 数据预处理 + + - 输入数据为文本 + - 文本输入格式: id [int] +#### 训练超参数 + - ##### search + - controller baseline decay : 0.999 + - controller entropy weight : 1e-5 + - controller temperature : 5 + - controller learning rate : 5e-5 + - controller num layers : 9 + - controller hidden size : 64 + - controller num functions : 4 + - child batch size : 128 + - child bptt steps : 35 + - num train epochs : 600 + - ##### test + + - child grad bound : 0.25 + - child weight decay : 2e-6 + - child num train epochs :3000 + - child hidden size : 800 + - learning_rate : 20. + +### 支持特性 + +| 特性列表 | 是否支持 | +|------|------| +| 混合精度 | 是 | + +### 混合精度训练 + +昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 + +### 快速上手 +模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 + +### 代码结构文件 + +|— search.py 搜索模型代码\ +|— child.py 子图模型代码\ +|— fixed.py 架构验证模型代码\ +|— fixed_lib.py\ +|— data_utils.py 数据处理代码\ +|— controller.py 性能评估模型代码\ +|— boot_modelarts.py 模型运行代码\ +|— ... + +### 脚本参数 + +- search:\ +--data_path\ +--output_dir\ +--obs_dir +- test:\ +--data_path\ +--output_dir\ +--fixed_arc\ +--ckp_path + + + +### 训练过程 +在论文的参数设置下,GPU训练精度和速度可以达到要求; +NPU的训练精度和速度还未达标。 + +- #### GPU + - ##### search +epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ +valid_ppl=1800.73\ +epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ +valid_ppl=892.87\ +epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ +valid_ppl=843.70\ +epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ +valid_ppl=898.45\ +epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ +valid_ppl=774.25\ +epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ +valid_ppl=622.82\ +epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ +valid_ppl=606.77\ +epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ +valid_ppl=579.69\ +epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ +valid_ppl=520.63\ +epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ +...\ +valid_ppl=162.39\ +epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 +- ##### test +epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ +epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ +valid_ppl=463.03\ +epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ +epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ +valid_ppl=339.76\ +epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ +epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ +valid_ppl=271.71\ +epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ +epoch=3 step=1600 
ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ +valid_ppl=245.00\ +epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ +epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ +valid_ppl=213.10\ +epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ +epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ +epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ +valid_ppl=209.90\ +epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\ +epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ +valid_ppl=181.99\ +epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ +epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ +valid_ppl=176.79\ +epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ +epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ +valid_ppl=166.62\ +...\ +epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ +epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ +valid_ppl=61.17\ +epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ +epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ +epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ +valid_ppl=61.17\ +epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ +...\ +step=70000 test_ppl=59.15\ +step=71000 test_ppl=59.03\ +step=72000 test_ppl=59.06\ +step=73000 test_ppl=58.41\ +step=74000 test_ppl=58.24\ +step=75000 test_ppl=58.12\ +step=76000 test_ppl=58.15\ +step=77000 test_ppl=58.29\ +step=78000 test_ppl=58.36\ +step=79000 test_ppl=58.50\ +step=80000 test_ppl=58.43\ +step=81000 test_ppl=58.72\ +step=82000 test_ppl=58.52\ +step=82429 test_ppl=58.64 + +- #### NPU + - ##### test +epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ +epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ +valid_ppl=389.49\ +epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ +epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ +valid_ppl=298.25\ +epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ +epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ +epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ +valid_ppl=236.61\ +epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ +epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ +valid_ppl=252.75\ +epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ +epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ +valid_ppl=197.03\ +epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ +epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ +epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ +valid_ppl=191.64\ +epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ +epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ +valid_ppl=200.02\ +epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ +epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 
mins=79.25-min/step=0.0205\ +valid_ppl=201.46\ +epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ +epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ +epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ +valid_ppl=175.82\ +epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ +epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ +valid_ppl=209.94\ +epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ +epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ +epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ +valid_ppl=167.50\ +...\ +valid_ppl=112.40\ +epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ +epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ +valid_ppl=113.40\ +epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ +epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ +epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ +valid_ppl=113.22\ +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py new file mode 100644 index 000000000..4291e14e4 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py @@ -0,0 +1,36 @@ +from npu_bridge.npu_init import * +import os +import sys, getopt + + +def main(argv): + # print(argv) + # argv_ = ['-t', 'search'] + runType = "" + try: + opts, args = getopt.getopt(argv, "ht:", ["trun="]) + except getopt.GetoptError: + print("getopt.GetoptError!!") + print("useage: (sudo) python(3) pythonFileName.py -t ") + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print("useage: pythonFileName.py -t ") + sys.exit() + elif opt in ("-t", "--trun"): + runType = arg + if runType == "search": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/search --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + elif runType == "test-npu": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/test --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + # os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc = '0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=$(pwd)/output/test --data_path=$(pwd)/ptb/ptb.pkl") + # print("this part is writing...") + # pass + else: + print("This runType is invaild!!!") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git 
a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py new file mode 100644 index 000000000..eaf741434 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py @@ -0,0 +1,4 @@ +from npu_bridge.npu_init import * +import os + +os.system("bash /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.sh") \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py new file mode 100644 index 000000000..d65dfdded --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This is the boot file for ModelArts platform. +Firstly, the train datasets are copyed from obs to ModelArts. +Then, the string of train shell command is concated and using 'os.system()' to execute +""" +import os +import time +import numpy as np +import argparse +from help_modelarts import obs_data2modelarts +# import moxing as mox +print(os.system('env')) +print(os.system("python3 --version")) +#print(os.system("pip install dlib")) +print("===>>>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh") +os.environ['ASCEND_GLOBAL_LOG_LEVEL'] = '4' + +#data_dir = "/root/.keras/models/" +if __name__ == '__main__': + ## Note: the code dir is not the same as work dir on ModelArts Platform!!! + code_dir = '.' 
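+    # Descriptive note on the launch flow below: the script parses the ModelArts-style
+    # arguments (--train_url, --data_url, --ckp_path), concatenates them after the path of
+    # test-npu.sh as positional arguments, and runs the resulting command via os.system().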
+ work_dir = os.getcwd() + print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir)) + output_path = "./output/test/" + str(time.strftime('%Y%m%d_%H%M%S')) + parser = argparse.ArgumentParser() + parser.add_argument("--train_url", type=str, default=output_path) + parser.add_argument("--data_url", type=str, default="./ptb") + parser.add_argument("--ckp_path", type=str, default="./output/test/20220715_182127/") + # parser.add_argument("--ckp_path", type=str, default="obs://rstg/workplace_ENAS/lm-train/MA-new-enas-05-23-19-34/output/result/") + # parser.add_argument("--modelarts_data_dir", type=str, default="/cache/ptb-dataset") + # parser.add_argument("--modelarts_result_dir", type=str, default="/cache/result") + config = parser.parse_args() + #if not os.path.exists(data_dir): + # os.makedirs(data_dir) + # print("=nvvvvvvvvvvvvvfdsfdsfdvnn") + + #os.system("pip install -i http://repo.myhuaweicloud.com/repository/pypi/simple pexpect==4.2.1") + #os.system("pip install torch") + #os.system("pip install absl-py") + print("--------config---------hhhhhhhhhhhggggggggggggggggkkkkkkkkkkkkkkkkkkkkkkkkkgg-") + for k in list(vars(config).keys()): + print("key:{}: value:{}".format(k, vars(config)[k])) + print("--------config----------") + + ## copy dataset from obs to modelarts + # obs_data2modelarts(config) + # ret = mox.file.exists('obs://rstg/MA-new-p/') + # retm = mox.file.make_dirs('obs://rstg/MA-new-p/') + # print("bbbbbbbbbbbbbbbbbbbbbbbbb ",retm) + # print("config.modelarts_result_dir ", config.modelarts_result_dir) + ## start to train on Modelarts platform + # if not os.path.exists(config.modelarts_result_dir): + # os.makedirs(config.modelarts_result_dir) + # print("6666666666666666666666666666666666666666 ", config.modelarts_result_dir) + bash_header = os.path.join(code_dir, 'test-npu.sh') + # bash_header = os.path.join(code_dir, 'search.sh') + arg_url = '%s %s %s %s' % (code_dir, config.data_url, config.train_url, config.ckp_path) + bash_command = 'bash %s %s' % (bash_header, arg_url) + print("bash command:", bash_command) + os.system(bash_command) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py new file mode 100644 index 000000000..09b6d878d --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py @@ -0,0 +1,440 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + + +import numpy as np +import tensorflow.compat.v1 as tf +import tensorflow.keras as keras + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + + +flags = tf.app.flags +FLAGS = flags.FLAGS + + +flags.DEFINE_integer('child_batch_size', 128, '') +flags.DEFINE_integer('child_bptt_steps', 35, '') +flags.DEFINE_integer('num_train_epochs', 600, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(sample_arc, x, prev_s, w_prev, w_skip, input_mask, layer_mask, + params): + """Multi-layer LSTM. + + Args: + sample_arc: [num_layers * 2], sequence of tokens representing architecture. + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: `[batch_size, hidden_size]`. + layer_mask: `[batch_size, hidden_size]`. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. + """ + batch_size = params.batch_size + # num_steps = 35 + num_steps = tf.shape(x)[1] + print("num_steps:{}/{}".format(num_steps, num_steps)) + + num_layers = len(sample_arc) // 2 + set_shape = x.get_shape().as_list() + print("set_shape:{}".format(set_shape)) + # 修改点 + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=True) + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + # extract the relevant variables, so that you only do L2-reg on them. 
+ u_skip = [] + start_idx = 0 + + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + u_skip.append(w_skip[layer_id][func_idx, prev_idx]) + start_idx += 2 + w_skip = u_skip + var_s = [w_prev] + w_skip[1:] + + def _select_function(h, function_id): + h = tf.stack([tf.tanh(h), tf.nn.relu(h), tf.sigmoid(h), h], axis=0) + h = h[function_id] + return h + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s): + """Body function.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + + # important change: first input uses a tanh() + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul(tf.concat([inp * input_mask, prev_s * layer_mask], + axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("ht:{}".format(ht)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + # print("layer:{}".format(layers)) + + start_idx = 0 + used = [] + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + # print("layer_id/[prev_idx, func_idx]:{}/[{}, {}]".format(layer_id, prev_idx, func_idx)) + used.append(tf.one_hot(prev_idx, depth=num_layers, dtype=tf.int32)) + prev_s = tf.stack(layers, axis=0)[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("s before set_shape:{}".format(s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("step:{}\nnext_s:{}".format(step, next_s)) + # all_s = all_s.write(step, next_s) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s = tf.concat([all_s, t], 0) + # print("step:{}-all_s:{}".format(step, all_s)) + # all_s_my[step] = next_s + + return step + 1, next_s, all_s + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # all_s_my.set_shape([_, batch_size, params.hidden_size]) + # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s, var_s + + +def _set_default_params(params): + """Set default hyper-parameters.""" + params.add_hparam('alpha', 0.0) # activation L2 reg + params.add_hparam('beta', 1.) 
# activation slowness reg + params.add_hparam('best_valid_ppl_threshold', 5) + + params.add_hparam('batch_size', FLAGS.child_batch_size) + params.add_hparam('bptt_steps', FLAGS.child_bptt_steps) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.20) # embeddings + params.add_hparam('drop_x', 0.75) # input to RNN cells + params.add_hparam('drop_l', 0.25) # between layers + params.add_hparam('drop_o', 0.75) # output + params.add_hparam('drop_w', 0.00) # weight + + params.add_hparam('grad_bound', 0.1) + params.add_hparam('hidden_size', 200) + params.add_hparam('init_range', 0.04) + params.add_hparam('learning_rate', 20.) + params.add_hparam('num_train_epochs', FLAGS.num_train_epochs) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 8e-7) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, controller, x_train, x_valid, name='child'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.controller = controller + self.sample_arc = tf.unstack(controller.sample_arc) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer(x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam('num_train_steps', self.num_train_batches * params.num_train_epochs) + # self.x_train.set_shape([params.batch_size, self.base_bptt]) + # print("self.x_train:{}".format(self.x_train.get_shape().as_list())) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer(x_valid, params.batch_size, params.bptt_steps) + # with tf.control_dependencies([self.base_bptt]): + self._build_params() + self._build_train() + self._build_valid() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + num_functions = self.params.controller_num_functions + num_layers = self.params.controller_num_layers + hidden_size = self.params.hidden_size + # >>> add code >>> + with tf.variable_scope(self.name, initializer=initializer): + # >>> add code >>> + with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): + w_emb = tf.get_variable('w', [self.params.vocab_size, hidden_size]) + # >>> add code >>> + # 修改点 + dropped_w_emb = npu_ops.dropout(w_emb, 1-self.params.drop_e) + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + with tf.variable_scope('rnn_cell', reuse=tf.AUTO_REUSE): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + w_skip, dropped_w_skip = [], [] + for layer_id in range(1, num_layers + 1): + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable( + 'w', [num_functions, layer_id, hidden_size, 2 * hidden_size]) + mask = _gen_mask([1, 1, hidden_size, 2 * hidden_size], + self.params.drop_w) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + with tf.variable_scope('init_states', reuse=tf.AUTO_REUSE): + with 
tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + + self.num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) # .value + print('All children have {} params'.format(self.num_params)) + + num_params_per_child = 0 + for v in tf.trainable_variables(): + if v.name.startswith(self.name): + if 'rnn_cell' in v.name: + num_params_per_child += v.shape[-2] * v.shape[-1] + else: + num_params_per_child += np.prod([d for d in v.shape]) + print('Each child has {0} params'.format(num_params_per_child)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + # bug点 + # + print("before [embedding_lookup], x={}".format(x)) + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + sample_arc = self.sample_arc + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + # >>> add code >>> + # 修改点 + # emb = tf.layers.dropout( + # emb, self.params.drop_i, [batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s, var_s = _rnn_fn(sample_arc, emb, prev_s, w_prev, w_skip, + input_mask, layer_mask, params=self.params) + + top_s = all_s + if is_training: + # >>> add code >>> + # 修改点 + + top_s = npu_ops.dropout(top_s, 1-self.params.drop_o) # ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + # top_s = tf.layers.dropout( + # top_s, self.params.drop_o, + # [self.params.batch_size, 1, self.params.hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + top_s_shape = top_s.get_shape().as_list() + # print("top_s_shape:{}".format(top_s_shape)) + # print("w_soft:{}".format(w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # logits = tf.matmul(top_s, tf.transpose(w_soft)) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + # print("loss:{}".format(loss)) + loss = tf.reduce_mean(loss) + # print("_forward/loss:{}".format(loss)) + reg_loss = loss # `loss + regularization_terms` is for training only + if is_training: + # L2 weight reg + self.l2_reg_loss = tf.add_n([tf.nn.l2_loss(w ** 2) for w in var_s]) + reg_loss += self.params.weight_decay * self.l2_reg_loss + + # activation L2 reg + 
reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + # activation slowness reg + reg_loss += self.params.beta * tf.reduce_mean( + (all_s[:, 1:, :] - all_s[:, :-1, :]) ** 2) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = [v for v in tf.trainable_variables() + if v.name.startswith(self.name)] + # print("reg_loss:{}".format(reg_loss)) + # print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + if self.params.grad_bound: + # grads = tf.gradients(reg_loss, tf_vars) + # clipped_grads, _ = tf.clip_by_global_norm(grads, self.params.grad_bound) + # clipped_grads, grad_norm = tf.clip_by_global_norm(grads, self.params.grad_bound) + # print("clipped_grads:{}".format(clipped_grads)) + + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + # print("optimizer:{}".format(optimizer)) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + # print("train_op:{}".format(train_op)) + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + self.rl_loss = loss + + def eval_valid(self, sess): + + """Eval 1 round on valid set.""" + total_loss = 0 + for _ in range(self.num_valid_batches): + sess.run(self.batch_init_states['reset']) + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + + return valid_ppl diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py new file mode 100644 index 000000000..2367ea1f0 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- +import tensorflow.compat.v1 as tf + +# from create_tf_record import * +from tensorflow.python.framework import graph_util +from tensorflow.python.tools import freeze_graph + +from npu_bridge.npu_init import * + +def freeze_graph(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "output" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) # 恢复图并得到数据 + output_graph_def = 
graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=sess.graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in sess.graph.get_operations(): + # print(op.name, op.values()) + + +def freeze_graph2(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "InceptionV3/Logits/SpatialSqueeze" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + graph = tf.get_default_graph() # 获得默认的图 + input_graph_def = graph.as_graph_def() # 返回一个序列化的图代表当前的图 + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) # 恢复图并得到数据 + output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=input_graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in graph.get_operations(): + # print(op.name, op.values()) + + +if __name__ == '__main__': + # 输入ckpt模型路径 + input_checkpoint = './output/test/20220709_185707/model.ckpt-181200' + # 输出pb模型的路径 + out_pb_path = "models_pb/enas-lm-infer2.pb" + # 调用freeze_graph将ckpt转为pb + freeze_graph(input_checkpoint, out_pb_path) + print("Done pb!") + + # 测试pb模型 + image_path = 'test_image/animal.jpg' + # freeze_graph_test(pb_path=out_pb_path, image_path=image_path) + + diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py new file mode 100644 index 000000000..cb13d49ab --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
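+
+# A rough numeric sketch of the policy-gradient (REINFORCE) update implemented
+# below; the concrete values are illustrative only, while 80.0 and 0.999 are the
+# REWARD_CONSTANT and controller_baseline_dec defaults defined in this file:
+#
+#   valid_ppl = 120.0                                # child validation perplexity (example)
+#   reward = 80.0 / valid_ppl                        # REWARD_CONSTANT / ppl
+#   baseline -= (1 - 0.999) * (baseline - reward)    # slow moving average of the reward
+#   loss = sum_log_probs * (reward - baseline)       # sampled log-probs scaled by the advantage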
+ +"""ENAS controller.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_float('controller_baseline_dec', 0.999, '') +flags.DEFINE_float('controller_entropy_weight', 1e-5, '') +flags.DEFINE_float('controller_temperature', 5., '') +flags.DEFINE_float('controller_tanh_constant', 2.25, '') +flags.DEFINE_float('controller_learning_rate', 5e-5, '') +flags.DEFINE_integer('controller_num_layers', 9, '') + +REWARD_CONSTANT = 80.0 + + +def _build_train_op(loss, tf_vars, learning_rate, train_step, num_aggregate): + """Build training ops from `loss` tensor.""" + optim = tf.train.AdamOptimizer(learning_rate) + optim = tf.train.SyncReplicasOptimizer( + optim, replicas_to_aggregate=num_aggregate, total_num_replicas=1, use_locking=True) + grads = tf.gradients(loss, tf_vars) + train_op = optim.apply_gradients(zip(grads, tf_vars), global_step=train_step) + grad_norm = tf.global_norm(grads) + return train_op, optim, grad_norm + + +def _lstm(x, prev_c, prev_h, w_lstm): + """LSTM subgraph.""" + ifog = tf.matmul(tf.concat([x, prev_h], axis=1), w_lstm) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + next_c = i * g + f * prev_c + next_h = o * tf.tanh(next_c) + return next_c, next_h + + +def _set_default_params(params): + """Add controller's default params.""" + params.add_hparam('controller_hidden_size', 64) + params.add_hparam('controller_num_layers', FLAGS.controller_num_layers) + params.add_hparam('controller_num_functions', 4) # tanh, relu, sigmoid, iden + + params.add_hparam('controller_baseline_dec', FLAGS.controller_baseline_dec) + params.add_hparam('controller_entropy_weight', + FLAGS.controller_entropy_weight) + params.add_hparam('controller_temperature', FLAGS.controller_temperature) + params.add_hparam('controller_tanh_constant', FLAGS.controller_tanh_constant) + params.add_hparam('controller_learning_rate', FLAGS.controller_learning_rate) + params.add_hparam('controller_num_aggregate', 10) + params.add_hparam('controller_num_train_steps', 25) + + return params + + +class Controller(object): + """ENAS controller. 
Samples architectures and creates training ops.""" + + def __init__(self, params, name='controller'): + print('-' * 80) + print('Create a controller') + self.params = _set_default_params(params) + self.name = name + self._build_params() + self._build_sampler() + + def _build_params(self): + """Create TF parameters.""" + initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) + num_funcs = self.params.controller_num_functions # 4 + hidden_size = self.params.controller_hidden_size # 64 + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('lstm'): + self.w_lstm = tf.get_variable('w', [2 * hidden_size, 4 * hidden_size]) + + with tf.variable_scope('embedding'): + self.g_emb = tf.get_variable('g', [1, hidden_size]) + self.w_emb = tf.get_variable('w', [num_funcs, hidden_size]) + + with tf.variable_scope('attention'): + self.attn_w_1 = tf.get_variable('w_1', [hidden_size, hidden_size]) + self.attn_w_2 = tf.get_variable('w_2', [hidden_size, hidden_size]) + self.attn_v = tf.get_variable('v', [hidden_size, 1]) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) + print('Controller has {0} params'.format(num_params)) + + def _build_sampler(self): + """Build the sampler ops and the log_prob ops.""" + hidden_size = self.params.controller_hidden_size + num_layers = self.params.controller_num_layers + + arc_seq = [] + sample_log_probs = [] + sample_entropy = [] + all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)] + all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)] + + # sampler ops + inputs = self.g_emb # ??? + prev_c = tf.zeros([1, hidden_size], dtype=tf.float32) + prev_h = tf.zeros([1, hidden_size], dtype=tf.float32) + + inputs = self.g_emb + for layer_id in range(1, num_layers + 1): + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + all_h.append(next_h) + all_h_w.append(tf.matmul(next_h, self.attn_w_1)) + + query = tf.matmul(next_h, self.attn_w_2) + query = query + tf.concat(all_h_w[:-1], axis=0) + query = tf.tanh(query) + logits = tf.matmul(query, self.attn_v) + logits = tf.reshape(logits, [1, layer_id]) + + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + diff = tf.cast(layer_id - tf.range(0, layer_id), tf.float32) ** 2 + logits -= tf.reshape(diff, [1, layer_id]) / 6.0 + skip_index = tf.random.categorical(logits, 1) + skip_index = tf.cast(skip_index, tf.int32) + skip_index = tf.reshape(skip_index, [1]) + arc_seq.append(skip_index) + + log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=skip_index) + sample_log_probs.append(log_prob) + + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + + inputs = tf.nn.embedding_lookup( + tf.concat(all_h[:-1], axis=0), skip_index) + inputs /= (0.1 + tf.to_float(layer_id - skip_index)) + + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + logits = tf.matmul(next_h, self.w_emb, transpose_b=True) + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + func = tf.multinomial(logits, 1) + func = tf.to_int32(func) + func = tf.reshape(func, [1]) + arc_seq.append(func) + log_prob = 
tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=func) + sample_log_probs.append(log_prob) + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + inputs = tf.nn.embedding_lookup(self.w_emb, func) + + arc_seq = tf.concat(arc_seq, axis=0) + self.sample_arc = arc_seq + + self.sample_log_probs = tf.concat(sample_log_probs, axis=0) + self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs)) + + sample_entropy = tf.concat(sample_entropy, axis=0) + self.sample_entropy = tf.reduce_sum(sample_entropy) + + self.all_h = all_h + + def build_trainer(self, child_model): + """Build the train ops by connecting Controller with a Child.""" + # actor + self.valid_loss = tf.to_float(child_model.rl_loss) + self.valid_loss = tf.stop_gradient(self.valid_loss) + self.valid_ppl = tf.exp(self.valid_loss) + self.reward = REWARD_CONSTANT / self.valid_ppl + + if self.params.controller_entropy_weight: + self.reward += self.params.controller_entropy_weight * self.sample_entropy + + # or baseline + self.sample_log_probs = tf.reduce_sum(self.sample_log_probs) + self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) + baseline_update = tf.assign_sub(self.baseline, + ((1 - self.params.controller_baseline_dec) * + (self.baseline - self.reward))) + + with tf.control_dependencies([baseline_update]): + self.reward = tf.identity(self.reward) + self.loss = self.sample_log_probs * (self.reward - self.baseline) + + self.train_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name='train_step') + tf_vars = [var for var in tf.trainable_variables() + if var.name.startswith(self.name)] + + self.train_op, self.optimizer, self.grad_norm = _build_train_op( + loss=self.loss, + tf_vars=tf_vars, + learning_rate=self.params.controller_learning_rate, + train_step=self.train_step, + num_aggregate=self.params.controller_num_aggregate) + + def train(self, sess, reset_op, log_every=10): + """Train the controller for `num_steps`.""" + print('-' * 80) + print('Training controller') + num_steps = (self.params.controller_num_aggregate * + self.params.controller_num_train_steps) + run_ops = [self.sample_arc, + self.sample_entropy, + self.reward, + self.baseline, + self.train_op] + + for step in range(num_steps): + arc, ent, reward, baseline, _ = sess.run(run_ops) + sess.run(reset_op) + if step % log_every == 0: + log_string = 'step={0:<5d}'.format(step) + log_string += ' ent={0:<7.3f}'.format(ent) + log_string += ' ppl={0:<7.2f}'.format(REWARD_CONSTANT / reward) + log_string += ' rw={0:<7.4f}'.format(reward) + log_string += ' bl={0:<7.4f}'.format(baseline) + log_string += ' arc=[{0}]'.format(' '.join([str(v) for v in arc])) + print(log_string) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py new file mode 100644 index 000000000..6d767073c --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Load picked Penn Treebank data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + + +def input_producer(raw_data, batch_size, num_steps, shuffle=False, + randomize=False, random_len=False): + """Produces graph-based input for Penn Treebank. + + Args: + raw_data: np tensor of size [num_words]. + batch_size: self-explained. + num_steps: number of BPTT steps. + shuffle: whether to shuffle sentences. + randomize: use random segments instead of the continuous corpus. + random_len: random sequence len. + + Returns: + If `random_len` is set, return op that represents whether we have reached + the end of a sequence. + Otherwise, return number of batches in an epoch. + """ + print("raw_data_size:{}".format(np.size(raw_data))) + print("num_steps:{}".format(num_steps)) + batch_len = np.size(raw_data) // batch_size + num_batches_per_epoch = ((np.size(raw_data) // batch_size) - 1) // num_steps + print("num_batches_per_epoch:{}".format(num_batches_per_epoch)) + raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32) + + # data_len = tf.size(raw_data) + + + print("batch_len:{}".format(batch_len)) + data = tf.reshape(raw_data[0: batch_size * batch_len], + [batch_size, batch_len]) + + epoch_size = (batch_len - 1) // num_steps + with tf.device('/cpu:0'): + epoch_size = tf.identity(epoch_size, name='epoch_size') + + if random_len: + start_idx = tf.Variable(0, name='start_idx', dtype=tf.int32,trainable=False) + # start_idx = 0 + base_bptt = tf.cond( + tf.random_uniform(shape=(), minval=0., maxval=1.) < 0.95, + lambda: tf.cast(num_steps, dtype=tf.float32), + lambda: tf.cast(num_steps, dtype=tf.float32) / 2.) 
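+    # Roughly, the length sampling above and below amounts to (illustrative
+    # NumPy-style sketch, not executed):
+    #   base_bptt = num_steps if np.random.rand() < 0.95 else num_steps / 2
+    #   seq_len   = int(np.random.normal(base_bptt, 5))
+    #   seq_len   = min(seq_len, num_steps + 20, batch_len - start_idx - 1)
+    # so most windows are about num_steps tokens long, with occasional shorter or
+    # slightly longer ones, as in AWD-style variable-length BPTT training.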
+ # base_bptt = int(tf.cond( + # tf.greater_equal(0.95, np.random.uniform(100)/100), + # lambda:num_steps / 1., + # lambda:num_steps / 2.).item()) + # base_bptt = 35 + seq_len = tf.random.truncated_normal(shape=(), mean=base_bptt, stddev=5., + dtype=tf.float32) + # seq_len = int(np.random.normal(num_steps, 5)) + # seq_len = 35 + seq_len = tf.cast(seq_len, dtype=tf.int32) + seq_len = tf.minimum(seq_len, num_steps + 20) # seq_len <= bptt + 40 + seq_len = tf.minimum(seq_len, batch_len - start_idx - 1) + + # seq_len = tf.cond(tf.greater_equal(seq_len, num_steps + 20), lambda: num_steps + 20, lambda: seq_len).item() + # seq_len = tf.cond(tf.greater_equal(seq_len, int(batch_len - start_idx - 1)), lambda: int(batch_len - start_idx - 1), lambda: seq_len).item() + # seq_len = min(seq_len, num_steps + 20, batch_len - start_idx - 1) + print("seq_len:{}, type:{}".format(seq_len, type(seq_len))) + + end_idx = start_idx + seq_len + + x = data[:, start_idx: end_idx] + # x = tf.reshape(x, [batch_size, seq_len]) + # print("xshape:{}".format(x.get_shape().as_list())) + y = data[:, start_idx + 1: end_idx + 1] + # y = tf.reshape(y, [batch_size, seq_len]) + # print("yshape:{}".format(y.get_shape().as_list())) + + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx, end_idx)]): + should_reset = tf.greater_equal(end_idx, batch_len - 3) + reset_start_idx = tf.assign(start_idx, 0) + # reset_start_idx = tf.assign(tf.Variable(start_idx, name='reset_start_idx', dtype=tf.int32, trainable=False), 0) + return (x, y, num_batches_per_epoch, reset_start_idx, should_reset, + base_bptt, seq_len / batch_len) + + if randomize: + i = tf.random_uniform([1], minval=0, maxval=batch_len - num_steps,dtype=tf.int32)[0] + x = tf.strided_slice(data, [0, i], [batch_size, i + num_steps]) + y = tf.strided_slice(data, [0, i + 1], [batch_size, i + num_steps + 1]) + else: + # """ + # 修改点 + start_idx_eval = tf.Variable(0, name='start_idx', dtype=tf.int32, + trainable=False) + seq_len = num_steps + seq_len = tf.cast(seq_len, dtype=tf.int32) + end_idx = start_idx_eval + seq_len + x = data[:, start_idx_eval: end_idx] + y = data[:, start_idx_eval + 1: end_idx + 1] + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx_eval, end_idx)]): + should_reset_eval = tf.greater_equal(end_idx, batch_len - num_steps - 3) + reset_start_idx_eval = tf.assign(start_idx_eval, 0) + x.set_shape([batch_size, num_steps]) + y.set_shape([batch_size, num_steps]) + + return x, y, num_batches_per_epoch, reset_start_idx_eval, should_reset_eval diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py new file mode 100644 index 000000000..51ed715f5 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
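+
+# How the architecture string consumed by this entry point is laid out (sketch;
+# the actual parsing lives in fixed_lib.py): --fixed_arc is a flat list of
+# integers read two at a time as (previous_node_index, activation_id), one pair
+# per RNN node, with activations ordered 0=tanh, 1=relu, 2=sigmoid, 3=identity:
+#
+#   arc = [int(d) for d in '0 2 1 0 2 0 3 0'.split()]
+#   pairs = list(zip(arc[0::2], arc[1::2]))
+#   # -> [(0, 2), (1, 0), (2, 0), (3, 0)]: node 1 applies a sigmoid to node 0,
+#   #    node 2 applies a tanh to node 1, and so on.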
+ +"""Entry point for AWD ENAS with a fixed architecture.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from tensorflow.python.tools import freeze_graph + +import os +import pickle +import sys + +# TODO:change path +# sys.path.append("/home/test_user06/AscendZhongzhi_NJU/") +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +import fixed_lib +import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +## Required parameters +subfolder = str(time.strftime('%Y%m%d_%H%M%S')) +flags.DEFINE_string('output_dir', "./output/infer0/" + subfolder, '') +flags.DEFINE_string('data_path', './ptb/ptb.pkl', '') +flags.DEFINE_string("ckp_path", '', "checkpoint path") + +## Other parametersresult +flags.DEFINE_boolean('reload_model', True, '') +flags.DEFINE_boolean('reset_output_dir', True, '') +flags.DEFINE_boolean('is_training', False, '') +flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") + +flags.DEFINE_integer('log_every', 100, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + lm = fixed_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.do_infer, + 'bptt_rate': lm.bptt_rate, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + 'end_moving_avg': lm.end_moving_avg_op, + 'reset_avg': lm.restart_avg, + 'set_lr_decay': lm.set_lr_decay, + 'reset_start_idx_eval': lm.reset_start_idx_eval, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def load_ckpt_model(sess, save_path): + print("reload model from:{}".format(save_path)) + checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 + input_checkpoint = checkpoint.model_checkpoint_path + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 + saver.restore(sess, input_checkpoint) # 使用最新模型 + sess.run(tf.global_variables_initializer())# 初始化所有变量 + + +def train(params, is_training=True): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + tf.random.set_random_seed(2126) + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=2) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + + # >>> add code >> + # 
Create the NPU session
+    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
+    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True  # run training on the Ascend AI processor
+    custom_op.parameter_map["mix_compile_mode"].b = False  # mixed compilation; configure as needed, off by default
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")  # enable mixed precision
+    custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg")
+    # # custom_op.parameter_map["enable_data_pre_proc"].b = True  # GetNext op sinking is a prerequisite for loop sinking
+    # # custom_op.parameter_map["iterations_per_loop"].i = 10  # must match the value passed to set_iteration_per_loop; used to decide whether training iterations are sunk to the device
+    # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump/")
+    # custom_op.parameter_map["enable_dump_debug"].b = True
+    # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
+    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # must be disabled explicitly
+    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF  # must be disabled explicitly
+    # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir)
+    # >>> add code >>
+
+    # config = tf.ConfigProto()
+    # config.gpu_options.allow_growth = True
+    sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks,
+                                             checkpoint_dir=params.output_dir)
+    # reload model
+    if params.ckp_path != "" and FLAGS.reload_model:
+      last_checkpoint = tf.train.latest_checkpoint(params.ckp_path)
+      print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+      saver.restore(sess, last_checkpoint)
+
+    accum_loss = 0.
+    accum_step = 0
+    epoch = sess.run(ops['global_step']) // params.num_train_batches
+    best_valid_ppl = []
+    accum_rate = 0.
+    start_time = time.time()
+    last_min = (time.time() - start_time) / 60
+    cleaned = True
+    print('Starting moving_avg')
+    sess.run(ops['start_moving_avg'])
+    avg_flag = "no_null"
+    while True and is_training:
+      try:
+        loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops)
+        # bptt_rate = sess.run(ops['bptt_rate'])
+        # accum_rate += bptt_rate
+
+        accum_loss += loss
+        accum_step += 1
+        step = sess.run(ops['global_step'])
+        if step % params.log_every == 0:
+          # epoch = step // params.num_train_batches
+          train_ppl = np.exp(accum_loss / accum_step)
+          mins_so_far = (time.time() - start_time) / 60.
+          min_pices = mins_so_far - last_min
+          last_min = mins_so_far
+          log_string = 'epoch={0:<5d}'.format(epoch)
+          log_string += ' step={0}/{1:<6d}'.format(step, params.num_train_steps)
+          log_string += ' ppl={0:<10.2f}'.format(train_ppl)
+          log_string += ' lr={0:<6.3f}'.format(lr)
+          log_string += ' |g|={0:<6.3f}'.format(gn)
+          log_string += ' avg={0:<2d}'.format(moving_avg_started)
+          log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, min_pices / params.log_every)
+          # log_string += ' accum_rate(rate of a epoch)={0:<4.6f}'.format(accum_rate)
+          # log_string += ' should_reset:{}'.format(should_reset)
+          print(log_string)
+
+        if moving_avg_started:
+          if avg_flag == "":
+            sess.run(ops['end_moving_avg'])
+            sess.run(ops['reset_avg'])
+            avg_flag = "restart_avg"
+          else:
+            sess.run(ops['update_moving_avg'])
+        # ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+
+        if step <= (300 * params.num_train_batches):
+          if step % (10 * params.num_train_batches) == 0:
+            print('Start learning decay ...')
+            sess.run(ops['set_lr_decay'])
+        if (moving_avg_started and (step + 5) % (10 * params.num_train_batches) == 0
+            and len(best_valid_ppl) > params.best_valid_ppl_threshold
+            and valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])):
+          print('Start learning decay ...')
+          sess.run(ops['set_lr_decay'])
+        if should_reset:
+          accum_rate = 0.
+          print("should_reset:{}".format(should_reset))
+          sess.run(ops['reset_batch_states'])
+          epoch += 1
+          accum_loss = 0
+          accum_step = 0
+          valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+          # reset the validation start index
+          sess.run(ops['reset_start_idx_eval'])
+          # reset the training batch state and start index
+          sess.run([ops['reset_batch_states'], ops['reset_start_idx']])
+          # note: once the current ppl is no longer among the best best_valid_ppl_threshold values, switch to moving-average weights
+          if (not moving_avg_started and
+              len(best_valid_ppl) > params.best_valid_ppl_threshold and
+              valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])
+             ):
+            print('Starting moving_avg')
+            sess.run(ops['start_moving_avg'])
+            # print('Start learning decay ...')
+            # sess.run(ops['set_lr_decay'])
+
+          if valid_ppl > 15.:
+            best_valid_ppl.append(valid_ppl)
+            if not cleaned:
+              best_valid_ppl = [p for p in best_valid_ppl if p < 40.]
+ cleaned = True + # ops['eval_test'](sess, use_moving_avg=moving_avg_started) + if step % (1 * params.num_train_batches) == 0: + test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) + print("test_ppl:{}".format(test_ppl)) + sess.run(ops['reset_start_idx_eval']) + if step >= params.num_train_steps: + #inference + test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) + print("final_test_ppl:{}".format(test_ppl)) + break + except tf.errors.InvalidArgumentError: + last_checkpoint = tf.train.latest_checkpoint(params.output_dir) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + accum_loss, accum_step = 0., 0 + if not is_training: + moving_avg_started = sess.run(ops['moving_avg_started']) + test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) + sess.close() + # infer_loss = ops['inference']() + with tf.Session() as sess: + print("test_ppl:{}".format(test_ppl)) + #保存图,在./pb_model文件夹中生成model.pb文件 + # model.pb文件将作为input_graph给到接下来的freeze_graph函数 + tf.train.write_graph(sess.graph_def, './models_pb', 'model3.pb') # 通过write_graph生成模型文件 + freeze_graph.freeze_graph( + input_graph='./models_pb/model3.pb', # 传入write_graph生成的模型文件 + input_saver='', + input_binary=False, + input_checkpoint=params.ckp_path+'model.ckpt-906', # 传入训练生成的checkpoint文件 + output_node_names='output', # 与定义的推理网络输出节点保持一致 + restore_op_name='save/restore_all', + filename_tensor_name='save/Const:0', + output_graph='./models_pb/enas_lm3.pb', # 改为需要生成的推理网络的名称 + clear_devices=False, + initializer_nodes='') + print("done pb!") + else: + sess.close() + """ + if not is_training: + return infer_loss + else: + return -1 + """ + +def main(unused_args): + tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.info("**********") + print("===>>>data_path:{}".format(FLAGS.data_path)) + print("===>>>output_dir:{}".format(FLAGS.output_dir)) + print("===>>>ckp_path:{}".format(FLAGS.ckp_path)) + + print('-' * 80) + output_dir = FLAGS.output_dir + + print('-' * 80) + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. Reseting'.format(output_dir)) + gfile.DeleteRecursively(output_dir) + gfile.MakeDirs(output_dir) + + print('-' * 80) + log_file = os.path.join(output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ckp_path=FLAGS.ckp_path, + ) + + train(params, is_training=FLAGS.is_training) + + +if __name__ == '__main__': + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py new file mode 100644 index 000000000..49659f706 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py @@ -0,0 +1,652 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +import data_utils +import utils + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string('fixed_arc', '0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0', '') +flags.DEFINE_float('child_alpha', 0.7, 'activation L2 reg') +flags.DEFINE_float('child_drop_e', 0.125, 'drop rate words') +flags.DEFINE_float('child_drop_i', 0.175, 'drop rate embeddings') +flags.DEFINE_float('child_drop_l', 0.225, 'drop rate between layers') +flags.DEFINE_float('child_drop_o', 0.75, 'drop rate output') +flags.DEFINE_float('child_drop_w', 0.00, 'drop rate weight') +flags.DEFINE_float('child_drop_x', 0.725, 'drop rate at input of RNN cells') +flags.DEFINE_float('child_init_range', 0.05, '') +flags.DEFINE_float('child_grad_bound', 0.25, '') +flags.DEFINE_float('child_weight_decay', 2e-6, '') +flags.DEFINE_integer('child_num_train_epochs', 2, '') +flags.DEFINE_integer('child_hidden_size', 800, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, minval=0., maxval=1., dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(x, prev_s, w_prev, w_skip, input_mask, layer_mask, params): + """Multi-layer LSTM. + + Args: + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: [batch_size, hidden_size]. + layer_mask: [batch_size, hidden_size]. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. 
+ """ + batch_size = x.get_shape()[0].value + print("batch_size:{}".format(batch_size)) + # batch_size = params.batch_size + num_steps = tf.shape(x)[1] + fixed_arc = params.fixed_arc + num_layers = len(fixed_arc) // 2 + set_shape = x.get_shape().as_list() + print("x.set_shape:{}".format(set_shape)) + + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + # all_s_my = [] + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s_my): + """Body fn for `tf.while_loop`.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul( + tf.concat([inp * input_mask, prev_s * layer_mask], axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("w_prev:{}".format(w_prev)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + + def _select_function(h, function_id): + if function_id == 0: + return tf.tanh(h) + elif function_id == 1: + return tf.nn.relu(h) + elif function_id == 2: + return tf.sigmoid(h) + elif function_id == 3: + return h + raise ValueError('Unknown func_idx {0}'.format(function_id)) + + start_idx = 0 + for layer_id in range(num_layers): + prev_idx = fixed_arc[start_idx] + func_idx = fixed_arc[start_idx + 1] + prev_s = layers[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("layers_id:{}\ns before set_shape:{}".format(layer_id, s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("next_s:{}".format(next_s)) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s_my = tf.concat([all_s_my, t], 0) + # print("all_s_my:{}".format(all_s_my)) + # all_s.append(next_s) + return step + 1, next_s, all_s_my + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + # >>> add code >>> + # all_s_my = tf.reshape(all_s_my, [set_shape[1]+1, set_shape[0], params.hidden_size]) + # print("all_s_my(list):{}".format(all_s_my)) + # tmp = all_s_my[1:, :, :] + # # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(tmp)) + # all_s = tf.transpose(tmp, perm=[1, 0, 2]) + # # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # all_s = tf.reshape(all_s, [set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s + + +def _set_default_params(params): + """Set default values for the hparams.""" + params.add_hparam('alpha', FLAGS.child_alpha) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 
10) + + params.add_hparam('batch_size', 64) + params.add_hparam('bptt_steps', 32) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', FLAGS.child_drop_e) # word + params.add_hparam('drop_i', FLAGS.child_drop_i) # embeddings + params.add_hparam('drop_l', FLAGS.child_drop_l) # between RNN nodes + params.add_hparam('drop_o', FLAGS.child_drop_o) # output + params.add_hparam('drop_w', FLAGS.child_drop_w) # weight + params.add_hparam('drop_x', FLAGS.child_drop_x) # input to RNN layers + + assert FLAGS.fixed_arc is not None + print(FLAGS.fixed_arc) + L_arc = FLAGS.fixed_arc.split(' ') + print("L_arc:{}".format(L_arc)) + params.add_hparam('fixed_arc', [int(d) for d in L_arc]) + + params.add_hparam('grad_bound', FLAGS.child_grad_bound) + params.add_hparam('hidden_size', FLAGS.child_hidden_size) + params.add_hparam('init_range', FLAGS.child_init_range) + params.add_hparam('learning_rate', 40.) + params.add_hparam('num_train_epochs', FLAGS.child_num_train_epochs) + params.add_hparam('num_warmup_epochs', 0.0) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', FLAGS.child_weight_decay) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('num_warmup_steps', + params.num_warmup_epochs * self.num_train_batches) + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + self._build_infer() + self._build_avg_infer() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('embedding'): + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.hidden_size], + initializer=initializer) + # >>> add code >>> + dropped_w_emb = npu_ops.dropout(w_emb, 1 - self.params.drop_e) + # >>> add code >>> + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + + hidden_size = self.params.hidden_size + fixed_arc = self.params.fixed_arc + num_layers = len(fixed_arc) // 2 + with tf.variable_scope('rnn_cell'): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + + w_skip, dropped_w_skip = [], [] + for layer_id in range(num_layers): + mask = _gen_mask([hidden_size, 2 * 
hidden_size], self.params.drop_w) + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable('w', [hidden_size, 2 * hidden_size]) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + + with tf.variable_scope('init_states'): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + with tf.variable_scope('test'): + init_shape = [1, hidden_size] + test_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = tf.zeros(init_shape, dtype=tf.float32) + test_reset = tf.assign(test_prev_s, zeros) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.test_init_states = { + 's': test_prev_s, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + + # >>> add code >>> + # emb = tf.layers.dropout( + # emb, self.params.drop_i, + # [self.params.batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s = _rnn_fn(emb, prev_s, w_prev, w_skip, input_mask, layer_mask, + self.params) + top_s = all_s + if is_training: + # >>> add code >>> + top_s = npu_ops.dropout(top_s, + 1 - self.params.drop_o)# ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + + # top_s = tf.layers.dropout(top_s, self.params.drop_o, + # [batch_size, 1, hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + # print("top_s:{}\nw_soft:{}".format(top_s, w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) + + reg_loss = loss # loss + regularization_terms, for training only + # print("_forward/loss:{}".format(loss)) + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + with 
tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss:{}\nloss:{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + # print("reg_loss:{}".format(reg_loss)) + print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + with tf.variable_scope('HParam'): + lr_decay = tf.get_variable('learning_rate_decay', [], initializer=tf.constant_initializer(1.), dtype=tf.float32, trainable=False) + self.set_lr_decay = tf.assign_sub(lr_decay, 0.02*lr_decay) + learning_rate = utils.get_lr(global_step, self.params, lr_decay) * lr_scale + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + # def _EMA(self): + # """Build moving average ops.""" + # print('Creating moving average ops') + # + # with tf.variable_scope('moving_avg_flag'): + # self.moving_avg_started = tf.get_variable( + # 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + # trainable=False) + # self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + # self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + # all_vars = tf.trainable_variables() + # + # ema = tf.train.ExponentialMovingAverage(0.99) + # + # average_op = ema.apply(all_vars) + # back_up_v = tf.identity(all_vars) + # use_average_op = tf.assign(all_vars, ema.average(all_vars)) + # ema.average_name() + # reverse_average_op = tf.assign(all_vars, back_up_v) + + + + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + + all_vars = tf.trainable_variables() + print('all_vars:{}'.format(all_vars)) + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + # 原作者手动实现的Moving Average ::当eval_valid_ppl退化到一定阈值(退步10名)后启动 + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], initializer=tf.constant_initializer(0.), dtype=tf.float32, trainable=False) + tmp1 = [] + 
tmp2 = [] + tmp3 = [] + self.restart_avg = tf.assign(avg_step, 0.) + with tf.control_dependencies([tf.assign_add(avg_step, 1.)]): + average_op = [] + for v, avg_v in average_pairs: + # v_curr = tf.Variable(tf.cast(tf.identity(v), tf.float32), dtype=tf.float32, trainable=False) + # avg_v_curr = tf.Variable(tf.cast(tf.identity(avg_v), tf.float32), dtype=tf.float32, trainable=False) + # mu = 1. / avg_step + mu = tf.cond(tf.cast(0.999 < (1. + avg_step) / (10. + avg_step), tf.bool), + lambda: tf.cast(tf.constant(0.99), dtype=tf.float32), + lambda: tf.cast((1. + avg_step) / (10. + avg_step), dtype=tf.float32)) + + new_avg = mu * tf.cast(avg_v, tf.float32) + (1. - mu) * tf.cast(v, tf.float32) + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, tf.cast(new_avg, avg_v.dtype))) + # 追踪变量 + tmp1.append(v) + tmp2.append(new_avg) + tmp3.append([avg_step, mu, tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(new_avg ** 2)]) + + self.p1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp1]) + self.p2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp2]) + self.p3 = tmp3 + # # 使用官方API + # with tf.variable_scope('avg_step'): + # avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + # + # ema = tf.train.ExponentialMovingAverage(0.99, avg_step) + # with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + # average_op = [] + # for v, avg_v in average_pairs: + # v = tf.Variable(tf.cast(v, tf.float32), dtype=tf.float32, trainable=False) + # avg_v = tf.Variable(tf.cast(avg_v, tf.float32), dtype=tf.float32, trainable=False) + # print('v:{}'.format(v)) + # ema.apply([v]) + # new_avg = ema.average(v) + # print('new_avg:{}'.format(new_avg)) + # with tf.control_dependencies([new_avg]): + # print('avg_v:'.format(avg_v)) + # average_op.append(tf.assign(avg_v, new_avg)) + # # average_op = tf.group(*average_op) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + + new_tmp1 = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + new_tmp1.append([tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(backup_v ** 2)]) + use_average_op.append(tf.assign(v, avg_v)) + self.p4 = new_tmp1 + + use_average_op = tf.group(*use_average_op) + # with tf.control_dependencies([use_average_op]): + self.p3_1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + reverse_average_op = [] + new_tmp2 = [] + for v, backup_v in backup_pairs: + # with tf.control_dependencies([use_average_op]): + new_tmp2.append([tf.reduce_sum(v ** 2), tf.reduce_sum(backup_v ** 2)]) + reverse_average_op.append(tf.assign(v, backup_v)) + self.p5 = new_tmp2 + reverse_average_op = tf.group(*reverse_average_op) + # with tf.control_dependencies([reverse_average_op]): + self.p3_2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + return average_op, use_average_op, reverse_average_op + + def _eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + print('v:{}'.format(tf.trainable_variables())) + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + print('v_avg:{}'.format(tf.trainable_variables())) + for step in range(int(self.num_test_batches)): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = 
np.exp(total_loss / (step + 1)) + log_string = 'step={0:<6d}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if sess.run(self.should_reset_eval): + break + # test_ppl = np.exp(total_loss / self.num_test_batches) + + # log_string = 'step={0:<6d}'.format(self.num_test_batches) + # log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + # print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + # test_ppl = tf.math.exp(total_loss/ self.num_test_batches, name='output') + # print("test_ppl:{}".format(test_ppl)) + # loss_assign_op = tf.assign(self.tt_loss, tf.Variable(total_loss, name='total_loss', dtype=tf.float32,trainable=False)) + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def _build_infer(self): + print("Building infer graph") + tt_loss = tf.Variable(0, name="total_loss", dtype=tf.float32, trainable=False) + def _condition(step, *unused_args): + return tf.less(step, self.num_test_batches-3) + def _body(step, tt_loss): + with tf.control_dependencies([self.test_loss]): + tt_loss += self.test_loss + return step+1, tt_loss + loop_inps = [tf.constant(0, dtype=tf.int32), tt_loss] + _, tt_loss = tf.while_loop(_condition, _body, loop_inps) + test_ppl = tf.math.exp(tt_loss/ self.num_test_batches, name='test_ppl') + print("test_ppl:{}".format(test_ppl)) + self.infer_ppl = test_ppl + + def _build_avg_infer(self): + print("Build avg_infer graph") + def _fp(): + with tf.control_dependencies([self.use_moving_avg_vars, self.test_init_states['reset']]): + avg_infer_ppl = self.infer_ppl + with tf.control_dependencies([avg_infer_ppl, self.restore_normal_vars]): + return avg_infer_ppl + def _fn(): + return self.infer_ppl + + with tf.control_dependencies([self.moving_avg_started]): + avg_infer_ppl = tf.cond(tf.greater_equal(self.moving_avg_started, 1), _fp, _fn) + self.avg_infer_ppl = tf.identity(avg_infer_ppl, name="output") + print("self.avg_infer_ppl:{}".format(self.avg_infer_ppl)) + + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + + if use_moving_avg: + # print('sum_v:{}'.format(sess.run(self.p1))) + # print('new_sum_v:{}'.format(sess.run(self.p2))) + # print('[[step, mu, v, v_avg, new_v_avg]]={}'.format(sess.run(self.p3))) + # self.use_moving_avg_vars ===>影子权重暂时替代当前权重 + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + # print('v_avg:{}\n[[v, avg_v, backup_v]]={}'.format(sess.run(self.p3_1), sess.run(self.p4))) + + valid_loss = [] + for _ in range(self.num_valid_batches): + loss = sess.run(self.valid_loss) + total_loss += loss + valid_loss.append(loss) + if sess.run(self.should_reset_eval): + break + print("valid_loss={}, self.num_valid_batches={}".format(valid_loss, self.num_valid_batches)) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + # print('v:{}\n[[v, backup_v]]={} \n============================================================'.format( + # sess.run(self.p3_2), sess.run(self.p5))) + + return valid_ppl + + def do_infer(self, sess, use_moving_avg=False): + # self._eval_test(sess, use_moving_avg) + return 
sess.run(self.avg_infer_ppl) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py new file mode 100644 index 000000000..5985dd014 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py @@ -0,0 +1,93 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import datetime +# import moxing as mox +import tensorflow.compat.v1 as tf +gfile = tf.gfile + +def obs_data2modelarts(config): + """ + Copy train data from obs to modelarts by using moxing api. + """ + start = datetime.datetime.now() + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.data_url, config.modelarts_data_dir)) + mox.file.copy_parallel(src_url=config.data_url, dst_url=config.modelarts_data_dir) + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.ckp_path, config.modelarts_result_dir)) + output_dir = config.modelarts_result_dir + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + mox.file.copy_parallel(src_url=config.ckp_path, dst_url=config.modelarts_result_dir) + end = datetime.datetime.now() + files = os.listdir(config.modelarts_data_dir) + print("===>>>Files:", files) + files2 = os.listdir(config.modelarts_result_dir) + print("===>>>Files2:", files2) + + +def modelarts_result2obs(FLAGS): + """ + Copy debug data from modelarts to obs. + According to the swich flags, the debug data may contains auto tune repository, + dump data for precision comparision, even the computation graph and profiling data. + """ + work_dir = os.getcwd() + print("start op: modelarts_result2obs..........") + + ## copy result from modelarts to obs + obs_result_dir = os.path.join(FLAGS.obs_dir, 'result') + if not mox.file.exists(obs_result_dir): + mox.file.make_dirs(obs_result_dir) + else: + mox.file.remove(obs_result_dir, recursive=True) + mox.file.make_dirs(obs_result_dir) + mox.file.copy_parallel(src_url=FLAGS.output_dir, dst_url=obs_result_dir) + print("===>>>Copy Event or Checkpoint from modelarts dir:{} to obs:{}".format(FLAGS.output_dir, obs_result_dir)) + + ## Copy auto tune repository. Comment this snippets if npu_auto_tune is off. + # if FLAGS.npu_auto_tune: + # modelarts_auto_tune_dir = os.path.join(work_dir, "npu_auto_tune") + # obs_auto_tune_dir = os.path.join(FLAGS.obs_dir, 'npu_auto_tune') + # if not mox.file.exists(obs_auto_tune_dir): + # mox.file.make_dirs(obs_auto_tune_dir) + # mox.file.copy_parallel(modelarts_auto_tune_dir, obs_auto_tune_dir) + # print("===>>>Auto tune:{} on OBS dir:{}".format(mox.file.list_directory(obs_auto_tune_dir), obs_auto_tune_dir)) + # + # ## Copy dump data. Comment this snippets if npu_dump_data is off. 
+ # if FLAGS.npu_dump_data: + # modelarts_dump_data_dir = os.path.join(work_dir, "npu_dump_data") + # obs_dump_data_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_data') + # if not mox.file.exists(obs_dump_data_dir): + # mox.file.make_dirs(obs_dump_data_dir) + # mox.file.copy_parallel(modelarts_dump_data_dir, obs_dump_data_dir) + # print("===>>>Dumped graph:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_data_dir), obs_dump_data_dir)) + # + # ## Copy compute graph. Comment this snippets if npu_dump_graph is off. + # if FLAGS.npu_dump_graph: + # modelarts_dump_graph_dir = os.path.join(work_dir, "npu_dump_graph") + # obs_dump_graph_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_graph') + # if not mox.file.exists(obs_dump_graph_dir): + # mox.file.make_dirs(obs_dump_graph_dir) + # mox.file.copy_parallel(modelarts_dump_graph_dir, obs_dump_graph_dir) + # print("===>>>Dumped data:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_graph_dir), obs_dump_graph_dir)) + # + # ## Copy profiling data. Comment this snippets if npu_profiling is off. + # if FLAGS.npu_profiling: + # modelarts_profiling_dir = os.path.join(work_dir, "npu_profiling") + # obs_profiling_dir = os.path.join(FLAGS.obs_dir, 'npu_profiling') + # if not mox.file.exists(obs_profiling_dir): + # mox.file.make_dirs(obs_profiling_dir) + # mox.file.copy_parallel(modelarts_profiling_dir, obs_profiling_dir) + # print("===>>>Profiling data:{} on OBS dir:{}".format(mox.file.list_directory(obs_profiling_dir), obs_profiling_dir)) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py new file mode 100644 index 000000000..2a1816ac3 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
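Note on help_modelarts.py above: the module keeps `import moxing as mox` commented out while obs_data2modelarts() and modelarts_result2obs() still call mox.file.copy_parallel(), mox.file.exists() and mox.file.make_dirs(), so both helpers raise NameError the moment they actually run on ModelArts. A minimal guard sketch, assuming moxing is only installed on the ModelArts image (the obs_copy wrapper name is illustrative, not part of this patch):

try:
    import moxing as mox  # available on the ModelArts image only (assumption)
except ImportError:
    mox = None  # local / Apulis runs: OBS copies become no-ops

def obs_copy(src_url, dst_url):
    # Thin wrapper so callers do not crash when moxing is absent.
    if mox is None:
        print("moxing unavailable, skipping copy {} -> {}".format(src_url, dst_url))
        return
    mox.file.copy_parallel(src_url=src_url, dst_url=dst_url)

Because search.py only imports modelarts_result2obs when --platform=modelarts, the missing import stays hidden until a real ModelArts run.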
+ +"""Entry point for AWD LSTM.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import os +import pickle +import sys +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import lstm_lib +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +flags.DEFINE_boolean('reset_output_dir', False, '') +flags.DEFINE_string('output_dir', None, '') +flags.DEFINE_string('data_path', None, '') + +flags.DEFINE_integer('log_every', 200, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + + lm = lstm_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.eval_test, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def train(params): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=5) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + accum_loss = 0 + accum_step = 0 + epoch = 0 + best_valid_ppl = [] + start_time = time.time() + while True: + sess.run(ops['reset_batch_states']) + loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) + accum_loss += loss + accum_step += 1 + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. 
+ log_string = 'epoch={0:<5d}'.format(epoch) + log_string += ' step={0:<7d}'.format(step) + log_string += ' ppl={0:<9.2f}'.format(train_ppl) + log_string += ' lr={0:<10.7f}'.format(lr) + log_string += ' |g|={0:<5.2f}'.format(gn) + log_string += ' avg={0:<2d}'.format(moving_avg_started) + log_string += ' mins={0:<.2f}'.format(mins_so_far) + print(log_string) + + if moving_avg_started: + sess.run(ops['update_moving_avg']) + + # if step % params.num_train_batches == 0: + if should_reset: + epoch += 1 + accum_loss = 0 + accum_step = 0 + valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started) + sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) + if (not moving_avg_started and + len(best_valid_ppl) > params.best_valid_ppl_threshold and + valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])): + print('Starting moving_avg') + sess.run(ops['start_moving_avg']) + best_valid_ppl.append(valid_ppl) + + if step >= params.num_train_steps: + ops['eval_test'](sess, use_moving_avg=moving_avg_started) + break + sess.close() + + +def main(unused_args): + output_dir = FLAGS.output_dir + print('-' * 80) + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. Reseting'.format(output_dir)) + gfile.DeleteRecursively(output_dir) + gfile.MakeDirs(output_dir) + + print('-' * 80) + log_file = os.path.join(output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ) + + train(params) + + +if __name__ == '__main__': + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py new file mode 100644 index 000000000..576b6f2e2 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py @@ -0,0 +1,458 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD LSTM model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + +MOVING_AVERAGE_DECAY = 0.9995 + +MOVING_AVERAGE_DECAY = 0.9995 + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _lstm(x, prev_c, prev_h, w_lstm, layer_masks): + """Multi-layer LSTM. 
+ + Args: + x: [batch_size, num_steps, hidden_size]. + prev_c: [[batch_size, hidden_size] * num_layers]. + prev_h: [[batch_size, hidden_size] * num_layers]. + w_lstm: [[2 * hidden_size, 4 * hidden_size] * num_layers]. + layer_masks: [([hidden_size, hidden_size] or None)* num_layers]. + + Returns: + next_c: [[batch_size, hidden_size] * num_layers]. + next_h: [[batch_size, hidden_size] * num_layers]. + all_h: [batch_size, num_steps, hidden_size]. + """ + _, num_steps, _ = tf.unstack(tf.shape(x)) + num_layers = len(w_lstm) + + all_h = [tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + for _ in range(num_layers)] + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, pprev_c, pprev_h, all_h): + """Apply LSTM at each step.""" + next_c, next_h = [], [] + for layer_id, (p_c, p_h, w, m) in enumerate(zip( + pprev_c, pprev_h, w_lstm, layer_masks)): + inp = x[:, step, :] if layer_id == 0 else next_h[-1] + if m is not None: + inp *= m + ifog = tf.matmul(tf.concat([inp, p_h], axis=1), w) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + c = i * g + f * p_c + h = o * tf.tanh(c) + all_h[layer_id] = all_h[layer_id].write(step, h) + next_c.append(c) + next_h.append(h) + return step + 1, next_c, next_h, all_h + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_c, prev_h, all_h] + _, next_c, next_h, all_h = tf.while_loop(_condition, _body, loop_inps, + parallel_iterations=1) + all_h = [tf.transpose(h.stack(), [1, 0, 2]) + for h in all_h] + + return next_c, next_h, all_h + + +def _set_default_params(params): + """Set default parameters.""" + params.add_hparam('alpha', 2.) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 7) + params.add_hparam('beta', 1.) # activation slowness reg + + params.add_hparam('batch_size', 12) + params.add_hparam('bptt_steps', 70) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.65) # embeddings + params.add_hparam('drop_l', 0.30) # between layers + params.add_hparam('drop_o', 0.40) # output + params.add_hparam('drop_w', 0.50) # weight + + params.add_hparam('emb_size', 400) + params.add_hparam('start_decay_epoch', 14) + params.add_hparam('decay_every_epoch', 1) + params.add_hparam('decay_rate', 0.98) + params.add_hparam('grad_bound', 0.25) + params.add_hparam('hidden_size', 1100) + params.add_hparam('init_range', 0.1) + params.add_hparam('learning_rate', 20.) 
+ params.add_hparam('num_layers', 3) + params.add_hparam('num_train_epochs', 500) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 1.2e-6) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, self.base_bptt) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('start_decay_step', + params.start_decay_epoch * self.num_train_batches) + params.add_hparam('decay_every_step', + params.decay_every_epoch * self.num_train_batches) + + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + + def _build_params(self): + """Create and count model parameters.""" + print('-' * 80) + print('Building model params') + with tf.variable_scope(self.name): + with tf.variable_scope('embedding'): + initializer = tf.initializers.random_uniform( + -self.params.init_range, self.params.init_range) + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.emb_size], + initializer=initializer) + dropped_w_emb = tf.layers.dropout( + w_emb, self.params.drop_e, [self.params.vocab_size, 1], + training=True) + + w_lstm = [] + dropped_w_lstm = [] + with tf.variable_scope('lstm'): + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + init_range = 1.0 / np.sqrt(hid_size) + initializer = tf.initializers.random_uniform(-init_range, init_range) + with tf.variable_scope('layer_{0}'.format(i)): + w = tf.get_variable('w', [inp_size + hid_size, 4 * hid_size], + initializer=initializer) + i_mask = tf.ones([inp_size, 4 * hid_size], dtype=tf.float32) + h_mask = _gen_mask([hid_size, 4 * hid_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w = w * mask + w_lstm.append(w) + dropped_w_lstm.append(dropped_w) + + with tf.variable_scope('init_states'): + batch_prev_c, batch_prev_h, batch_reset = [], [], [] + test_prev_c, test_prev_h, test_reset = [], [], [] + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + + with tf.variable_scope('layer_{0}'.format(i)): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hid_size] + batch_prev_c.append(tf.get_variable( + 'c', init_shape, dtype=tf.float32, trainable=False)) + batch_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset.append(tf.assign(batch_prev_c[-1], zeros)) + batch_reset.append(tf.assign(batch_prev_h[-1], zeros)) + with tf.variable_scope('test'): + init_shape = [1, hid_size] + test_prev_c.append(tf.get_variable( + 'c', 
init_shape, dtype=tf.float32, trainable=False)) + test_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + test_reset.append(tf.assign(test_prev_c[-1], zeros)) + test_reset.append(tf.assign(test_prev_h[-1], zeros)) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 'c': batch_prev_c, + 'h': batch_prev_h, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_lstm': dropped_w_lstm, + 'w_soft': w_emb, + } + self.test_init_states = { + 'c': test_prev_c, + 'h': test_prev_h, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_lstm': w_lstm, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_lstm = model_params['w_lstm'] + w_soft = model_params['w_soft'] + prev_c = init_states['c'] + prev_h = init_states['h'] + + emb = tf.nn.embedding_lookup(w_emb, x) + if is_training: + emb = tf.layers.dropout( + emb, self.params.drop_i, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + layer_masks = [None] + for _ in range(1, self.params.num_layers - 1): + mask = _gen_mask([self.params.batch_size, self.params.hidden_size], + self.params.drop_l) + layer_masks.append(mask) + layer_masks.append(None) + else: + layer_masks = [None] * self.params.num_layers + + out_c, out_h, all_h = _lstm(emb, prev_c, prev_h, w_lstm, layer_masks) + top_h = all_h[-1] + if is_training: + top_h = tf.layers.dropout( + top_h, self.params.drop_o, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + carry_on = [] + for var, val in zip(prev_c + prev_h, out_c + out_h): + carry_on.append(tf.assign(var, val)) + + logits = tf.einsum('bnh,vh->bnv', top_h, w_soft) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) # TODO(hyhieu): watch for num_steps + + reg_loss = loss # loss + regularization_terms, for training only + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(w ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.add_n( + [tf.reduce_mean(h ** 2) for h in all_h[:-1]]) + + # activation slowness L2 reg + reg_loss += self.params.beta * tf.add_n( + [tf.reduce_mean((h[:, 1:, :] - h[:, :-1, :]) ** 2) + for h in all_h[:-1]]) + + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + # learning_rate = tf.Print( + # 
learning_rate, + # [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)], + # message='lr: ', summarize=3) + grads = tf.gradients(reg_loss, tf_vars) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + + all_vars = tf.trainable_variables() + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + + with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + average_op = [] + for v, avg_v in average_pairs: + mu = 1 / avg_step + new_avg = mu * v + (1 - mu) * avg_v + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, new_avg)) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + use_average_op.append(tf.assign(v, avg_v)) + use_average_op = tf.group(*use_average_op) + + reverse_average_op = [] + for v, backup_v in backup_pairs: + reverse_average_op.append(tf.assign(v, backup_v)) + reverse_average_op = tf.group(*reverse_average_op) + + return average_op, use_average_op, reverse_average_op + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + for _ in range(self.num_valid_batches): + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return valid_ppl + + def eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + for step in 
range(self.num_test_batches): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = np.exp(total_loss / (step + 1)) + log_string = 'step={0}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + test_ppl = np.exp(total_loss / self.num_valid_batches) + log_string = 'step={0}'.format(self.num_test_batches) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return test_ppl diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py new file mode 100644 index 000000000..9a8804313 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Preprocess Penn-Treebank dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import pickle +import numpy as np +import os + + +def main(): + dataFolder = "/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/dataset/Penn_Treebank_dataset" + dataList = os.listdir(dataFolder) + dataPath = {} + for dataName in dataList: + dataPath[dataName] = os.path.join(dataFolder, dataName) + + with open(dataPath.get("ptb.train.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + vocab, index = {}, {} + for word in sorted(words): + if word not in vocab: + index[len(vocab)] = word + vocab[word] = len(vocab) + print('vocab size: {}'.format(len(vocab))) + + x_train = [vocab[word] for word in words] + [vocab['']] + x_train = np.array(x_train, dtype=np.int32) + + with open(dataPath.get('ptb.valid.txt')) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_valid = [vocab[word] for word in words] + [vocab['']] + x_valid = np.array(x_valid, dtype=np.int32) + + with open(dataPath.get("ptb.test.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_test = [vocab[word] for word in words] + [vocab['']] + x_test = np.array(x_test, dtype=np.int32) + + print('train size: {}'.format(np.size(x_train))) + print('valid size: {}'.format(np.size(x_valid))) + print('test size: {}'.format(np.size(x_test))) + + with open('ptb/ptb.pkl', 'wb') as fout: + pickle.dump((x_train, x_valid, x_test, vocab, index), fout, protocol=2) + + +if __name__ == '__main__': + main() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py new file mode 100644 index 000000000..4d73e2b37 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Entry point for AWD ENAS search process.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig + +import os +import pickle +import sys +import time + +sys.path.append("/home/ma-user/modelarts/user-job-dir/") + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import child +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import controller +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils +from tensorflow.contrib import training as contrib_training + + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string('output_dir', None, '') +flags.DEFINE_string('data_path', None, '') +flags.DEFINE_string("obs_dir", "obs://rstg/log", "obs result path, not need on gpu and apulis platform") + + +## Other parametersresult +flags.DEFINE_boolean('reset_output_dir', False, '') +flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") + +flags.DEFINE_integer('log_every', 20, '') + + + +def get_ops(params, x_train, x_valid): + """Build [train, valid, test] graphs.""" + + ct = controller.Controller(params=params) + lm = child.LM(params, ct, x_train, x_valid) + ct.build_trainer(lm) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'l2_reg_loss': lm.l2_reg_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'bptt_rate': lm.bptt_rate, + + 'controller_train_op': ct.train_op, + 'controller_grad_norm': ct.train_op, + 'controller_sample_arc': ct.sample_arc, + 'controller_entropy': ct.sample_entropy, + 'controller_reward': ct.reward, + 'controller_baseline': ct.baseline, + 'controller_optimizer': ct.optimizer, + 'controller_train_fn': ct.train, + + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + +def load_ckpt_model(sess, save_path): + print("reload model from:{}".format(save_path)) + checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 + input_checkpoint = checkpoint.model_checkpoint_path + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 + saver.restore(sess, input_checkpoint) # 使用最新模型 + sess.run(tf.global_variables_initializer())# 初始化所有变量 + +def train(params): + """Entry train function.""" + print("data_path:{}".format(params.data_path)) + print("output_dir:{}".format(params.output_dir)) + with 
gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, _, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + + + g = tf.Graph() + with g.as_default(): + tf.random.set_random_seed(2126) + ops = get_ops(params, x_train, x_valid) + run_ops = [ + ops['train_loss'], + ops['l2_reg_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=5) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + hooks.append(ops['controller_optimizer'].make_session_run_hook(True)) + + # >>> add code >> + # 创建session + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + custom_op = config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["use_off_line"].b = True # 在昇腾AI处理器执行训练 + custom_op.parameter_map["mix_compile_mode"].b = False # 关闭混合计算,根据实际情况配置,默认关闭 + # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # 设置混合精度 + custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fusion_switch.cfg") + # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("/home/ma-user/modelarts/inputs/data_url_0") + # + # custom_op.parameter_map["enable_dump_debug"].b = True + # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") + # # custom_op.parameter_map["enable_data_pre_proc"].b = True # getnext算子下沉是迭代循环下沉的必要条件 + # # custom_op.parameter_map[ + # # "iterations_per_loop"].i = 10 # 此处设置的值和set_iteration_per_loop设置的iterations_per_loop值保持一致,用于判断是否进行训练迭代下沉 + # + config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # 必须显式关闭 + config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF # 必须显式关闭 + # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir) + # >>> add code >> + + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + # reload model + if len(gfile.ListDirectory(params.output_dir)): + last_checkpoint = tf.train.latest_checkpoint(params.output_dir) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + + accum_loss = 0 + accum_step = 0 + epoch = sess.run(ops['global_step']) // params.num_train_batches + best_valid_ppl = [] + start_time = time.time() + last_mins = (time.time() - start_time) / 60 + accum_rate = 0. 
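The NPU session setup above is easy to lose among the commented-out dump/profiling switches and the Chinese inline notes. Reduced to the options that are actually active, it amounts to the following sketch (fusion_cfg_path stands in for the fusion_switch.cfg path hard-coded in this patch):

import tensorflow.compat.v1 as tf
from npu_bridge.npu_init import *  # registers the NPU bridge, as search.py does above
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

fusion_cfg_path = "/path/to/fusion_switch.cfg"  # placeholder path

config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
custom_op.name = "NpuOptimizer"
custom_op.parameter_map["use_off_line"].b = True       # run training on the Ascend AI processor
custom_op.parameter_map["mix_compile_mode"].b = False   # mixed compile off (off is the default)
custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes(fusion_cfg_path)
# Both rewriter passes must be disabled explicitly for NPU execution.
config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

One caveat on the loop that follows: fetching train_loss, l2_reg_loss, grad_norm, learning_rate, should_reset, bptt_rate and train_op in separate sess.run() calls re-executes the forward graph (and any state-carrying assign ops hung off the loss) once per fetch; the commented-out combined run_ops fetch does the same work in a single pass.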
+ # sess.run(tf.global_variables_initializer()) + while True: + try: + # run_ops = [ + # ops['train_loss'], + # ops['l2_reg_loss'], + # ops['grad_norm'], + # ops['learning_rate'], + # ops['should_reset'], + # ops['train_op'], + # ] + # 修改点 + # loss, l2_reg, gn, lr, should_reset, _ = sess.run(run_ops) + loss = sess.run(ops['train_loss']) + # print("loss_OK:loss:{}".format(loss)) + l2_reg = sess.run(ops['l2_reg_loss']) + # print("l2_reg_OK:l2_reg:{}".format(l2_reg)) + gn = sess.run(ops['grad_norm']) + # gn = -111111 + # print("gn_OK:gn:{}".format(gn)) + lr = sess.run(ops['learning_rate']) + # print("lr_OK:le:{}".format(lr)) + should_reset = sess.run(ops['should_reset']) + _ = sess.run(ops["train_op"]) + + bptt_rate = sess.run(ops['bptt_rate']) + # print("should_reset_OK:should_reset:{}".format(should_reset)) + # if not should_not_train : + # _ = sess.run(ops["train_op"]) + + accum_loss += loss + accum_step += 1 + accum_rate += bptt_rate + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. + mins_pices = mins_so_far - last_mins + last_mins = mins_so_far + log_string = 'epoch={0:<5d}'.format(epoch) + log_string += ' step={0:<7d}/{1:<6d}'.format(step, params.num_train_steps) + log_string += ' ppl={0:<9.2f}'.format(train_ppl) + log_string += ' lr={0:<7.2f}'.format(lr) + log_string += ' |w|={0:<6.2f}'.format(l2_reg) + log_string += ' |g|={0:<6.2f}'.format(gn) + log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, mins_pices/params.log_every) + # log_string += ' accum_rate(rate of a epoch)={0:<4.4f}'.format(accum_rate) + # log_string += ' should_reset:{}'.format(should_reset) + print(log_string) + + if should_reset: + accum_rate=0. + print("should_reset:{}".format(should_reset)) + ops['controller_train_fn'](sess, ops['reset_batch_states']) + epoch += 1 + accum_loss = 0 + accum_step = 0 + valid_ppl = ops['eval_valid'](sess) + sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) + best_valid_ppl.append(valid_ppl) + + if step % (params.num_train_batches * 10) == 0: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + if step >= params.num_train_steps: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + break + except tf.errors.InvalidArgumentError: + if FLAGS.platform.lower() == 'modelarts': + from help_modelarts import modelarts_result2obs + modelarts_result2obs(FLAGS) + last_checkpoint = tf.train.latest_checkpoint(params.output_dir) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + sess.close() + + +def main(unused_args): + + tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.info("**********") + print("===>>>data_path:{}".format(FLAGS.data_path)) + print("===>>>output_dir:{}".format(FLAGS.output_dir)) + print("===>>>obs_dir:{}".format(FLAGS.obs_dir)) + print("===>>>train_step:{}".format(FLAGS.num_train_epochs)) + + np.set_printoptions(precision=3, suppress=True, threshold=int(1e9), + linewidth=80) + + print('-' * 80) + if not gfile.IsDirectory(FLAGS.output_dir): + print('Path {} does not exist. Creating'.format(FLAGS.output_dir)) + gfile.MakeDirs(FLAGS.output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. 
Reseting'.format(FLAGS.output_dir)) + gfile.DeleteRecursively(FLAGS.output_dir) + gfile.MakeDirs(FLAGS.output_dir) + + print('-' * 80) + log_file = os.path.join(FLAGS.output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ) + train(params) + + + +if __name__ == '__main__': + flags.mark_flag_as_required("data_path") + flags.mark_flag_as_required("output_dir") + flags.mark_flag_as_required("obs_dir") + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh new file mode 100644 index 000000000..a70f6b23e --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh @@ -0,0 +1,36 @@ +#!/bin/bash +### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. +### Modelarts Platform command for train + +#export ASCEND_GLOBAL_LOG_LEVEL=1 # 日志级别设置 debug级别为0;info 级别为1;warning级别为 2;error级别为4 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 # plog日志是否打屏 +#export ASCEND_GLOBAL_EVENT_ENABLE=0 # 设置事件级别 不开启Event日志级别为0;开启Event日志级别为1 + +export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config +#export ENABLE_FORCE_V2_CONTROL=1 + +code_dir=${1} +data_path=${2} +output_dir=${3} +obs_url=${4} + +current_time=`date "+%Y-%m-%d-%H-%M-%S"` + +python ${code_dir}/search.py \ + --data_path=${data_path}/ptb.pkl \ + --output_dir=${output_dir} \ + --obs_dir=${obs_url} \ + --platform='modelarts' \ + 2>&1 | tee ${output_dir}/${current_time}_train_npu.log + + +#BASE_PATH='/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' +# +#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/search' +# +#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' +# +#args="--output_dir=$OUTPUT_DIR --data_path=$DATA_PATH" +# +##run search +#python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py $args diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh new file mode 100644 index 000000000..2697bfd90 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh @@ -0,0 +1,45 @@ +#!/bin/bash +### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. 
+### Modelarts Platform command for train + +#export ASCEND_GLOBAL_LOG_LEVEL=4 # 日志级别设置 debug级别为0;info 级别为1;warning级别为 2;error级别为3;null级别为4 +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 # plog日志是否打屏 +#export ASCEND_HOST_LOG_FILE_NUM=1000 +#export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=0 +#export ASCEND_GLOBAL_EVENT_ENABLE=0 # 设置事件级别 不开启Event日志级别为0;开启Event日志级别为1 +#export ASCEND_GLOBAL_TRACE_ENABLE=0 +#export PROFILING_MODE=false +#export PROFILING_OPTIONS='{"output":"/tmp/profiling","training_trace":"off","task_trace":"off","aicpu":"on","fp_point":"resnet_model/conv2d/Conv2Dresnet_model/batch_normalization/FusedBatchNormV3_Reduce","bp_point":"gradients/AddN_70","aic_metrics":"PipeUtilization"}' + +export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config +#export ENABLE_FORCE_V2_CONTROL=1 + +code_dir=${1} +data_path=${2} +output_dir=${3} +ckp_path=${4} + +current_time=`date "+%Y-%m-%d-%H-%M-%S"` +FIXED_ARC='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' + +nohup python3 ${code_dir}/fixed.py \ + --data_path=${data_path}/ptb.pkl \ + --output_dir=${output_dir} \ + --fixed_arc='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' \ + --ckp_path=${ckp_path} \ + --platform='modelarts' \ + > nohup1.out 2>&1 & + + +#FIXED_ARC='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' +# +#BASE_PATH = '/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' +# +#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/test' +# +#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' +# +#args ='--fixed_arc=FIXED_ARC --output_dir=$OUTPUT_DIR --data_path=$DATA_PATH' +# +##run test +#python3 /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py $args diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py new file mode 100644 index 000000000..7b59aec44 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
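The --fixed_arc strings used by test-npu.sh and RUN_SEARCH.py are flat sequences of integer pairs: child.py's _rnn_fn derives num_layers = len(sample_arc) // 2, and each pair appears to encode (index of the previous node to attach to, activation-function id drawn from the controller's four functions). A small parsing sketch for illustration (parse_fixed_arc is not a function in this patch):

def parse_fixed_arc(fixed_arc_str):
    # Split a flat arc string into (previous node index, activation id) pairs.
    tokens = [int(t) for t in fixed_arc_str.split()]
    assert len(tokens) % 2 == 0, "arc must contain an even number of integers"
    return [(tokens[i], tokens[i + 1]) for i in range(0, len(tokens), 2)]

# The arc from test-npu.sh describes a 9-node recurrent cell:
# parse_fixed_arc('0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0')
# -> [(0, 2), (1, 0), (2, 0), (3, 0), (4, 2), (5, 3), (5, 0), (6, 0), (7, 0)]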
+ +"""Common utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import re +import sys +import tensorflow.compat.v1 as tf + +gfile = tf.gfile + + +class Logger(object): + """Prints to both STDOUT and a file.""" + + def __init__(self, filepath): + self.terminal = sys.stdout + self.log = gfile.GFile(filepath, 'a+') + + def write(self, message): + self.terminal.write(message) + self.terminal.flush() + self.log.write(message) + self.log.flush() + + def flush(self): + self.terminal.flush() + self.log.flush() + + +def get_lr(curr_step, params, lr_decay_rate): + """Compute learning rate at step depends on `params`.""" + lr = tf.constant(params.learning_rate, dtype=tf.float32) + if 'num_warmup_steps' in params and params.num_warmup_steps > 0: + num_warmup_steps = tf.cast(params.num_warmup_steps, dtype=tf.float32) + step = tf.cast(curr_step, dtype=tf.float32) + warmup_lr = params.learning_rate * step / num_warmup_steps + lr = tf.cond(tf.less(step, num_warmup_steps), lambda: warmup_lr, lambda: lr) + return lr * lr_decay_rate + + +def strip_var_name(var_name): + """Strips variable name of sub-strings blocking variable name matching.""" + # Strip trailing number, e.g. convert + # 'lstm/W_0:0' to 'lstm/W_0'. + var_name = re.sub(r':\d+$', '', var_name) + # Strip partitioning info, e.g. convert + # 'W_0/part_3/Adagrad' to 'W_0/Adagrad'. + var_name = re.sub(r'/part_\d+', '', var_name) + return var_name -- Gitee From 02ae315d60542d46649b5e77f9012fcb8a5e1ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=9F=B9=E4=BA=AE?= Date: Thu, 1 Sep 2022 10:19:40 +0000 Subject: [PATCH 27/27] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20co?= =?UTF-8?q?ntrib=5Fold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- contrib_old/.keep | 0 contrib_old/TensorFlow/.keep | 0 contrib_old/TensorFlow/Research/.keep | 0 contrib_old/TensorFlow/Research/nlp/.keep | 0 .../TensorFlow/Research/nlp/enas/.keep | 0 .../nlp/enas/ENAS_ID2053_for_TensorFlow/.keep | 0 .../enas/ENAS_ID2053_for_TensorFlow/README.md | 211 ------ .../ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py | 36 - .../enas/ENAS_ID2053_for_TensorFlow/bash.py | 4 - .../boot_modelarts.py | 73 -- .../enas/ENAS_ID2053_for_TensorFlow/child.py | 440 ------------ .../ENAS_ID2053_for_TensorFlow/ckpt2pb.py | 81 --- .../ENAS_ID2053_for_TensorFlow/controller.py | 250 ------- .../ENAS_ID2053_for_TensorFlow/data_utils.py | 125 ---- .../enas/ENAS_ID2053_for_TensorFlow/fixed.py | 318 --------- .../ENAS_ID2053_for_TensorFlow/fixed_lib.py | 652 ------------------ .../help_modelarts.py | 93 --- .../enas/ENAS_ID2053_for_TensorFlow/lstm.py | 174 ----- .../ENAS_ID2053_for_TensorFlow/lstm_lib.py | 458 ------------ .../ENAS_ID2053_for_TensorFlow/process.py | 72 -- .../enas/ENAS_ID2053_for_TensorFlow/search.py | 288 -------- .../enas/ENAS_ID2053_for_TensorFlow/search.sh | 36 - .../ENAS_ID2053_for_TensorFlow/test-npu.sh | 45 -- .../enas/ENAS_ID2053_for_TensorFlow/utils.py | 67 -- 24 files changed, 3423 deletions(-) delete mode 100644 contrib_old/.keep delete mode 100644 contrib_old/TensorFlow/.keep delete mode 100644 contrib_old/TensorFlow/Research/.keep delete mode 100644 contrib_old/TensorFlow/Research/nlp/.keep delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/.keep delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep delete mode 100644 
contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh delete mode 100644 contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py diff --git a/contrib_old/.keep b/contrib_old/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/.keep b/contrib_old/TensorFlow/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/Research/.keep b/contrib_old/TensorFlow/Research/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/Research/nlp/.keep b/contrib_old/TensorFlow/Research/nlp/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/Research/nlp/enas/.keep b/contrib_old/TensorFlow/Research/nlp/enas/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md deleted file mode 100644 index 051fc7c8f..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/README.md +++ /dev/null @@ -1,211 +0,0 @@ -### 基本信息 -#### 发布者(Publisher):Huawei -#### 应用领域(Application Domain):NLP -#### 修改时间(Modified) :2018. -#### 框架(Framework):TensorFlow 1.15.0 -#### 模型格式(Model Format):ckpt -#### 精度(Precision):Mixed -#### 处理器(Processor):昇腾910 -#### 应用级别(Categories):Research -#### 描述(Description): enas模型用于ptb数据集的神经网络结构搜索 - -### 概述 -enas是一个快速高效的神经网络架构搜索的方法,使用了子图采样和权重共享的策略,极大的提高了神经网络结构搜索的效率。在数据ptb和cifar-10上都发现了新的网络架构而达到了新的sota. 
- -#### 参考论文:[Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf) -#### 参考代码:[enas](https://github.com/melodyguan/enas) - -### 默认配置 -#### 数据预处理 - - - 输入数据为文本 - - 文本输入格式: id [int] -#### 训练超参数 - - ##### search - - controller baseline decay : 0.999 - - controller entropy weight : 1e-5 - - controller temperature : 5 - - controller learning rate : 5e-5 - - controller num layers : 9 - - controller hidden size : 64 - - controller num functions : 4 - - child batch size : 128 - - child bptt steps : 35 - - num train epochs : 600 - - ##### test - - - child grad bound : 0.25 - - child weight decay : 2e-6 - - child num train epochs :3000 - - child hidden size : 800 - - learning_rate : 20. - -### 支持特性 - -| 特性列表 | 是否支持 | -|------|------| -| 混合精度 | 是 | - -### 混合精度训练 - -昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 - -### 快速上手 -模型的search阶段和test阶段都使用数据集ptb,原始数据需要使用process.py脚本进行处理,也可以在obs://rstg/Dataset/ptb获取。 - -### 代码结构文件 - -|— search.py 搜索模型代码\ -|— child.py 子图模型代码\ -|— fixed.py 架构验证模型代码\ -|— fixed_lib.py\ -|— data_utils.py 数据处理代码\ -|— controller.py 性能评估模型代码\ -|— boot_modelarts.py 模型运行代码\ -|— ... - -### 脚本参数 - -- search:\ ---data_path\ ---output_dir\ ---obs_dir -- test:\ ---data_path\ ---output_dir\ ---fixed_arc\ ---ckp_path - - - -### 训练过程 -在论文的参数设置下,GPU训练精度和速度可以达到要求; -NPU的训练精度和速度还未达标。 - -- #### GPU - - ##### search -epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\ -valid_ppl=1800.73\ -epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\ -valid_ppl=892.87\ -epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\ -valid_ppl=843.70\ -epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g\|=0.31 mins=3.25\ -valid_ppl=898.45\ -epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\ -valid_ppl=774.25\ -epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\ -valid_ppl=622.82\ -epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\ -valid_ppl=606.77\ -epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\ -valid_ppl=579.69\ -epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\ -valid_ppl=520.63\ -epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\ -...\ -valid_ppl=162.39\ -epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70 -- ##### test -epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\ -epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\ -valid_ppl=463.03\ -epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\ -epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\ -valid_ppl=339.76\ -epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\ -epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\ -valid_ppl=271.71\ -epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\ -epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\ -valid_ppl=245.00\ -epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\ -epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\ -valid_ppl=213.10\ -epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\ -epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\ -epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\ -valid_ppl=209.90\ -epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 
mins=4.44\ -epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\ -valid_ppl=181.99\ -epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\ -epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\ -valid_ppl=176.79\ -epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\ -epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\ -valid_ppl=166.62\ -...\ -epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\ -epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\ -valid_ppl=61.17\ -epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\ -epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\ -epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\ -valid_ppl=61.17\ -epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\ -...\ -step=70000 test_ppl=59.15\ -step=71000 test_ppl=59.03\ -step=72000 test_ppl=59.06\ -step=73000 test_ppl=58.41\ -step=74000 test_ppl=58.24\ -step=75000 test_ppl=58.12\ -step=76000 test_ppl=58.15\ -step=77000 test_ppl=58.29\ -step=78000 test_ppl=58.36\ -step=79000 test_ppl=58.50\ -step=80000 test_ppl=58.43\ -step=81000 test_ppl=58.72\ -step=82000 test_ppl=58.52\ -step=82429 test_ppl=58.64 - -- #### NPU - - ##### test -epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ -epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ -valid_ppl=389.49\ -epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ -epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ -valid_ppl=298.25\ -epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ -epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ -epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ -valid_ppl=236.61\ -epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ -epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ -valid_ppl=252.75\ -epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ -epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ -valid_ppl=197.03\ -epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ -epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ -epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ -valid_ppl=191.64\ -epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ -epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ -valid_ppl=200.02\ -epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ -epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ -valid_ppl=201.46\ -epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ -epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ -epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ -valid_ppl=175.82\ -epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ -epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 
mins=100.67-min/step=0.0206\ -valid_ppl=209.94\ -epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ -epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ -epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ -valid_ppl=167.50\ -...\ -valid_ppl=112.40\ -epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ -epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ -valid_ppl=113.40\ -epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ -epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ -epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ -valid_ppl=113.22\ -epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py deleted file mode 100644 index 4291e14e4..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py +++ /dev/null @@ -1,36 +0,0 @@ -from npu_bridge.npu_init import * -import os -import sys, getopt - - -def main(argv): - # print(argv) - # argv_ = ['-t', 'search'] - runType = "" - try: - opts, args = getopt.getopt(argv, "ht:", ["trun="]) - except getopt.GetoptError: - print("getopt.GetoptError!!") - print("useage: (sudo) python(3) pythonFileName.py -t ") - sys.exit(2) - for opt, arg in opts: - if opt == '-h': - print("useage: pythonFileName.py -t ") - sys.exit() - elif opt in ("-t", "--trun"): - runType = arg - if runType == "search": - print(f'runType={runType}!\n') - os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/search --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") - elif runType == "test-npu": - print(f'runType={runType}!\n') - os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/test --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") - # os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc = '0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=$(pwd)/output/test --data_path=$(pwd)/ptb/ptb.pkl") - # print("this part is writing...") - # pass - else: - print("This runType is invaild!!!") - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py deleted file mode 100644 index eaf741434..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/bash.py +++ /dev/null @@ -1,4 +0,0 @@ -from npu_bridge.npu_init import * -import os - -os.system("bash /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.sh") \ No newline at end of 
file diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py deleted file mode 100644 index c4532fa3f..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/boot_modelarts.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This is the boot file for ModelArts platform. -Firstly, the train datasets are copyed from obs to ModelArts. -Then, the string of train shell command is concated and using 'os.system()' to execute -""" -import os -import time -import numpy as np -import argparse -from help_modelarts import obs_data2modelarts -# import moxing as mox -print(os.system('env')) -print(os.system("python3 --version")) -#print(os.system("pip install dlib")) -print("===>>>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh") -os.environ['ASCEND_GLOBAL_LOG_LEVEL'] = '4' - -#data_dir = "/root/.keras/models/" -if __name__ == '__main__': - ## Note: the code dir is not the same as work dir on ModelArts Platform!!! - code_dir = '.' - work_dir = os.getcwd() - print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir)) - output_path = "./output/test/" + str(time.strftime('%Y%m%d_%H%M%S')) - parser = argparse.ArgumentParser() - parser.add_argument("--train_url", type=str, default=output_path) - parser.add_argument("--data_url", type=str, default="./ptb") - parser.add_argument("--ckp_path", type=str, default="./output/test/20220715_182127/") - # parser.add_argument("--ckp_path", type=str, default="obs://rstg/workplace_ENAS/lm-train/MA-new-enas-05-23-19-34/output/result/") - # parser.add_argument("--modelarts_data_dir", type=str, default="/cache/ptb-dataset") - # parser.add_argument("--modelarts_result_dir", type=str, default="/cache/result") - config = parser.parse_args() - #if not os.path.exists(data_dir): - # os.makedirs(data_dir) - # print("=nvvvvvvvvvvvvvfdsfdsfdvnn") - - #os.system("pip install -i http://repo.myhuaweicloud.com/repository/pypi/simple pexpect==4.2.1") - #os.system("pip install torch") - #os.system("pip install absl-py") - print("--------config---------hhhhhhhhhhhggggggggggggggggkkkkkkkkkkkkkkkkkkkkkkkkkgg-") - for k in list(vars(config).keys()): - print("key:{}: value:{}".format(k, vars(config)[k])) - print("--------config----------") - - ## copy dataset from obs to modelarts - # obs_data2modelarts(config) - # ret = mox.file.exists('obs://rstg/MA-new-p/') - # retm = mox.file.make_dirs('obs://rstg/MA-new-p/') - # print("bbbbbbbbbbbbbbbbbbbbbbbbb ",retm) - # print("config.modelarts_result_dir ", config.modelarts_result_dir) - ## start to train on Modelarts platform - # if not os.path.exists(config.modelarts_result_dir): - # os.makedirs(config.modelarts_result_dir) - # print("6666666666666666666666666666666666666666 ", config.modelarts_result_dir) - bash_header = os.path.join(code_dir, 'test-npu.sh') - # bash_header = 
os.path.join(code_dir, 'search.sh') - arg_url = '%s %s %s %s' % (code_dir, config.data_url, config.train_url, config.ckp_path) - bash_command = 'bash %s %s' % (bash_header, arg_url) - print("bash command:", bash_command) - os.system(bash_command) diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py deleted file mode 100644 index 09b6d878d..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/child.py +++ /dev/null @@ -1,440 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""AWD ENAS fixed model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * -from npu_bridge.estimator.npu import npu_convert_dropout - - -import numpy as np -import tensorflow.compat.v1 as tf -import tensorflow.keras as keras - -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils - - -flags = tf.app.flags -FLAGS = flags.FLAGS - - -flags.DEFINE_integer('child_batch_size', 128, '') -flags.DEFINE_integer('child_bptt_steps', 35, '') -flags.DEFINE_integer('num_train_epochs', 600, '') - - -def _gen_mask(shape, drop_prob): - """Generate a droppout mask.""" - keep_prob = 1. - drop_prob - mask = tf.random_uniform(shape, dtype=tf.float32) - mask = tf.floor(mask + keep_prob) / keep_prob - return mask - - -def _rnn_fn(sample_arc, x, prev_s, w_prev, w_skip, input_mask, layer_mask, - params): - """Multi-layer LSTM. - - Args: - sample_arc: [num_layers * 2], sequence of tokens representing architecture. - x: [batch_size, num_steps, hidden_size]. - prev_s: [batch_size, hidden_size]. - w_prev: [2 * hidden_size, 2 * hidden_size]. - w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. - input_mask: `[batch_size, hidden_size]`. - layer_mask: `[batch_size, hidden_size]`. - params: hyper-params object. - - Returns: - next_s: [batch_size, hidden_size]. - all_s: [[batch_size, num_steps, hidden_size] * num_layers]. - """ - batch_size = params.batch_size - # num_steps = 35 - num_steps = tf.shape(x)[1] - print("num_steps:{}/{}".format(num_steps, num_steps)) - - num_layers = len(sample_arc) // 2 - set_shape = x.get_shape().as_list() - print("set_shape:{}".format(set_shape)) - # 修改点 - # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=True) - all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) - # extract the relevant variables, so that you only do L2-reg on them. 
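# [Illustrative sketch -- not from this patch] The loop just below unpacks
# `sample_arc` into (prev_idx, func_idx) pairs, one pair per node: prev_idx
# selects which earlier node feeds this one, func_idx selects one of
# {tanh, relu, sigmoid, identity}, and the cell output is the mean of all
# node outputs. A self-contained NumPy toy of a single step, with
# hypothetical names and one weight matrix per node (the real code slices a
# per-layer weight tensor by [func_idx, prev_idx]):
import numpy as np

def toy_cell_step(sample_arc, inp, prev_s, w_prev, w_skip):
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    funcs = [np.tanh, lambda v: np.maximum(v, 0.0), sigmoid, lambda v: v]
    h, t = np.split(np.concatenate([inp, prev_s], axis=1) @ w_prev, 2, axis=1)
    s = prev_s + sigmoid(t) * (np.tanh(h) - prev_s)          # highway-style update
    layers = [s]
    for node in range(len(sample_arc) // 2):
        prev_idx, func_idx = sample_arc[2 * node], sample_arc[2 * node + 1]
        h, t = np.split(layers[prev_idx] @ w_skip[node], 2, axis=1)
        s = layers[prev_idx] + sigmoid(t) * (funcs[func_idx](h) - layers[prev_idx])
        layers.append(s)
    return sum(layers[1:]) / (len(sample_arc) // 2)          # average of node outputs

rng = np.random.default_rng(0)
out = toy_cell_step([0, 1, 1, 3],                            # (prev=0, relu), (prev=1, identity)
                    inp=rng.normal(size=(3, 4)), prev_s=np.zeros((3, 4)),
                    w_prev=0.1 * rng.normal(size=(8, 8)),
                    w_skip=[0.1 * rng.normal(size=(4, 8)) for _ in range(2)])
print(out.shape)                                             # (3, 4): [batch, hidden]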
- u_skip = [] - start_idx = 0 - - for layer_id in range(num_layers): - prev_idx = sample_arc[start_idx] - func_idx = sample_arc[start_idx + 1] - u_skip.append(w_skip[layer_id][func_idx, prev_idx]) - start_idx += 2 - w_skip = u_skip - var_s = [w_prev] + w_skip[1:] - - def _select_function(h, function_id): - h = tf.stack([tf.tanh(h), tf.nn.relu(h), tf.sigmoid(h), h], axis=0) - h = h[function_id] - return h - - def _condition(step, *unused_args): - return tf.less(step, num_steps) - - def _body(step, prev_s, all_s): - """Body function.""" - inp = x[:, step, :] - # print("inp:{}".format(inp)) - - # important change: first input uses a tanh() - if layer_mask is not None: - assert input_mask is not None - ht = tf.matmul(tf.concat([inp * input_mask, prev_s * layer_mask], - axis=1), w_prev) - else: - ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) - # print("ht:{}".format(ht)) - h, t = tf.split(ht, 2, axis=1) - h = tf.tanh(h) - t = tf.sigmoid(t) - s = prev_s + t * (h - prev_s) - layers = [s] - # print("layer:{}".format(layers)) - - start_idx = 0 - used = [] - for layer_id in range(num_layers): - prev_idx = sample_arc[start_idx] - func_idx = sample_arc[start_idx + 1] - # print("layer_id/[prev_idx, func_idx]:{}/[{}, {}]".format(layer_id, prev_idx, func_idx)) - used.append(tf.one_hot(prev_idx, depth=num_layers, dtype=tf.int32)) - prev_s = tf.stack(layers, axis=0)[prev_idx] - if layer_mask is not None: - ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) - else: - ht = tf.matmul(prev_s, w_skip[layer_id]) - h, t = tf.split(ht, 2, axis=1) - - h = _select_function(h, func_idx) - t = tf.sigmoid(t) - s = prev_s + t * (h - prev_s) - # print("s before set_shape:{}".format(s)) - s.set_shape([batch_size, params.hidden_size]) - # print("s after set_shape:{}".format(s)) - layers.append(s) - start_idx += 2 - # print("layers:{}\ns:{}".format(layers, s)) - - next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) - # print("step:{}\nnext_s:{}".format(step, next_s)) - # all_s = all_s.write(step, next_s) - t = tf.stack([next_s]) - # print("t:{}".format(t)) - all_s = tf.concat([all_s, t], 0) - # print("step:{}-all_s:{}".format(step, all_s)) - # all_s_my[step] = next_s - - return step + 1, next_s, all_s - - loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] - _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) - - all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) - # all_s_my.set_shape([_, batch_size, params.hidden_size]) - # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) - # print("stack_all_s:{}".format(all_s_my)) - - all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) - # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) - # print("all_s:{}".format(all_s)) - - return next_s, all_s, var_s - - -def _set_default_params(params): - """Set default hyper-parameters.""" - params.add_hparam('alpha', 0.0) # activation L2 reg - params.add_hparam('beta', 1.) 
# activation slowness reg - params.add_hparam('best_valid_ppl_threshold', 5) - - params.add_hparam('batch_size', FLAGS.child_batch_size) - params.add_hparam('bptt_steps', FLAGS.child_bptt_steps) - - # for dropouts: dropping rate, NOT keeping rate - params.add_hparam('drop_e', 0.10) # word - params.add_hparam('drop_i', 0.20) # embeddings - params.add_hparam('drop_x', 0.75) # input to RNN cells - params.add_hparam('drop_l', 0.25) # between layers - params.add_hparam('drop_o', 0.75) # output - params.add_hparam('drop_w', 0.00) # weight - - params.add_hparam('grad_bound', 0.1) - params.add_hparam('hidden_size', 200) - params.add_hparam('init_range', 0.04) - params.add_hparam('learning_rate', 20.) - params.add_hparam('num_train_epochs', FLAGS.num_train_epochs) - params.add_hparam('vocab_size', 10000) - - params.add_hparam('weight_decay', 8e-7) - return params - - -class LM(object): - """Language model.""" - - def __init__(self, params, controller, x_train, x_valid, name='child'): - print('-' * 80) - print('Building LM') - - self.params = _set_default_params(params) - self.controller = controller - self.sample_arc = tf.unstack(controller.sample_arc) - self.name = name - - # train data - (self.x_train, self.y_train, - self.num_train_batches, self.reset_start_idx, - self.should_reset, - self.base_bptt, self.bptt_rate) = data_utils.input_producer(x_train, params.batch_size, params.bptt_steps, random_len=True) - params.add_hparam('num_train_steps', self.num_train_batches * params.num_train_epochs) - # self.x_train.set_shape([params.batch_size, self.base_bptt]) - # print("self.x_train:{}".format(self.x_train.get_shape().as_list())) - - # valid data - (self.x_valid, self.y_valid, - self.num_valid_batches) = data_utils.input_producer(x_valid, params.batch_size, params.bptt_steps) - # with tf.control_dependencies([self.base_bptt]): - self._build_params() - self._build_train() - self._build_valid() - - def _build_params(self): - """Create model parameters.""" - - print('-' * 80) - print('Building model params') - initializer = tf.initializers.random_uniform(minval=-self.params.init_range, - maxval=self.params.init_range) - num_functions = self.params.controller_num_functions - num_layers = self.params.controller_num_layers - hidden_size = self.params.hidden_size - # >>> add code >>> - with tf.variable_scope(self.name, initializer=initializer): - # >>> add code >>> - with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): - w_emb = tf.get_variable('w', [self.params.vocab_size, hidden_size]) - # >>> add code >>> - # 修改点 - dropped_w_emb = npu_ops.dropout(w_emb, 1-self.params.drop_e) - # dropped_w_emb = tf.layers.dropout( - # w_emb, self.params.drop_e, [self.params.vocab_size, 1], - # training=True) - with tf.variable_scope('rnn_cell', reuse=tf.AUTO_REUSE): - w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) - i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) - h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) - mask = tf.concat([i_mask, h_mask], axis=0) - dropped_w_prev = w_prev * mask - w_skip, dropped_w_skip = [], [] - for layer_id in range(1, num_layers + 1): - with tf.variable_scope('layer_{}'.format(layer_id)): - w = tf.get_variable( - 'w', [num_functions, layer_id, hidden_size, 2 * hidden_size]) - mask = _gen_mask([1, 1, hidden_size, 2 * hidden_size], - self.params.drop_w) - dropped_w = w * mask - w_skip.append(w) - dropped_w_skip.append(dropped_w) - with tf.variable_scope('init_states', reuse=tf.AUTO_REUSE): - with 
tf.variable_scope('batch'): - init_shape = [self.params.batch_size, hidden_size] - batch_prev_s = tf.get_variable( - 's', init_shape, dtype=tf.float32, trainable=False) - zeros = np.zeros(init_shape, dtype=np.float32) - batch_reset = tf.assign(batch_prev_s, zeros) - - self.num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() - if v.name.startswith(self.name)]) # .value - print('All children have {} params'.format(self.num_params)) - - num_params_per_child = 0 - for v in tf.trainable_variables(): - if v.name.startswith(self.name): - if 'rnn_cell' in v.name: - num_params_per_child += v.shape[-2] * v.shape[-1] - else: - num_params_per_child += np.prod([d for d in v.shape]) - print('Each child has {0} params'.format(num_params_per_child)) - - self.batch_init_states = { - 's': batch_prev_s, - 'reset': batch_reset, - } - self.train_params = { - 'w_emb': dropped_w_emb, - 'w_prev': dropped_w_prev, - 'w_skip': dropped_w_skip, - 'w_soft': w_emb, - } - self.eval_params = { - 'w_emb': w_emb, - 'w_prev': w_prev, - 'w_skip': w_skip, - 'w_soft': w_emb, - } - - def _forward(self, x, y, model_params, init_states, is_training=False): - """Computes the logits. - - Args: - x: [batch_size, num_steps], input batch. - y: [batch_size, num_steps], output batch. - model_params: a `dict` of params to use. - init_states: a `dict` of params to use. - is_training: if `True`, will apply regularizations. - - Returns: - loss: scalar, cross-entropy loss - """ - w_emb = model_params['w_emb'] - w_prev = model_params['w_prev'] - w_skip = model_params['w_skip'] - w_soft = model_params['w_soft'] - prev_s = init_states['s'] - - # bug点 - # - print("before [embedding_lookup], x={}".format(x)) - emb = tf.nn.embedding_lookup(w_emb, x) - batch_size = self.params.batch_size - hidden_size = self.params.hidden_size - sample_arc = self.sample_arc - if is_training: - # >>> add code >>> - emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) - # >>> add code >>> - # 修改点 - # emb = tf.layers.dropout( - # emb, self.params.drop_i, [batch_size, 1, hidden_size], training=True) - - input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) - layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) - else: - input_mask = None - layer_mask = None - - out_s, all_s, var_s = _rnn_fn(sample_arc, emb, prev_s, w_prev, w_skip, - input_mask, layer_mask, params=self.params) - - top_s = all_s - if is_training: - # >>> add code >>> - # 修改点 - - top_s = npu_ops.dropout(top_s, 1-self.params.drop_o) # ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) - # >>> add code >>> - # top_s = tf.layers.dropout( - # top_s, self.params.drop_o, - # [self.params.batch_size, 1, self.params.hidden_size], training=True) - - carry_on = [tf.assign(prev_s, out_s)] - top_s_shape = top_s.get_shape().as_list() - # print("top_s_shape:{}".format(top_s_shape)) - # print("w_soft:{}".format(w_soft)) - logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) - # logits = tf.matmul(top_s, tf.transpose(w_soft)) - # print("logits:{}".format(logits)) - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, - logits=logits) - # print("loss:{}".format(loss)) - loss = tf.reduce_mean(loss) - # print("_forward/loss:{}".format(loss)) - reg_loss = loss # `loss + regularization_terms` is for training only - if is_training: - # L2 weight reg - self.l2_reg_loss = tf.add_n([tf.nn.l2_loss(w ** 2) for w in var_s]) - reg_loss += self.params.weight_decay * self.l2_reg_loss - - # activation L2 reg - 
reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) - - # activation slowness reg - reg_loss += self.params.beta * tf.reduce_mean( - (all_s[:, 1:, :] - all_s[:, :-1, :]) ** 2) - # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) - with tf.control_dependencies(carry_on): - loss = tf.identity(loss) - if is_training: - reg_loss = tf.identity(reg_loss) - # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) - return reg_loss, loss - - def _build_train(self): - """Build training ops.""" - print('-' * 80) - print('Building train graph') - reg_loss, loss = self._forward(self.x_train, self.y_train, - self.train_params, self.batch_init_states, - is_training=True) - - tf_vars = [v for v in tf.trainable_variables() - if v.name.startswith(self.name)] - # print("reg_loss:{}".format(reg_loss)) - # print("tf_vars:{}".format(tf_vars)) - global_step = tf.train.get_or_create_global_step() - lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / - tf.cast(self.params.bptt_steps, dtype=tf.float32)) - learning_rate = utils.get_lr(global_step, self.params) * lr_scale - if self.params.grad_bound: - # grads = tf.gradients(reg_loss, tf_vars) - # clipped_grads, _ = tf.clip_by_global_norm(grads, self.params.grad_bound) - # clipped_grads, grad_norm = tf.clip_by_global_norm(grads, self.params.grad_bound) - # print("clipped_grads:{}".format(clipped_grads)) - - grads = tf.gradients(reg_loss, tf_vars) - # print("grads:{}".format(grads)) - clipped_grads, grad_norm = tf.clip_by_global_norm(grads, - self.params.grad_bound) - optimizer = tf.train.GradientDescentOptimizer(learning_rate) - # print("optimizer:{}".format(optimizer)) - train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), - global_step=global_step) - # print("train_op:{}".format(train_op)) - self.train_loss = loss - self.train_op = train_op - self.grad_norm = grad_norm - self.learning_rate = learning_rate - - def _build_valid(self): - print('Building valid graph') - _, loss = self._forward(self.x_valid, self.y_valid, - self.eval_params, self.batch_init_states) - self.valid_loss = loss - self.rl_loss = loss - - def eval_valid(self, sess): - - """Eval 1 round on valid set.""" - total_loss = 0 - for _ in range(self.num_valid_batches): - sess.run(self.batch_init_states['reset']) - total_loss += sess.run(self.valid_loss) - valid_ppl = np.exp(total_loss / self.num_valid_batches) - print('valid_ppl={0:<.2f}'.format(valid_ppl)) - - return valid_ppl diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py deleted file mode 100644 index 2367ea1f0..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/ckpt2pb.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: UTF-8 -*- -import tensorflow.compat.v1 as tf - -# from create_tf_record import * -from tensorflow.python.framework import graph_util -from tensorflow.python.tools import freeze_graph - -from npu_bridge.npu_init import * - -def freeze_graph(input_checkpoint, output_graph): - ''' - :param input_checkpoint: - :param output_graph: PB模型保存路径 - :return: - ''' - # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 - # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 - - # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 - output_node_names = "output" - saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) - - with tf.Session() as sess: - saver.restore(sess, input_checkpoint) # 恢复图并得到数据 - output_graph_def 
= graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 - sess=sess, - input_graph_def=sess.graph_def, # 等于:sess.graph_def - output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 - - with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 - f.write(output_graph_def.SerializeToString()) # 序列化输出 - print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 - - # for op in sess.graph.get_operations(): - # print(op.name, op.values()) - - -def freeze_graph2(input_checkpoint, output_graph): - ''' - :param input_checkpoint: - :param output_graph: PB模型保存路径 - :return: - ''' - # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 - # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 - - # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 - output_node_names = "InceptionV3/Logits/SpatialSqueeze" - saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) - graph = tf.get_default_graph() # 获得默认的图 - input_graph_def = graph.as_graph_def() # 返回一个序列化的图代表当前的图 - - with tf.Session() as sess: - saver.restore(sess, input_checkpoint) # 恢复图并得到数据 - output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 - sess=sess, - input_graph_def=input_graph_def, # 等于:sess.graph_def - output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 - - with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 - f.write(output_graph_def.SerializeToString()) # 序列化输出 - print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 - - # for op in graph.get_operations(): - # print(op.name, op.values()) - - -if __name__ == '__main__': - # 输入ckpt模型路径 - input_checkpoint = './output/test/20220709_185707/model.ckpt-181200' - # 输出pb模型的路径 - out_pb_path = "models_pb/enas-lm-infer2.pb" - # 调用freeze_graph将ckpt转为pb - freeze_graph(input_checkpoint, out_pb_path) - print("Done pb!") - - # 测试pb模型 - image_path = 'test_image/animal.jpg' - # freeze_graph_test(pb_path=out_pb_path, image_path=image_path) - - diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py deleted file mode 100644 index cb13d49ab..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/controller.py +++ /dev/null @@ -1,250 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
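# [Illustrative sketch -- not from this patch] The controller defined in this
# file emits an architecture as a flat token sequence: for node layer_id it
# first samples a skip index in [0, layer_id) (which earlier node to read)
# and then a function id in [0, controller_num_functions) for
# tanh / relu / sigmoid / identity. A uniform stand-in with hypothetical
# names -- no LSTM, attention or REINFORCE -- that only reproduces the output
# format consumed by the child model and by --fixed_arc:
import random

def toy_sample_arc(num_layers, num_functions=4, seed=None):
    rng = random.Random(seed)
    arc = []
    for layer_id in range(1, num_layers + 1):
        arc.append(rng.randrange(layer_id))       # skip index: an earlier node
        arc.append(rng.randrange(num_functions))  # activation function id
    return arc

print(' '.join(str(v) for v in toy_sample_arc(num_layers=9, seed=0)))
# same shape as the fixed_arc strings used elsewhere, e.g. '0 2 1 0 2 0 ...'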
- -"""ENAS controller.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import numpy as np -import tensorflow.compat.v1 as tf - -flags = tf.app.flags -FLAGS = flags.FLAGS - -flags.DEFINE_float('controller_baseline_dec', 0.999, '') -flags.DEFINE_float('controller_entropy_weight', 1e-5, '') -flags.DEFINE_float('controller_temperature', 5., '') -flags.DEFINE_float('controller_tanh_constant', 2.25, '') -flags.DEFINE_float('controller_learning_rate', 5e-5, '') -flags.DEFINE_integer('controller_num_layers', 9, '') - -REWARD_CONSTANT = 80.0 - - -def _build_train_op(loss, tf_vars, learning_rate, train_step, num_aggregate): - """Build training ops from `loss` tensor.""" - optim = tf.train.AdamOptimizer(learning_rate) - optim = tf.train.SyncReplicasOptimizer( - optim, replicas_to_aggregate=num_aggregate, total_num_replicas=1, use_locking=True) - grads = tf.gradients(loss, tf_vars) - train_op = optim.apply_gradients(zip(grads, tf_vars), global_step=train_step) - grad_norm = tf.global_norm(grads) - return train_op, optim, grad_norm - - -def _lstm(x, prev_c, prev_h, w_lstm): - """LSTM subgraph.""" - ifog = tf.matmul(tf.concat([x, prev_h], axis=1), w_lstm) - i, f, o, g = tf.split(ifog, 4, axis=1) - i = tf.sigmoid(i) - f = tf.sigmoid(f) - o = tf.sigmoid(o) - g = tf.tanh(g) - next_c = i * g + f * prev_c - next_h = o * tf.tanh(next_c) - return next_c, next_h - - -def _set_default_params(params): - """Add controller's default params.""" - params.add_hparam('controller_hidden_size', 64) - params.add_hparam('controller_num_layers', FLAGS.controller_num_layers) - params.add_hparam('controller_num_functions', 4) # tanh, relu, sigmoid, iden - - params.add_hparam('controller_baseline_dec', FLAGS.controller_baseline_dec) - params.add_hparam('controller_entropy_weight', - FLAGS.controller_entropy_weight) - params.add_hparam('controller_temperature', FLAGS.controller_temperature) - params.add_hparam('controller_tanh_constant', FLAGS.controller_tanh_constant) - params.add_hparam('controller_learning_rate', FLAGS.controller_learning_rate) - params.add_hparam('controller_num_aggregate', 10) - params.add_hparam('controller_num_train_steps', 25) - - return params - - -class Controller(object): - """ENAS controller. 
Samples architectures and creates training ops.""" - - def __init__(self, params, name='controller'): - print('-' * 80) - print('Create a controller') - self.params = _set_default_params(params) - self.name = name - self._build_params() - self._build_sampler() - - def _build_params(self): - """Create TF parameters.""" - initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) - num_funcs = self.params.controller_num_functions # 4 - hidden_size = self.params.controller_hidden_size # 64 - with tf.variable_scope(self.name, initializer=initializer): - with tf.variable_scope('lstm'): - self.w_lstm = tf.get_variable('w', [2 * hidden_size, 4 * hidden_size]) - - with tf.variable_scope('embedding'): - self.g_emb = tf.get_variable('g', [1, hidden_size]) - self.w_emb = tf.get_variable('w', [num_funcs, hidden_size]) - - with tf.variable_scope('attention'): - self.attn_w_1 = tf.get_variable('w_1', [hidden_size, hidden_size]) - self.attn_w_2 = tf.get_variable('w_2', [hidden_size, hidden_size]) - self.attn_v = tf.get_variable('v', [hidden_size, 1]) - - num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() - if v.name.startswith(self.name)]) - print('Controller has {0} params'.format(num_params)) - - def _build_sampler(self): - """Build the sampler ops and the log_prob ops.""" - hidden_size = self.params.controller_hidden_size - num_layers = self.params.controller_num_layers - - arc_seq = [] - sample_log_probs = [] - sample_entropy = [] - all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)] - all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)] - - # sampler ops - inputs = self.g_emb # ??? - prev_c = tf.zeros([1, hidden_size], dtype=tf.float32) - prev_h = tf.zeros([1, hidden_size], dtype=tf.float32) - - inputs = self.g_emb - for layer_id in range(1, num_layers + 1): - next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) - prev_c, prev_h = next_c, next_h - all_h.append(next_h) - all_h_w.append(tf.matmul(next_h, self.attn_w_1)) - - query = tf.matmul(next_h, self.attn_w_2) - query = query + tf.concat(all_h_w[:-1], axis=0) - query = tf.tanh(query) - logits = tf.matmul(query, self.attn_v) - logits = tf.reshape(logits, [1, layer_id]) - - if self.params.controller_temperature: - logits /= self.params.controller_temperature - if self.params.controller_tanh_constant: - logits = self.params.controller_tanh_constant * tf.tanh(logits) - diff = tf.cast(layer_id - tf.range(0, layer_id), tf.float32) ** 2 - logits -= tf.reshape(diff, [1, layer_id]) / 6.0 - skip_index = tf.random.categorical(logits, 1) - skip_index = tf.cast(skip_index, tf.int32) - skip_index = tf.reshape(skip_index, [1]) - arc_seq.append(skip_index) - - log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=skip_index) - sample_log_probs.append(log_prob) - - entropy = log_prob * tf.exp(-log_prob) - sample_entropy.append(tf.stop_gradient(entropy)) - - inputs = tf.nn.embedding_lookup( - tf.concat(all_h[:-1], axis=0), skip_index) - inputs /= (0.1 + tf.to_float(layer_id - skip_index)) - - next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) - prev_c, prev_h = next_c, next_h - logits = tf.matmul(next_h, self.w_emb, transpose_b=True) - if self.params.controller_temperature: - logits /= self.params.controller_temperature - if self.params.controller_tanh_constant: - logits = self.params.controller_tanh_constant * tf.tanh(logits) - func = tf.multinomial(logits, 1) - func = tf.to_int32(func) - func = tf.reshape(func, [1]) - arc_seq.append(func) - log_prob = 
tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=func) - sample_log_probs.append(log_prob) - entropy = log_prob * tf.exp(-log_prob) - sample_entropy.append(tf.stop_gradient(entropy)) - inputs = tf.nn.embedding_lookup(self.w_emb, func) - - arc_seq = tf.concat(arc_seq, axis=0) - self.sample_arc = arc_seq - - self.sample_log_probs = tf.concat(sample_log_probs, axis=0) - self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs)) - - sample_entropy = tf.concat(sample_entropy, axis=0) - self.sample_entropy = tf.reduce_sum(sample_entropy) - - self.all_h = all_h - - def build_trainer(self, child_model): - """Build the train ops by connecting Controller with a Child.""" - # actor - self.valid_loss = tf.to_float(child_model.rl_loss) - self.valid_loss = tf.stop_gradient(self.valid_loss) - self.valid_ppl = tf.exp(self.valid_loss) - self.reward = REWARD_CONSTANT / self.valid_ppl - - if self.params.controller_entropy_weight: - self.reward += self.params.controller_entropy_weight * self.sample_entropy - - # or baseline - self.sample_log_probs = tf.reduce_sum(self.sample_log_probs) - self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) - baseline_update = tf.assign_sub(self.baseline, - ((1 - self.params.controller_baseline_dec) * - (self.baseline - self.reward))) - - with tf.control_dependencies([baseline_update]): - self.reward = tf.identity(self.reward) - self.loss = self.sample_log_probs * (self.reward - self.baseline) - - self.train_step = tf.Variable( - 0, dtype=tf.int32, trainable=False, name='train_step') - tf_vars = [var for var in tf.trainable_variables() - if var.name.startswith(self.name)] - - self.train_op, self.optimizer, self.grad_norm = _build_train_op( - loss=self.loss, - tf_vars=tf_vars, - learning_rate=self.params.controller_learning_rate, - train_step=self.train_step, - num_aggregate=self.params.controller_num_aggregate) - - def train(self, sess, reset_op, log_every=10): - """Train the controller for `num_steps`.""" - print('-' * 80) - print('Training controller') - num_steps = (self.params.controller_num_aggregate * - self.params.controller_num_train_steps) - run_ops = [self.sample_arc, - self.sample_entropy, - self.reward, - self.baseline, - self.train_op] - - for step in range(num_steps): - arc, ent, reward, baseline, _ = sess.run(run_ops) - sess.run(reset_op) - if step % log_every == 0: - log_string = 'step={0:<5d}'.format(step) - log_string += ' ent={0:<7.3f}'.format(ent) - log_string += ' ppl={0:<7.2f}'.format(REWARD_CONSTANT / reward) - log_string += ' rw={0:<7.4f}'.format(reward) - log_string += ' bl={0:<7.4f}'.format(baseline) - log_string += ' arc=[{0}]'.format(' '.join([str(v) for v in arc])) - print(log_string) diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py deleted file mode 100644 index 6d767073c..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/data_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Load picked Penn Treebank data.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -# from npu_bridge.npu_init import * - -import numpy as np -import tensorflow.compat.v1 as tf - - -def input_producer(raw_data, batch_size, num_steps, shuffle=False, - randomize=False, random_len=False): - """Produces graph-based input for Penn Treebank. - - Args: - raw_data: np tensor of size [num_words]. - batch_size: self-explained. - num_steps: number of BPTT steps. - shuffle: whether to shuffle sentences. - randomize: use random segments instead of the continuous corpus. - random_len: random sequence len. - - Returns: - If `random_len` is set, return op that represents whether we have reached - the end of a sequence. - Otherwise, return number of batches in an epoch. - """ - print("raw_data_size:{}".format(np.size(raw_data))) - print("num_steps:{}".format(num_steps)) - batch_len = np.size(raw_data) // batch_size - num_batches_per_epoch = ((np.size(raw_data) // batch_size) - 1) // num_steps - print("num_batches_per_epoch:{}".format(num_batches_per_epoch)) - raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32) - - # data_len = tf.size(raw_data) - - - print("batch_len:{}".format(batch_len)) - data = tf.reshape(raw_data[0: batch_size * batch_len], - [batch_size, batch_len]) - - epoch_size = (batch_len - 1) // num_steps - with tf.device('/cpu:0'): - epoch_size = tf.identity(epoch_size, name='epoch_size') - - if random_len: - start_idx = tf.Variable(0, name='start_idx', dtype=tf.int32,trainable=False) - # start_idx = 0 - base_bptt = tf.cond( - tf.random_uniform(shape=(), minval=0., maxval=1.) < 0.95, - lambda: tf.cast(num_steps, dtype=tf.float32), - lambda: tf.cast(num_steps, dtype=tf.float32) / 2.) 
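# [Illustrative sketch -- not from this patch] The base_bptt draw above and
# the seq_len draw just below implement a variable-length BPTT schedule: with
# probability 0.95 the base segment length is `num_steps`, otherwise half of
# it; the actual length is then sampled around that base (truncated normal
# with stddev 5 in the real code) and clipped to at most num_steps + 20 and
# to the remaining batch length. A plain-NumPy mirror with hypothetical names
# (an ordinary normal draw stands in for the truncated one):
import numpy as np

def toy_seq_len(num_steps, batch_len, start_idx, rng):
    base = float(num_steps) if rng.uniform() < 0.95 else num_steps / 2.0
    seq_len = int(rng.normal(loc=base, scale=5.0))
    return min(seq_len, num_steps + 20, batch_len - start_idx - 1)

rng = np.random.default_rng(0)
print([toy_seq_len(num_steps=35, batch_len=1000, start_idx=0, rng=rng)
       for _ in range(5)])                      # a few sampled segment lengths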
- # base_bptt = int(tf.cond( - # tf.greater_equal(0.95, np.random.uniform(100)/100), - # lambda:num_steps / 1., - # lambda:num_steps / 2.).item()) - # base_bptt = 35 - seq_len = tf.random.truncated_normal(shape=(), mean=base_bptt, stddev=5., - dtype=tf.float32) - # seq_len = int(np.random.normal(num_steps, 5)) - # seq_len = 35 - seq_len = tf.cast(seq_len, dtype=tf.int32) - seq_len = tf.minimum(seq_len, num_steps + 20) # seq_len <= bptt + 40 - seq_len = tf.minimum(seq_len, batch_len - start_idx - 1) - - # seq_len = tf.cond(tf.greater_equal(seq_len, num_steps + 20), lambda: num_steps + 20, lambda: seq_len).item() - # seq_len = tf.cond(tf.greater_equal(seq_len, int(batch_len - start_idx - 1)), lambda: int(batch_len - start_idx - 1), lambda: seq_len).item() - # seq_len = min(seq_len, num_steps + 20, batch_len - start_idx - 1) - print("seq_len:{}, type:{}".format(seq_len, type(seq_len))) - - end_idx = start_idx + seq_len - - x = data[:, start_idx: end_idx] - # x = tf.reshape(x, [batch_size, seq_len]) - # print("xshape:{}".format(x.get_shape().as_list())) - y = data[:, start_idx + 1: end_idx + 1] - # y = tf.reshape(y, [batch_size, seq_len]) - # print("yshape:{}".format(y.get_shape().as_list())) - - with tf.control_dependencies([x, y]): - with tf.control_dependencies([tf.assign(start_idx, end_idx)]): - should_reset = tf.greater_equal(end_idx, batch_len - 3) - reset_start_idx = tf.assign(start_idx, 0) - # reset_start_idx = tf.assign(tf.Variable(start_idx, name='reset_start_idx', dtype=tf.int32, trainable=False), 0) - return (x, y, num_batches_per_epoch, reset_start_idx, should_reset, - base_bptt, seq_len / batch_len) - - if randomize: - i = tf.random_uniform([1], minval=0, maxval=batch_len - num_steps,dtype=tf.int32)[0] - x = tf.strided_slice(data, [0, i], [batch_size, i + num_steps]) - y = tf.strided_slice(data, [0, i + 1], [batch_size, i + num_steps + 1]) - else: - # """ - # 修改点 - start_idx_eval = tf.Variable(0, name='start_idx', dtype=tf.int32, - trainable=False) - seq_len = num_steps - seq_len = tf.cast(seq_len, dtype=tf.int32) - end_idx = start_idx_eval + seq_len - x = data[:, start_idx_eval: end_idx] - y = data[:, start_idx_eval + 1: end_idx + 1] - with tf.control_dependencies([x, y]): - with tf.control_dependencies([tf.assign(start_idx_eval, end_idx)]): - should_reset_eval = tf.greater_equal(end_idx, batch_len - num_steps - 3) - reset_start_idx_eval = tf.assign(start_idx_eval, 0) - x.set_shape([batch_size, num_steps]) - y.set_shape([batch_size, num_steps]) - - return x, y, num_batches_per_epoch, reset_start_idx_eval, should_reset_eval diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py deleted file mode 100644 index 51ed715f5..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed.py +++ /dev/null @@ -1,318 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Entry point for AWD ENAS with a fixed architecture.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * -from tensorflow.python.tools import freeze_graph - -import os -import pickle -import sys - -# TODO:change path -# sys.path.append("/home/test_user06/AscendZhongzhi_NJU/") -import time - -import numpy as np -import tensorflow.compat.v1 as tf - -import fixed_lib -import utils -from tensorflow.contrib import training as contrib_training - -flags = tf.app.flags -gfile = tf.gfile -FLAGS = flags.FLAGS - -## Required parameters -subfolder = str(time.strftime('%Y%m%d_%H%M%S')) -flags.DEFINE_string('output_dir', "./output/infer0/" + subfolder, '') -flags.DEFINE_string('data_path', './ptb/ptb.pkl', '') -flags.DEFINE_string("ckp_path", '', "checkpoint path") - -## Other parametersresult -flags.DEFINE_boolean('reload_model', True, '') -flags.DEFINE_boolean('reset_output_dir', True, '') -flags.DEFINE_boolean('is_training', False, '') -flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") - -flags.DEFINE_integer('log_every', 100, '') - - -def get_ops(params, x_train, x_valid, x_test): - """Build [train, valid, test] graphs.""" - lm = fixed_lib.LM(params, x_train, x_valid, x_test) - params.add_hparam('num_train_batches', lm.num_train_batches) - ops = { - 'train_op': lm.train_op, - 'learning_rate': lm.learning_rate, - 'grad_norm': lm.grad_norm, - 'train_loss': lm.train_loss, - 'global_step': tf.train.get_or_create_global_step(), - 'reset_batch_states': lm.batch_init_states['reset'], - 'eval_valid': lm.eval_valid, - 'eval_test': lm.do_infer, - 'bptt_rate': lm.bptt_rate, - - 'reset_start_idx': lm.reset_start_idx, - 'should_reset': lm.should_reset, - 'moving_avg_started': lm.moving_avg_started, - 'update_moving_avg': lm.update_moving_avg_ops, - 'start_moving_avg': lm.start_moving_avg_op, - 'end_moving_avg': lm.end_moving_avg_op, - 'reset_avg': lm.restart_avg, - 'set_lr_decay': lm.set_lr_decay, - 'reset_start_idx_eval': lm.reset_start_idx_eval, - } - print('-' * 80) - print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) - - return ops - - -def load_ckpt_model(sess, save_path): - print("reload model from:{}".format(save_path)) - checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 - input_checkpoint = checkpoint.model_checkpoint_path - saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 - saver.restore(sess, input_checkpoint) # 使用最新模型 - sess.run(tf.global_variables_initializer())# 初始化所有变量 - - -def train(params, is_training=True): - """Entry point for training.""" - with gfile.GFile(params.data_path, 'rb') as finp: - x_train, x_valid, x_test, _, _ = pickle.load(finp) - print('-' * 80) - print('train_size: {0}'.format(np.size(x_train))) - print('valid_size: {0}'.format(np.size(x_valid))) - print(' test_size: {0}'.format(np.size(x_test))) - - g = tf.Graph() - with g.as_default(): - tf.random.set_random_seed(2126) - ops = get_ops(params, x_train, x_valid, x_test) - run_ops = [ - ops['train_loss'], - ops['grad_norm'], - ops['learning_rate'], - ops['should_reset'], - ops['moving_avg_started'], - ops['train_op'], - ] - - saver = tf.train.Saver(max_to_keep=2) - checkpoint_saver_hook = tf.train.CheckpointSaverHook( - params.output_dir, save_steps=params.num_train_batches, saver=saver) - hooks = [checkpoint_saver_hook] - - # >>> add code >> - # 
创建session - config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["use_off_line"].b = True # 在昇腾AI处理器执行训练 - custom_op.parameter_map["mix_compile_mode"].b = False # 关闭混合计算,根据实际情况配置,默认关闭 - custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # 设置混合精度 - custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") - # # custom_op.parameter_map["enable_data_pre_proc"].b = True # getnext算子下沉是迭代循环下沉的必要条件 - # # custom_op.parameter_map[ - # # "iterations_per_loop"].i = 10 # 此处设置的值和set_iteration_per_loop设置的iterations_per_loop值保持一致,用于判断是否进行训练迭代下沉 - # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump/") - # custom_op.parameter_map["enable_dump_debug"].b = True - # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") - config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # 必须显式关闭 - config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF # 必须显式关闭 - # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir) - # >>> add code >> - - - # config = tf.ConfigProto() - # config.gpu_options.allow_growth = True - sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, - checkpoint_dir=params.output_dir) - # reload model - if params.ckp_path is not "" and FLAGS.reload_model: - last_checkpoint = tf.train.latest_checkpoint(params.ckp_path) - print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) - saver.restore(sess, last_checkpoint) - - accum_loss = 0. - accum_step = 0 - epoch = sess.run(ops['global_step']) // params.num_train_batches - best_valid_ppl = [] - accum_rate = 0. - start_time = time.time() - last_min = (time.time() - start_time) / 60 - cleaned = True - print('Starting moving_avg') - sess.run(ops['start_moving_avg']) - avg_flag = "no_null" - while True and is_training: - try: - loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) - # bptt_rate = sess.run(ops['bptt_rate']) - # accum_rate += bptt_rate - - accum_loss += loss - accum_step += 1 - step = sess.run(ops['global_step']) - if step % params.log_every == 0: - # epoch = step // params.num_train_batches - train_ppl = np.exp(accum_loss / accum_step) - mins_so_far = (time.time() - start_time) / 60. 
- min_pices = mins_so_far-last_min - last_min = mins_so_far - log_string = 'epoch={0:<5d}'.format(epoch) - log_string += ' step={0}/{1:<6d}'.format(step, params.num_train_steps) - log_string += ' ppl={0:<10.2f}'.format(train_ppl) - log_string += ' lr={0:<6.3f}'.format(lr) - log_string += ' |g|={0:<6.3f}'.format(gn) - log_string += ' avg={0:<2d}'.format(moving_avg_started) - log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, min_pices/params.log_every) - # log_string += ' accum_rate(rate of a epoch)={0:<4.6f}'.format(accum_rate) - # log_string += ' should_reset:{}'.format(should_reset) - print(log_string) - - if moving_avg_started: - if avg_flag is "": - sess.run(ops['end_moving_avg']) - sess.run(ops['reset_avg']) - avg_flag = "restart_avg" - else: - sess.run(ops['update_moving_avg']) - # ops['eval_valid'](sess, use_moving_avg=moving_avg_started) - - - if step <= (300 * params.num_train_batches): - if step % (10 * params.num_train_batches) == 0: - print('Start learning decay ...') - sess.run(ops['set_lr_decay']) - if moving_avg_started and step + 5 % (10 * params.num_train_batches) == 0 and len(best_valid_ppl) > params.best_valid_ppl_threshold and valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold]): - print('Start learning decay ...') - sess.run(ops['set_lr_decay']) - if should_reset: - accum_rate=0. - print("should_reset:{}".format(should_reset)) - sess.run(ops['reset_batch_states']) - epoch += 1 - accum_loss = 0 - accum_step = 0 - valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started) - # 初始化验证集idx - sess.run(ops['reset_start_idx_eval']) - # 初始化训练集 batch_state, idx - sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) - # note:当目前的ppl不是最好的10个时,利用移动平均权重法进行调整。 - if (not moving_avg_started and - len(best_valid_ppl) > params.best_valid_ppl_threshold and - valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold]) - ): - print('Starting moving_avg') - sess.run(ops['start_moving_avg']) - # print('Start learning decay ...') - # sess.run(ops['set_lr_decay']) - - if valid_ppl > 15.: - best_valid_ppl.append(valid_ppl) - if not cleaned: - best_valid_ppl = [p for p in best_valid_ppl if p < 40.] 
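# [Illustrative sketch -- not from this patch] The plateau test used above --
# both to trigger learning-rate decay and to start the weight moving average
# -- is "the current valid ppl is worse than the best ppl recorded more than
# `best_valid_ppl_threshold` evaluations ago". A standalone toy with
# hypothetical names and made-up ppl values:
def plateau_detected(best_valid_ppl, valid_ppl, threshold):
    return (len(best_valid_ppl) > threshold
            and valid_ppl > min(best_valid_ppl[:-threshold]))

history = [120.0, 98.0, 99.0, 99.5, 100.0, 101.0, 102.0]       # best ppl was several evals ago
print(plateau_detected(history, valid_ppl=99.3, threshold=5))  # True: no recent improvement
print(plateau_detected(history, valid_ppl=97.5, threshold=5))  # False: a new best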
- cleaned = True - # ops['eval_test'](sess, use_moving_avg=moving_avg_started) - if step % (1 * params.num_train_batches) == 0: - test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) - print("test_ppl:{}".format(test_ppl)) - sess.run(ops['reset_start_idx_eval']) - if step >= params.num_train_steps: - #inference - test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) - print("final_test_ppl:{}".format(test_ppl)) - break - except tf.errors.InvalidArgumentError: - last_checkpoint = tf.train.latest_checkpoint(params.output_dir) - print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) - saver.restore(sess, last_checkpoint) - accum_loss, accum_step = 0., 0 - if not is_training: - moving_avg_started = sess.run(ops['moving_avg_started']) - test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started) - sess.close() - # infer_loss = ops['inference']() - with tf.Session() as sess: - print("test_ppl:{}".format(test_ppl)) - #保存图,在./pb_model文件夹中生成model.pb文件 - # model.pb文件将作为input_graph给到接下来的freeze_graph函数 - tf.train.write_graph(sess.graph_def, './models_pb', 'model3.pb') # 通过write_graph生成模型文件 - freeze_graph.freeze_graph( - input_graph='./models_pb/model3.pb', # 传入write_graph生成的模型文件 - input_saver='', - input_binary=False, - input_checkpoint=params.ckp_path+'model.ckpt-906', # 传入训练生成的checkpoint文件 - output_node_names='output', # 与定义的推理网络输出节点保持一致 - restore_op_name='save/restore_all', - filename_tensor_name='save/Const:0', - output_graph='./models_pb/enas_lm3.pb', # 改为需要生成的推理网络的名称 - clear_devices=False, - initializer_nodes='') - print("done pb!") - else: - sess.close() - """ - if not is_training: - return infer_loss - else: - return -1 - """ - -def main(unused_args): - tf.logging.set_verbosity(tf.logging.INFO) - tf.logging.info("**********") - print("===>>>data_path:{}".format(FLAGS.data_path)) - print("===>>>output_dir:{}".format(FLAGS.output_dir)) - print("===>>>ckp_path:{}".format(FLAGS.ckp_path)) - - print('-' * 80) - output_dir = FLAGS.output_dir - - print('-' * 80) - if not gfile.IsDirectory(output_dir): - print('Path {} does not exist. Creating'.format(output_dir)) - gfile.MakeDirs(output_dir) - elif FLAGS.reset_output_dir: - print('Path {} exists. Reseting'.format(output_dir)) - gfile.DeleteRecursively(output_dir) - gfile.MakeDirs(output_dir) - - print('-' * 80) - log_file = os.path.join(output_dir, 'stdout') - print('Logging to {}'.format(log_file)) - sys.stdout = utils.Logger(log_file) - - params = contrib_training.HParams( - data_path=FLAGS.data_path, - log_every=FLAGS.log_every, - output_dir=FLAGS.output_dir, - ckp_path=FLAGS.ckp_path, - ) - - train(params, is_training=FLAGS.is_training) - - -if __name__ == '__main__': - tf.app.run() diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py deleted file mode 100644 index 49659f706..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/fixed_lib.py +++ /dev/null @@ -1,652 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""AWD ENAS fixed model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * -from npu_bridge.estimator.npu import npu_convert_dropout - -import numpy as np -import tensorflow.compat.v1 as tf - -import data_utils -import utils - -flags = tf.app.flags -FLAGS = flags.FLAGS - -flags.DEFINE_string('fixed_arc', '0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0', '') -flags.DEFINE_float('child_alpha', 0.7, 'activation L2 reg') -flags.DEFINE_float('child_drop_e', 0.125, 'drop rate words') -flags.DEFINE_float('child_drop_i', 0.175, 'drop rate embeddings') -flags.DEFINE_float('child_drop_l', 0.225, 'drop rate between layers') -flags.DEFINE_float('child_drop_o', 0.75, 'drop rate output') -flags.DEFINE_float('child_drop_w', 0.00, 'drop rate weight') -flags.DEFINE_float('child_drop_x', 0.725, 'drop rate at input of RNN cells') -flags.DEFINE_float('child_init_range', 0.05, '') -flags.DEFINE_float('child_grad_bound', 0.25, '') -flags.DEFINE_float('child_weight_decay', 2e-6, '') -flags.DEFINE_integer('child_num_train_epochs', 2, '') -flags.DEFINE_integer('child_hidden_size', 800, '') - - -def _gen_mask(shape, drop_prob): - """Generate a droppout mask.""" - keep_prob = 1. - drop_prob - mask = tf.random_uniform(shape, minval=0., maxval=1., dtype=tf.float32) - mask = tf.floor(mask + keep_prob) / keep_prob - return mask - - -def _rnn_fn(x, prev_s, w_prev, w_skip, input_mask, layer_mask, params): - """Multi-layer LSTM. - - Args: - x: [batch_size, num_steps, hidden_size]. - prev_s: [batch_size, hidden_size]. - w_prev: [2 * hidden_size, 2 * hidden_size]. - w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. - input_mask: [batch_size, hidden_size]. - layer_mask: [batch_size, hidden_size]. - params: hyper-params object. - - Returns: - next_s: [batch_size, hidden_size]. - all_s: [[batch_size, num_steps, hidden_size] * num_layers]. 
- """ - batch_size = x.get_shape()[0].value - print("batch_size:{}".format(batch_size)) - # batch_size = params.batch_size - num_steps = tf.shape(x)[1] - fixed_arc = params.fixed_arc - num_layers = len(fixed_arc) // 2 - set_shape = x.get_shape().as_list() - print("x.set_shape:{}".format(set_shape)) - - # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) - # all_s_my = [] - all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) - - - def _condition(step, *unused_args): - return tf.less(step, num_steps) - - def _body(step, prev_s, all_s_my): - """Body fn for `tf.while_loop`.""" - inp = x[:, step, :] - # print("inp:{}".format(inp)) - if layer_mask is not None: - assert input_mask is not None - ht = tf.matmul( - tf.concat([inp * input_mask, prev_s * layer_mask], axis=1), w_prev) - else: - ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) - # print("w_prev:{}".format(w_prev)) - h, t = tf.split(ht, 2, axis=1) - h = tf.tanh(h) - t = tf.sigmoid(t) - s = prev_s + t * (h - prev_s) - layers = [s] - - def _select_function(h, function_id): - if function_id == 0: - return tf.tanh(h) - elif function_id == 1: - return tf.nn.relu(h) - elif function_id == 2: - return tf.sigmoid(h) - elif function_id == 3: - return h - raise ValueError('Unknown func_idx {0}'.format(function_id)) - - start_idx = 0 - for layer_id in range(num_layers): - prev_idx = fixed_arc[start_idx] - func_idx = fixed_arc[start_idx + 1] - prev_s = layers[prev_idx] - if layer_mask is not None: - ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) - else: - ht = tf.matmul(prev_s, w_skip[layer_id]) - h, t = tf.split(ht, 2, axis=1) - - h = _select_function(h, func_idx) - t = tf.sigmoid(t) - s = prev_s + t * (h - prev_s) - # print("layers_id:{}\ns before set_shape:{}".format(layer_id, s)) - s.set_shape([batch_size, params.hidden_size]) - # print("s after set_shape:{}".format(s)) - layers.append(s) - start_idx += 2 - # print("layers:{}\ns:{}".format(layers, s)) - next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) - # print("next_s:{}".format(next_s)) - t = tf.stack([next_s]) - # print("t:{}".format(t)) - all_s_my = tf.concat([all_s_my, t], 0) - # print("all_s_my:{}".format(all_s_my)) - # all_s.append(next_s) - return step + 1, next_s, all_s_my - - loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] - _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) - # >>> add code >>> - # all_s_my = tf.reshape(all_s_my, [set_shape[1]+1, set_shape[0], params.hidden_size]) - # print("all_s_my(list):{}".format(all_s_my)) - # tmp = all_s_my[1:, :, :] - # # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) - # print("stack_all_s:{}".format(tmp)) - # all_s = tf.transpose(tmp, perm=[1, 0, 2]) - # # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) - # all_s = tf.reshape(all_s, [set_shape[0], set_shape[1], params.hidden_size]) - # print("all_s:{}".format(all_s)) - all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) - # print("stack_all_s:{}".format(all_s_my)) - - all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) - # print("all_s:{}".format(all_s)) - - return next_s, all_s - - -def _set_default_params(params): - """Set default values for the hparams.""" - params.add_hparam('alpha', FLAGS.child_alpha) # activation L2 reg - params.add_hparam('best_valid_ppl_threshold', 
10) - - params.add_hparam('batch_size', 64) - params.add_hparam('bptt_steps', 32) - - # for dropouts: dropping rate, NOT keeping rate - params.add_hparam('drop_e', FLAGS.child_drop_e) # word - params.add_hparam('drop_i', FLAGS.child_drop_i) # embeddings - params.add_hparam('drop_l', FLAGS.child_drop_l) # between RNN nodes - params.add_hparam('drop_o', FLAGS.child_drop_o) # output - params.add_hparam('drop_w', FLAGS.child_drop_w) # weight - params.add_hparam('drop_x', FLAGS.child_drop_x) # input to RNN layers - - assert FLAGS.fixed_arc is not None - print(FLAGS.fixed_arc) - L_arc = FLAGS.fixed_arc.split(' ') - print("L_arc:{}".format(L_arc)) - params.add_hparam('fixed_arc', [int(d) for d in L_arc]) - - params.add_hparam('grad_bound', FLAGS.child_grad_bound) - params.add_hparam('hidden_size', FLAGS.child_hidden_size) - params.add_hparam('init_range', FLAGS.child_init_range) - params.add_hparam('learning_rate', 40.) - params.add_hparam('num_train_epochs', FLAGS.child_num_train_epochs) - params.add_hparam('num_warmup_epochs', 0.0) - params.add_hparam('vocab_size', 10000) - - params.add_hparam('weight_decay', FLAGS.child_weight_decay) - return params - - -class LM(object): - """Language model.""" - - def __init__(self, params, x_train, x_valid, x_test, name='language_model'): - print('-' * 80) - print('Building LM') - - self.params = _set_default_params(params) - self.name = name - - # train data - (self.x_train, self.y_train, - self.num_train_batches, self.reset_start_idx, - self.should_reset, - self.base_bptt, self.bptt_rate) = data_utils.input_producer( - x_train, params.batch_size, params.bptt_steps, random_len=True) - params.add_hparam( - 'num_train_steps', self.num_train_batches * params.num_train_epochs) - - # valid data - (self.x_valid, self.y_valid, - self.num_valid_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer( - x_valid, params.batch_size, params.bptt_steps) - - # test data - (self.x_test, self.y_test, - self.num_test_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer(x_test, 1, 1) - - params.add_hparam('num_warmup_steps', - params.num_warmup_epochs * self.num_train_batches) - self._build_params() - self._build_train() - self._build_valid() - self._build_test() - self._build_infer() - self._build_avg_infer() - - def _build_params(self): - """Create model parameters.""" - - print('-' * 80) - print('Building model params') - initializer = tf.initializers.random_uniform(minval=-self.params.init_range, - maxval=self.params.init_range) - with tf.variable_scope(self.name, initializer=initializer): - with tf.variable_scope('embedding'): - w_emb = tf.get_variable( - 'w', [self.params.vocab_size, self.params.hidden_size], - initializer=initializer) - # >>> add code >>> - dropped_w_emb = npu_ops.dropout(w_emb, 1 - self.params.drop_e) - # >>> add code >>> - # dropped_w_emb = tf.layers.dropout( - # w_emb, self.params.drop_e, [self.params.vocab_size, 1], - # training=True) - - hidden_size = self.params.hidden_size - fixed_arc = self.params.fixed_arc - num_layers = len(fixed_arc) // 2 - with tf.variable_scope('rnn_cell'): - w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) - i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) - h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) - mask = tf.concat([i_mask, h_mask], axis=0) - dropped_w_prev = w_prev * mask - - w_skip, dropped_w_skip = [], [] - for layer_id in range(num_layers): - mask = _gen_mask([hidden_size, 2 * 
hidden_size], self.params.drop_w) - with tf.variable_scope('layer_{}'.format(layer_id)): - w = tf.get_variable('w', [hidden_size, 2 * hidden_size]) - dropped_w = w * mask - w_skip.append(w) - dropped_w_skip.append(dropped_w) - - with tf.variable_scope('init_states'): - with tf.variable_scope('batch'): - init_shape = [self.params.batch_size, hidden_size] - batch_prev_s = tf.get_variable( - 's', init_shape, dtype=tf.float32, trainable=False) - zeros = np.zeros(init_shape, dtype=np.float32) - batch_reset = tf.assign(batch_prev_s, zeros) - with tf.variable_scope('test'): - init_shape = [1, hidden_size] - test_prev_s = tf.get_variable( - 's', init_shape, dtype=tf.float32, trainable=False) - zeros = tf.zeros(init_shape, dtype=tf.float32) - test_reset = tf.assign(test_prev_s, zeros) - - num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) - print('Model has {0} params'.format(num_params)) - - self.batch_init_states = { - 's': batch_prev_s, - 'reset': batch_reset, - } - self.train_params = { - 'w_emb': dropped_w_emb, - 'w_prev': dropped_w_prev, - 'w_skip': dropped_w_skip, - 'w_soft': w_emb, - } - self.test_init_states = { - 's': test_prev_s, - 'reset': test_reset, - } - self.eval_params = { - 'w_emb': w_emb, - 'w_prev': w_prev, - 'w_skip': w_skip, - 'w_soft': w_emb, - } - - def _forward(self, x, y, model_params, init_states, is_training=False): - """Computes the logits. - - Args: - x: [batch_size, num_steps], input batch. - y: [batch_size, num_steps], output batch. - model_params: a `dict` of params to use. - init_states: a `dict` of params to use. - is_training: if `True`, will apply regularizations. - - Returns: - loss: scalar, cross-entropy loss - """ - w_emb = model_params['w_emb'] - w_prev = model_params['w_prev'] - w_skip = model_params['w_skip'] - w_soft = model_params['w_soft'] - prev_s = init_states['s'] - - emb = tf.nn.embedding_lookup(w_emb, x) - batch_size = self.params.batch_size - hidden_size = self.params.hidden_size - if is_training: - # >>> add code >>> - emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) - - # >>> add code >>> - # emb = tf.layers.dropout( - # emb, self.params.drop_i, - # [self.params.batch_size, 1, hidden_size], training=True) - - input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) - layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) - else: - input_mask = None - layer_mask = None - - out_s, all_s = _rnn_fn(emb, prev_s, w_prev, w_skip, input_mask, layer_mask, - self.params) - top_s = all_s - if is_training: - # >>> add code >>> - top_s = npu_ops.dropout(top_s, - 1 - self.params.drop_o)# ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) - # >>> add code >>> - - # top_s = tf.layers.dropout(top_s, self.params.drop_o, - # [batch_size, 1, hidden_size], training=True) - - carry_on = [tf.assign(prev_s, out_s)] - # print("top_s:{}\nw_soft:{}".format(top_s, w_soft)) - logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) - # print("logits:{}".format(logits)) - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, - logits=logits) - loss = tf.reduce_mean(loss) - - reg_loss = loss # loss + regularization_terms, for training only - # print("_forward/loss:{}".format(loss)) - if is_training: - # L2 weight reg - reg_loss += self.params.weight_decay * tf.add_n( - [tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) - - # activation L2 reg - reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) - - with 
tf.control_dependencies(carry_on): - loss = tf.identity(loss) - if is_training: - reg_loss = tf.identity(reg_loss) - # print("reg_loss:{}\nloss:{}".format(reg_loss, loss)) - return reg_loss, loss - - def _build_train(self): - """Build training ops.""" - print('-' * 80) - print('Building train graph') - reg_loss, loss = self._forward(self.x_train, self.y_train, - self.train_params, self.batch_init_states, - is_training=True) - - tf_vars = tf.trainable_variables() - # print("reg_loss:{}".format(reg_loss)) - print("tf_vars:{}".format(tf_vars)) - global_step = tf.train.get_or_create_global_step() - lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / - tf.cast(self.params.bptt_steps, dtype=tf.float32)) - with tf.variable_scope('HParam'): - lr_decay = tf.get_variable('learning_rate_decay', [], initializer=tf.constant_initializer(1.), dtype=tf.float32, trainable=False) - self.set_lr_decay = tf.assign_sub(lr_decay, 0.02*lr_decay) - learning_rate = utils.get_lr(global_step, self.params, lr_decay) * lr_scale - grads = tf.gradients(reg_loss, tf_vars) - # print("grads:{}".format(grads)) - clipped_grads, grad_norm = tf.clip_by_global_norm(grads, - self.params.grad_bound) - (self.update_moving_avg_ops, self.use_moving_avg_vars, - self.restore_normal_vars) = self._create_average_ops() - optimizer = tf.train.GradientDescentOptimizer(learning_rate) - train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), - global_step=global_step) - - self.train_loss = loss - self.train_op = train_op - self.grad_norm = grad_norm - self.learning_rate = learning_rate - - # def _EMA(self): - # """Build moving average ops.""" - # print('Creating moving average ops') - # - # with tf.variable_scope('moving_avg_flag'): - # self.moving_avg_started = tf.get_variable( - # 'flag', [], tf.int32, initializer=tf.initializers.zeros(), - # trainable=False) - # self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) - # self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) - # all_vars = tf.trainable_variables() - # - # ema = tf.train.ExponentialMovingAverage(0.99) - # - # average_op = ema.apply(all_vars) - # back_up_v = tf.identity(all_vars) - # use_average_op = tf.assign(all_vars, ema.average(all_vars)) - # ema.average_name() - # reverse_average_op = tf.assign(all_vars, back_up_v) - - - - - def _create_average_ops(self): - """Build moving average ops.""" - print('Creating moving average ops') - - with tf.variable_scope('moving_avg_flag'): - self.moving_avg_started = tf.get_variable( - 'flag', [], tf.int32, initializer=tf.initializers.zeros(), - trainable=False) - self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) - self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) - - all_vars = tf.trainable_variables() - print('all_vars:{}'.format(all_vars)) - average_pairs = [] - var_cnt = 0 - with tf.variable_scope('average'): - for v in all_vars: - avg_v = tf.get_variable( - str(var_cnt), shape=v.shape, dtype=v.dtype, - initializer=tf.zeros_initializer, trainable=False) - var_cnt += 1 - average_pairs.append([v, avg_v]) - backup_pairs = [] - var_cnt = 0 - with tf.variable_scope('backup'): - for v in all_vars: - backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, - trainable=False) - var_cnt += 1 - backup_pairs.append([v, backup_v]) - # 原作者手动实现的Moving Average ::当eval_valid_ppl退化到一定阈值(退步10名)后启动 - with tf.variable_scope('avg_step'): - avg_step = tf.get_variable('step', [], initializer=tf.constant_initializer(0.), dtype=tf.float32, trainable=False) - tmp1 = [] - 
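# NOTE (illustrative sketch, not part of the original patch): the average/backup
# variable pairs built by hand in _create_average_ops() implement a "shadow
# weight" scheme; the commented-out _EMA() draft above points at the stock
# helper for the same idea. A minimal TF1-style usage, with hypothetical names:
#
#   ema = tf.train.ExponentialMovingAverage(0.99)
#   update_shadow_op = ema.apply(tf.trainable_variables())    # run after each train step
#   eval_saver = tf.train.Saver(ema.variables_to_restore())   # maps shadow values onto the live vars
#
# The stock helper restores averaged weights from a checkpoint through eval_saver,
# whereas this function swaps them in and out of the live variables in place,
# which is why the explicit backup/restore ops in this function are needed.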
tmp2 = [] - tmp3 = [] - self.restart_avg = tf.assign(avg_step, 0.) - with tf.control_dependencies([tf.assign_add(avg_step, 1.)]): - average_op = [] - for v, avg_v in average_pairs: - # v_curr = tf.Variable(tf.cast(tf.identity(v), tf.float32), dtype=tf.float32, trainable=False) - # avg_v_curr = tf.Variable(tf.cast(tf.identity(avg_v), tf.float32), dtype=tf.float32, trainable=False) - # mu = 1. / avg_step - mu = tf.cond(tf.cast(0.999 < (1. + avg_step) / (10. + avg_step), tf.bool), - lambda: tf.cast(tf.constant(0.99), dtype=tf.float32), - lambda: tf.cast((1. + avg_step) / (10. + avg_step), dtype=tf.float32)) - - new_avg = mu * tf.cast(avg_v, tf.float32) + (1. - mu) * tf.cast(v, tf.float32) - with tf.control_dependencies([new_avg]): - average_op.append(tf.assign(avg_v, tf.cast(new_avg, avg_v.dtype))) - # 追踪变量 - tmp1.append(v) - tmp2.append(new_avg) - tmp3.append([avg_step, mu, tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(new_avg ** 2)]) - - self.p1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp1]) - self.p2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp2]) - self.p3 = tmp3 - # # 使用官方API - # with tf.variable_scope('avg_step'): - # avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) - # - # ema = tf.train.ExponentialMovingAverage(0.99, avg_step) - # with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): - # average_op = [] - # for v, avg_v in average_pairs: - # v = tf.Variable(tf.cast(v, tf.float32), dtype=tf.float32, trainable=False) - # avg_v = tf.Variable(tf.cast(avg_v, tf.float32), dtype=tf.float32, trainable=False) - # print('v:{}'.format(v)) - # ema.apply([v]) - # new_avg = ema.average(v) - # print('new_avg:{}'.format(new_avg)) - # with tf.control_dependencies([new_avg]): - # print('avg_v:'.format(avg_v)) - # average_op.append(tf.assign(avg_v, new_avg)) - # # average_op = tf.group(*average_op) - - assert len(average_pairs) == len(all_vars) - assert len(average_pairs) == len(backup_pairs) - use_average_op = [] - - new_tmp1 = [] - for i in range(len(average_pairs)): - v, avg_v = average_pairs[i] - _, backup_v = backup_pairs[i] - with tf.control_dependencies([tf.assign(backup_v, v)]): - new_tmp1.append([tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(backup_v ** 2)]) - use_average_op.append(tf.assign(v, avg_v)) - self.p4 = new_tmp1 - - use_average_op = tf.group(*use_average_op) - # with tf.control_dependencies([use_average_op]): - self.p3_1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) - reverse_average_op = [] - new_tmp2 = [] - for v, backup_v in backup_pairs: - # with tf.control_dependencies([use_average_op]): - new_tmp2.append([tf.reduce_sum(v ** 2), tf.reduce_sum(backup_v ** 2)]) - reverse_average_op.append(tf.assign(v, backup_v)) - self.p5 = new_tmp2 - reverse_average_op = tf.group(*reverse_average_op) - # with tf.control_dependencies([reverse_average_op]): - self.p3_2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) - - return average_op, use_average_op, reverse_average_op - - def _eval_test(self, sess, use_moving_avg=False): - """Eval 1 round on test set.""" - total_loss = 0 - if use_moving_avg: - print('v:{}'.format(tf.trainable_variables())) - sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) - print('v_avg:{}'.format(tf.trainable_variables())) - for step in range(int(self.num_test_batches)): - total_loss += sess.run(self.test_loss) - if (step + 1) % 1000 == 0: - test_ppl = 
np.exp(total_loss / (step + 1)) - log_string = 'step={0:<6d}'.format(step + 1) - log_string += ' test_ppl={0:<.2f}'.format(test_ppl) - print(log_string) - if sess.run(self.should_reset_eval): - break - # test_ppl = np.exp(total_loss / self.num_test_batches) - - # log_string = 'step={0:<6d}'.format(self.num_test_batches) - # log_string += ' test_ppl={0:<.2f}'.format(test_ppl) - # print(log_string) - if use_moving_avg: - sess.run(self.restore_normal_vars) - # test_ppl = tf.math.exp(total_loss/ self.num_test_batches, name='output') - # print("test_ppl:{}".format(test_ppl)) - # loss_assign_op = tf.assign(self.tt_loss, tf.Variable(total_loss, name='total_loss', dtype=tf.float32,trainable=False)) - - def _build_valid(self): - print('Building valid graph') - _, loss = self._forward(self.x_valid, self.y_valid, - self.eval_params, self.batch_init_states) - self.valid_loss = loss - - def _build_test(self): - print('Building test graph') - _, loss = self._forward(self.x_test, self.y_test, - self.eval_params, self.test_init_states) - self.test_loss = loss - - def _build_infer(self): - print("Building infer graph") - tt_loss = tf.Variable(0, name="total_loss", dtype=tf.float32, trainable=False) - def _condition(step, *unused_args): - return tf.less(step, self.num_test_batches-3) - def _body(step, tt_loss): - with tf.control_dependencies([self.test_loss]): - tt_loss += self.test_loss - return step+1, tt_loss - loop_inps = [tf.constant(0, dtype=tf.int32), tt_loss] - _, tt_loss = tf.while_loop(_condition, _body, loop_inps) - test_ppl = tf.math.exp(tt_loss/ self.num_test_batches, name='test_ppl') - print("test_ppl:{}".format(test_ppl)) - self.infer_ppl = test_ppl - - def _build_avg_infer(self): - print("Build avg_infer graph") - def _fp(): - with tf.control_dependencies([self.use_moving_avg_vars, self.test_init_states['reset']]): - avg_infer_ppl = self.infer_ppl - with tf.control_dependencies([avg_infer_ppl, self.restore_normal_vars]): - return avg_infer_ppl - def _fn(): - return self.infer_ppl - - with tf.control_dependencies([self.moving_avg_started]): - avg_infer_ppl = tf.cond(tf.greater_equal(self.moving_avg_started, 1), _fp, _fn) - self.avg_infer_ppl = tf.identity(avg_infer_ppl, name="output") - print("self.avg_infer_ppl:{}".format(self.avg_infer_ppl)) - - - def eval_valid(self, sess, use_moving_avg=False): - """Eval 1 round on valid set.""" - total_loss = 0 - - if use_moving_avg: - # print('sum_v:{}'.format(sess.run(self.p1))) - # print('new_sum_v:{}'.format(sess.run(self.p2))) - # print('[[step, mu, v, v_avg, new_v_avg]]={}'.format(sess.run(self.p3))) - # self.use_moving_avg_vars ===>影子权重暂时替代当前权重 - sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) - # print('v_avg:{}\n[[v, avg_v, backup_v]]={}'.format(sess.run(self.p3_1), sess.run(self.p4))) - - valid_loss = [] - for _ in range(self.num_valid_batches): - loss = sess.run(self.valid_loss) - total_loss += loss - valid_loss.append(loss) - if sess.run(self.should_reset_eval): - break - print("valid_loss={}, self.num_valid_batches={}".format(valid_loss, self.num_valid_batches)) - valid_ppl = np.exp(total_loss / self.num_valid_batches) - print('valid_ppl={0:<.2f}'.format(valid_ppl)) - if use_moving_avg: - sess.run(self.restore_normal_vars) - - # print('v:{}\n[[v, backup_v]]={} \n============================================================'.format( - # sess.run(self.p3_2), sess.run(self.p5))) - - return valid_ppl - - def do_infer(self, sess, use_moving_avg=False): - # self._eval_test(sess, use_moving_avg) - return 
sess.run(self.avg_infer_ppl) diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py deleted file mode 100644 index d8bdec500..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/help_modelarts.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import datetime -# import moxing as mox -import tensorflow.compat.v1 as tf -gfile = tf.gfile - -def obs_data2modelarts(config): - """ - Copy train data from obs to modelarts by using moxing api. - """ - start = datetime.datetime.now() - print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.data_url, config.modelarts_data_dir)) - mox.file.copy_parallel(src_url=config.data_url, dst_url=config.modelarts_data_dir) - print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.ckp_path, config.modelarts_result_dir)) - output_dir = config.modelarts_result_dir - if not gfile.IsDirectory(output_dir): - print('Path {} does not exist. Creating'.format(output_dir)) - gfile.MakeDirs(output_dir) - mox.file.copy_parallel(src_url=config.ckp_path, dst_url=config.modelarts_result_dir) - end = datetime.datetime.now() - files = os.listdir(config.modelarts_data_dir) - print("===>>>Files:", files) - files2 = os.listdir(config.modelarts_result_dir) - print("===>>>Files2:", files2) - - -def modelarts_result2obs(FLAGS): - """ - Copy debug data from modelarts to obs. - According to the swich flags, the debug data may contains auto tune repository, - dump data for precision comparision, even the computation graph and profiling data. - """ - work_dir = os.getcwd() - print("start op: modelarts_result2obs..........") - - ## copy result from modelarts to obs - obs_result_dir = os.path.join(FLAGS.obs_dir, 'result') - if not mox.file.exists(obs_result_dir): - mox.file.make_dirs(obs_result_dir) - else: - mox.file.remove(obs_result_dir, recursive=True) - mox.file.make_dirs(obs_result_dir) - mox.file.copy_parallel(src_url=FLAGS.output_dir, dst_url=obs_result_dir) - print("===>>>Copy Event or Checkpoint from modelarts dir:{} to obs:{}".format(FLAGS.output_dir, obs_result_dir)) - - ## Copy auto tune repository. Comment this snippets if npu_auto_tune is off. - # if FLAGS.npu_auto_tune: - # modelarts_auto_tune_dir = os.path.join(work_dir, "npu_auto_tune") - # obs_auto_tune_dir = os.path.join(FLAGS.obs_dir, 'npu_auto_tune') - # if not mox.file.exists(obs_auto_tune_dir): - # mox.file.make_dirs(obs_auto_tune_dir) - # mox.file.copy_parallel(modelarts_auto_tune_dir, obs_auto_tune_dir) - # print("===>>>Auto tune:{} on OBS dir:{}".format(mox.file.list_directory(obs_auto_tune_dir), obs_auto_tune_dir)) - # - # ## Copy dump data. Comment this snippets if npu_dump_data is off. 
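# NOTE (illustrative sketch, not from the original patch): each of these
# commented-out blocks repeats the same moxing copy pattern; only the OBS
# sub-directory name and the local source directory change. With placeholder
# names (local_dir and 'some_subdir' are made up), the core of it is:
#
#   obs_dir = os.path.join(FLAGS.obs_dir, 'some_subdir')        # e.g. 'npu_dump_data'
#   if not mox.file.exists(obs_dir):
#       mox.file.make_dirs(obs_dir)                              # create the OBS folder once
#   mox.file.copy_parallel(src_url=local_dir, dst_url=obs_dir)   # recursive upload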
- # if FLAGS.npu_dump_data: - # modelarts_dump_data_dir = os.path.join(work_dir, "npu_dump_data") - # obs_dump_data_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_data') - # if not mox.file.exists(obs_dump_data_dir): - # mox.file.make_dirs(obs_dump_data_dir) - # mox.file.copy_parallel(modelarts_dump_data_dir, obs_dump_data_dir) - # print("===>>>Dumped graph:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_data_dir), obs_dump_data_dir)) - # - # ## Copy compute graph. Comment this snippets if npu_dump_graph is off. - # if FLAGS.npu_dump_graph: - # modelarts_dump_graph_dir = os.path.join(work_dir, "npu_dump_graph") - # obs_dump_graph_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_graph') - # if not mox.file.exists(obs_dump_graph_dir): - # mox.file.make_dirs(obs_dump_graph_dir) - # mox.file.copy_parallel(modelarts_dump_graph_dir, obs_dump_graph_dir) - # print("===>>>Dumped data:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_graph_dir), obs_dump_graph_dir)) - # - # ## Copy profiling data. Comment this snippets if npu_profiling is off. - # if FLAGS.npu_profiling: - # modelarts_profiling_dir = os.path.join(work_dir, "npu_profiling") - # obs_profiling_dir = os.path.join(FLAGS.obs_dir, 'npu_profiling') - # if not mox.file.exists(obs_profiling_dir): - # mox.file.make_dirs(obs_profiling_dir) - # mox.file.copy_parallel(modelarts_profiling_dir, obs_profiling_dir) - # print("===>>>Profiling data:{} on OBS dir:{}".format(mox.file.list_directory(obs_profiling_dir), obs_profiling_dir)) diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py deleted file mode 100644 index 2a1816ac3..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Entry point for AWD LSTM.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import os -import pickle -import sys -import time - -import numpy as np -import tensorflow.compat.v1 as tf - -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import lstm_lib -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils -from tensorflow.contrib import training as contrib_training - -flags = tf.app.flags -gfile = tf.gfile -FLAGS = flags.FLAGS - -flags.DEFINE_boolean('reset_output_dir', False, '') -flags.DEFINE_string('output_dir', None, '') -flags.DEFINE_string('data_path', None, '') - -flags.DEFINE_integer('log_every', 200, '') - - -def get_ops(params, x_train, x_valid, x_test): - """Build [train, valid, test] graphs.""" - - lm = lstm_lib.LM(params, x_train, x_valid, x_test) - params.add_hparam('num_train_batches', lm.num_train_batches) - ops = { - 'train_op': lm.train_op, - 'learning_rate': lm.learning_rate, - 'grad_norm': lm.grad_norm, - 'train_loss': lm.train_loss, - 'global_step': tf.train.get_or_create_global_step(), - 'reset_batch_states': lm.batch_init_states['reset'], - 'eval_valid': lm.eval_valid, - 'eval_test': lm.eval_test, - - 'reset_start_idx': lm.reset_start_idx, - 'should_reset': lm.should_reset, - 'moving_avg_started': lm.moving_avg_started, - 'update_moving_avg': lm.update_moving_avg_ops, - 'start_moving_avg': lm.start_moving_avg_op, - } - print('-' * 80) - print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) - - return ops - - -def train(params): - """Entry point for training.""" - with gfile.GFile(params.data_path, 'rb') as finp: - x_train, x_valid, x_test, _, _ = pickle.load(finp) - print('-' * 80) - print('train_size: {0}'.format(np.size(x_train))) - print('valid_size: {0}'.format(np.size(x_valid))) - print(' test_size: {0}'.format(np.size(x_test))) - - g = tf.Graph() - with g.as_default(): - ops = get_ops(params, x_train, x_valid, x_test) - run_ops = [ - ops['train_loss'], - ops['grad_norm'], - ops['learning_rate'], - ops['should_reset'], - ops['moving_avg_started'], - ops['train_op'], - ] - - saver = tf.train.Saver(max_to_keep=5) - checkpoint_saver_hook = tf.train.CheckpointSaverHook( - params.output_dir, save_steps=params.num_train_batches, saver=saver) - hooks = [checkpoint_saver_hook] - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, - checkpoint_dir=params.output_dir) - accum_loss = 0 - accum_step = 0 - epoch = 0 - best_valid_ppl = [] - start_time = time.time() - while True: - sess.run(ops['reset_batch_states']) - loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) - accum_loss += loss - accum_step += 1 - step = sess.run(ops['global_step']) - if step % params.log_every == 0: - train_ppl = np.exp(accum_loss / accum_step) - mins_so_far = (time.time() - start_time) / 60. 
- log_string = 'epoch={0:<5d}'.format(epoch) - log_string += ' step={0:<7d}'.format(step) - log_string += ' ppl={0:<9.2f}'.format(train_ppl) - log_string += ' lr={0:<10.7f}'.format(lr) - log_string += ' |g|={0:<5.2f}'.format(gn) - log_string += ' avg={0:<2d}'.format(moving_avg_started) - log_string += ' mins={0:<.2f}'.format(mins_so_far) - print(log_string) - - if moving_avg_started: - sess.run(ops['update_moving_avg']) - - # if step % params.num_train_batches == 0: - if should_reset: - epoch += 1 - accum_loss = 0 - accum_step = 0 - valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started) - sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) - if (not moving_avg_started and - len(best_valid_ppl) > params.best_valid_ppl_threshold and - valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])): - print('Starting moving_avg') - sess.run(ops['start_moving_avg']) - best_valid_ppl.append(valid_ppl) - - if step >= params.num_train_steps: - ops['eval_test'](sess, use_moving_avg=moving_avg_started) - break - sess.close() - - -def main(unused_args): - output_dir = FLAGS.output_dir - print('-' * 80) - if not gfile.IsDirectory(output_dir): - print('Path {} does not exist. Creating'.format(output_dir)) - gfile.MakeDirs(output_dir) - elif FLAGS.reset_output_dir: - print('Path {} exists. Reseting'.format(output_dir)) - gfile.DeleteRecursively(output_dir) - gfile.MakeDirs(output_dir) - - print('-' * 80) - log_file = os.path.join(output_dir, 'stdout') - print('Logging to {}'.format(log_file)) - sys.stdout = utils.Logger(log_file) - - params = contrib_training.HParams( - data_path=FLAGS.data_path, - log_every=FLAGS.log_every, - output_dir=FLAGS.output_dir, - ) - - train(params) - - -if __name__ == '__main__': - tf.app.run() diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py deleted file mode 100644 index 576b6f2e2..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/lstm_lib.py +++ /dev/null @@ -1,458 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""AWD LSTM model.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * -from npu_bridge.estimator.npu import npu_convert_dropout - -import numpy as np -import tensorflow.compat.v1 as tf - -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils - -MOVING_AVERAGE_DECAY = 0.9995 - -MOVING_AVERAGE_DECAY = 0.9995 - - -def _gen_mask(shape, drop_prob): - """Generate a droppout mask.""" - keep_prob = 1. - drop_prob - mask = tf.random_uniform(shape, dtype=tf.float32) - mask = tf.floor(mask + keep_prob) / keep_prob - return mask - - -def _lstm(x, prev_c, prev_h, w_lstm, layer_masks): - """Multi-layer LSTM. 
- - Args: - x: [batch_size, num_steps, hidden_size]. - prev_c: [[batch_size, hidden_size] * num_layers]. - prev_h: [[batch_size, hidden_size] * num_layers]. - w_lstm: [[2 * hidden_size, 4 * hidden_size] * num_layers]. - layer_masks: [([hidden_size, hidden_size] or None)* num_layers]. - - Returns: - next_c: [[batch_size, hidden_size] * num_layers]. - next_h: [[batch_size, hidden_size] * num_layers]. - all_h: [batch_size, num_steps, hidden_size]. - """ - _, num_steps, _ = tf.unstack(tf.shape(x)) - num_layers = len(w_lstm) - - all_h = [tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) - for _ in range(num_layers)] - - def _condition(step, *unused_args): - return tf.less(step, num_steps) - - def _body(step, pprev_c, pprev_h, all_h): - """Apply LSTM at each step.""" - next_c, next_h = [], [] - for layer_id, (p_c, p_h, w, m) in enumerate(zip( - pprev_c, pprev_h, w_lstm, layer_masks)): - inp = x[:, step, :] if layer_id == 0 else next_h[-1] - if m is not None: - inp *= m - ifog = tf.matmul(tf.concat([inp, p_h], axis=1), w) - i, f, o, g = tf.split(ifog, 4, axis=1) - i = tf.sigmoid(i) - f = tf.sigmoid(f) - o = tf.sigmoid(o) - g = tf.tanh(g) - c = i * g + f * p_c - h = o * tf.tanh(c) - all_h[layer_id] = all_h[layer_id].write(step, h) - next_c.append(c) - next_h.append(h) - return step + 1, next_c, next_h, all_h - - loop_inps = [tf.constant(0, dtype=tf.int32), prev_c, prev_h, all_h] - _, next_c, next_h, all_h = tf.while_loop(_condition, _body, loop_inps, - parallel_iterations=1) - all_h = [tf.transpose(h.stack(), [1, 0, 2]) - for h in all_h] - - return next_c, next_h, all_h - - -def _set_default_params(params): - """Set default parameters.""" - params.add_hparam('alpha', 2.) # activation L2 reg - params.add_hparam('best_valid_ppl_threshold', 7) - params.add_hparam('beta', 1.) # activation slowness reg - - params.add_hparam('batch_size', 12) - params.add_hparam('bptt_steps', 70) - - # for dropouts: dropping rate, NOT keeping rate - params.add_hparam('drop_e', 0.10) # word - params.add_hparam('drop_i', 0.65) # embeddings - params.add_hparam('drop_l', 0.30) # between layers - params.add_hparam('drop_o', 0.40) # output - params.add_hparam('drop_w', 0.50) # weight - - params.add_hparam('emb_size', 400) - params.add_hparam('start_decay_epoch', 14) - params.add_hparam('decay_every_epoch', 1) - params.add_hparam('decay_rate', 0.98) - params.add_hparam('grad_bound', 0.25) - params.add_hparam('hidden_size', 1100) - params.add_hparam('init_range', 0.1) - params.add_hparam('learning_rate', 20.) 
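# NOTE (illustrative numpy sketch, not part of the original file): the per-step
# update inside _lstm()'s _body above is the standard LSTM cell. For one layer,
# batch_size=1 and hidden_size=3 (made-up shapes), the same arithmetic reads:
#
#   import numpy as np
#   sigmoid = lambda z: 1. / (1. + np.exp(-z))
#   inp, p_h, p_c = np.ones((1, 3)), np.zeros((1, 3)), np.zeros((1, 3))
#   w = np.random.rand(6, 12)                                   # [inp + hid, 4 * hid]
#   i, f, o, g = np.split(np.concatenate([inp, p_h], 1) @ w, 4, axis=1)
#   c = sigmoid(i) * np.tanh(g) + sigmoid(f) * p_c              # new cell state
#   h = sigmoid(o) * np.tanh(c)                                 # new hidden state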
- params.add_hparam('num_layers', 3) - params.add_hparam('num_train_epochs', 500) - params.add_hparam('vocab_size', 10000) - - params.add_hparam('weight_decay', 1.2e-6) - return params - - -class LM(object): - """Language model.""" - - def __init__(self, params, x_train, x_valid, x_test, name='language_model'): - print('-' * 80) - print('Building LM') - - self.params = _set_default_params(params) - self.name = name - - # train data - (self.x_train, self.y_train, - self.num_train_batches, self.reset_start_idx, - self.should_reset, self.base_bptt) = data_utils.input_producer( - x_train, params.batch_size, params.bptt_steps, random_len=True) - params.add_hparam( - 'num_train_steps', self.num_train_batches * params.num_train_epochs) - - # valid data - (self.x_valid, self.y_valid, - self.num_valid_batches) = data_utils.input_producer( - x_valid, params.batch_size, params.bptt_steps) - - # test data - (self.x_test, self.y_test, - self.num_test_batches) = data_utils.input_producer(x_test, 1, 1) - - params.add_hparam('start_decay_step', - params.start_decay_epoch * self.num_train_batches) - params.add_hparam('decay_every_step', - params.decay_every_epoch * self.num_train_batches) - - self._build_params() - self._build_train() - self._build_valid() - self._build_test() - - def _build_params(self): - """Create and count model parameters.""" - print('-' * 80) - print('Building model params') - with tf.variable_scope(self.name): - with tf.variable_scope('embedding'): - initializer = tf.initializers.random_uniform( - -self.params.init_range, self.params.init_range) - w_emb = tf.get_variable( - 'w', [self.params.vocab_size, self.params.emb_size], - initializer=initializer) - dropped_w_emb = tf.layers.dropout( - w_emb, self.params.drop_e, [self.params.vocab_size, 1], - training=True) - - w_lstm = [] - dropped_w_lstm = [] - with tf.variable_scope('lstm'): - for i in range(self.params.num_layers): - inp_size = self.params.emb_size if i == 0 else self.params.hidden_size - hid_size = (self.params.emb_size if i == self.params.num_layers - 1 - else self.params.hidden_size) - init_range = 1.0 / np.sqrt(hid_size) - initializer = tf.initializers.random_uniform(-init_range, init_range) - with tf.variable_scope('layer_{0}'.format(i)): - w = tf.get_variable('w', [inp_size + hid_size, 4 * hid_size], - initializer=initializer) - i_mask = tf.ones([inp_size, 4 * hid_size], dtype=tf.float32) - h_mask = _gen_mask([hid_size, 4 * hid_size], self.params.drop_w) - mask = tf.concat([i_mask, h_mask], axis=0) - dropped_w = w * mask - w_lstm.append(w) - dropped_w_lstm.append(dropped_w) - - with tf.variable_scope('init_states'): - batch_prev_c, batch_prev_h, batch_reset = [], [], [] - test_prev_c, test_prev_h, test_reset = [], [], [] - for i in range(self.params.num_layers): - inp_size = self.params.emb_size if i == 0 else self.params.hidden_size - hid_size = (self.params.emb_size if i == self.params.num_layers - 1 - else self.params.hidden_size) - - with tf.variable_scope('layer_{0}'.format(i)): - with tf.variable_scope('batch'): - init_shape = [self.params.batch_size, hid_size] - batch_prev_c.append(tf.get_variable( - 'c', init_shape, dtype=tf.float32, trainable=False)) - batch_prev_h.append(tf.get_variable( - 'h', init_shape, dtype=tf.float32, trainable=False)) - zeros = np.zeros(init_shape, dtype=np.float32) - batch_reset.append(tf.assign(batch_prev_c[-1], zeros)) - batch_reset.append(tf.assign(batch_prev_h[-1], zeros)) - with tf.variable_scope('test'): - init_shape = [1, hid_size] - test_prev_c.append(tf.get_variable( - 'c', 
init_shape, dtype=tf.float32, trainable=False)) - test_prev_h.append(tf.get_variable( - 'h', init_shape, dtype=tf.float32, trainable=False)) - zeros = np.zeros(init_shape, dtype=np.float32) - test_reset.append(tf.assign(test_prev_c[-1], zeros)) - test_reset.append(tf.assign(test_prev_h[-1], zeros)) - - num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) - print('Model has {0} params'.format(num_params)) - - self.batch_init_states = { - 'c': batch_prev_c, - 'h': batch_prev_h, - 'reset': batch_reset, - } - self.train_params = { - 'w_emb': dropped_w_emb, - 'w_lstm': dropped_w_lstm, - 'w_soft': w_emb, - } - self.test_init_states = { - 'c': test_prev_c, - 'h': test_prev_h, - 'reset': test_reset, - } - self.eval_params = { - 'w_emb': w_emb, - 'w_lstm': w_lstm, - 'w_soft': w_emb, - } - - def _forward(self, x, y, model_params, init_states, is_training=False): - """Computes the logits. - - Args: - x: [batch_size, num_steps], input batch. - y: [batch_size, num_steps], output batch. - model_params: a `dict` of params to use. - init_states: a `dict` of params to use. - is_training: if `True`, will apply regularizations. - - Returns: - loss: scalar, cross-entropy loss - """ - w_emb = model_params['w_emb'] - w_lstm = model_params['w_lstm'] - w_soft = model_params['w_soft'] - prev_c = init_states['c'] - prev_h = init_states['h'] - - emb = tf.nn.embedding_lookup(w_emb, x) - if is_training: - emb = tf.layers.dropout( - emb, self.params.drop_i, - [self.params.batch_size, 1, self.params.emb_size], training=True) - - layer_masks = [None] - for _ in range(1, self.params.num_layers - 1): - mask = _gen_mask([self.params.batch_size, self.params.hidden_size], - self.params.drop_l) - layer_masks.append(mask) - layer_masks.append(None) - else: - layer_masks = [None] * self.params.num_layers - - out_c, out_h, all_h = _lstm(emb, prev_c, prev_h, w_lstm, layer_masks) - top_h = all_h[-1] - if is_training: - top_h = tf.layers.dropout( - top_h, self.params.drop_o, - [self.params.batch_size, 1, self.params.emb_size], training=True) - - carry_on = [] - for var, val in zip(prev_c + prev_h, out_c + out_h): - carry_on.append(tf.assign(var, val)) - - logits = tf.einsum('bnh,vh->bnv', top_h, w_soft) - loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, - logits=logits) - loss = tf.reduce_mean(loss) # TODO(hyhieu): watch for num_steps - - reg_loss = loss # loss + regularization_terms, for training only - if is_training: - # L2 weight reg - reg_loss += self.params.weight_decay * tf.add_n( - [tf.reduce_sum(w ** 2) for w in tf.trainable_variables()]) - - # activation L2 reg - reg_loss += self.params.alpha * tf.add_n( - [tf.reduce_mean(h ** 2) for h in all_h[:-1]]) - - # activation slowness L2 reg - reg_loss += self.params.beta * tf.add_n( - [tf.reduce_mean((h[:, 1:, :] - h[:, :-1, :]) ** 2) - for h in all_h[:-1]]) - - with tf.control_dependencies(carry_on): - loss = tf.identity(loss) - if is_training: - reg_loss = tf.identity(reg_loss) - - return reg_loss, loss - - def _build_train(self): - """Build training ops.""" - print('-' * 80) - print('Building train graph') - reg_loss, loss = self._forward(self.x_train, self.y_train, - self.train_params, self.batch_init_states, - is_training=True) - - tf_vars = tf.trainable_variables() - global_step = tf.train.get_or_create_global_step() - lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / - tf.cast(self.params.bptt_steps, dtype=tf.float32)) - learning_rate = utils.get_lr(global_step, self.params) * lr_scale - # learning_rate = tf.Print( - # 
learning_rate, - # [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)], - # message='lr: ', summarize=3) - grads = tf.gradients(reg_loss, tf_vars) - clipped_grads, grad_norm = tf.clip_by_global_norm(grads, - self.params.grad_bound) - - (self.update_moving_avg_ops, self.use_moving_avg_vars, - self.restore_normal_vars) = self._create_average_ops() - optimizer = tf.train.GradientDescentOptimizer(learning_rate) - train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), - global_step=global_step) - - self.train_loss = loss - self.train_op = train_op - self.grad_norm = grad_norm - self.learning_rate = learning_rate - - def _create_average_ops(self): - """Build moving average ops.""" - print('Creating moving average ops') - - with tf.variable_scope('moving_avg_flag'): - self.moving_avg_started = tf.get_variable( - 'flag', [], tf.int32, initializer=tf.initializers.zeros(), - trainable=False) - self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) - - all_vars = tf.trainable_variables() - average_pairs = [] - var_cnt = 0 - with tf.variable_scope('average'): - for v in all_vars: - avg_v = tf.get_variable( - str(var_cnt), shape=v.shape, dtype=v.dtype, - initializer=tf.zeros_initializer, trainable=False) - var_cnt += 1 - average_pairs.append([v, avg_v]) - backup_pairs = [] - var_cnt = 0 - with tf.variable_scope('backup'): - for v in all_vars: - backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, - trainable=False) - var_cnt += 1 - backup_pairs.append([v, backup_v]) - - with tf.variable_scope('avg_step'): - avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) - - with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): - average_op = [] - for v, avg_v in average_pairs: - mu = 1 / avg_step - new_avg = mu * v + (1 - mu) * avg_v - with tf.control_dependencies([new_avg]): - average_op.append(tf.assign(avg_v, new_avg)) - - assert len(average_pairs) == len(all_vars) - assert len(average_pairs) == len(backup_pairs) - use_average_op = [] - for i in range(len(average_pairs)): - v, avg_v = average_pairs[i] - _, backup_v = backup_pairs[i] - with tf.control_dependencies([tf.assign(backup_v, v)]): - use_average_op.append(tf.assign(v, avg_v)) - use_average_op = tf.group(*use_average_op) - - reverse_average_op = [] - for v, backup_v in backup_pairs: - reverse_average_op.append(tf.assign(v, backup_v)) - reverse_average_op = tf.group(*reverse_average_op) - - return average_op, use_average_op, reverse_average_op - - def _build_valid(self): - print('Building valid graph') - _, loss = self._forward(self.x_valid, self.y_valid, - self.eval_params, self.batch_init_states) - self.valid_loss = loss - - def _build_test(self): - print('Building test graph') - _, loss = self._forward(self.x_test, self.y_test, - self.eval_params, self.test_init_states) - self.test_loss = loss - - def eval_valid(self, sess, use_moving_avg=False): - """Eval 1 round on valid set.""" - total_loss = 0 - if use_moving_avg: - sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) - for _ in range(self.num_valid_batches): - total_loss += sess.run(self.valid_loss) - valid_ppl = np.exp(total_loss / self.num_valid_batches) - print('valid_ppl={0:<.2f}'.format(valid_ppl)) - if use_moving_avg: - sess.run(self.restore_normal_vars) - - return valid_ppl - - def eval_test(self, sess, use_moving_avg=False): - """Eval 1 round on test set.""" - total_loss = 0 - if use_moving_avg: - sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) - for step in 
range(self.num_test_batches): - total_loss += sess.run(self.test_loss) - if (step + 1) % 1000 == 0: - test_ppl = np.exp(total_loss / (step + 1)) - log_string = 'step={0}'.format(step + 1) - log_string += ' test_ppl={0:<.2f}'.format(test_ppl) - print(log_string) - test_ppl = np.exp(total_loss / self.num_valid_batches) - log_string = 'step={0}'.format(self.num_test_batches) - log_string += ' test_ppl={0:<.2f}'.format(test_ppl) - print(log_string) - if use_moving_avg: - sess.run(self.restore_normal_vars) - - return test_ppl diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py deleted file mode 100644 index 9a8804313..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/process.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Preprocess Penn-Treebank dataset.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import pickle -import numpy as np -import os - - -def main(): - dataFolder = "/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/dataset/Penn_Treebank_dataset" - dataList = os.listdir(dataFolder) - dataPath = {} - for dataName in dataList: - dataPath[dataName] = os.path.join(dataFolder, dataName) - - with open(dataPath.get("ptb.train.txt")) as finp: - lines = finp.read().strip().replace('\n', '') - words = lines.split(' ') - - vocab, index = {}, {} - for word in sorted(words): - if word not in vocab: - index[len(vocab)] = word - vocab[word] = len(vocab) - print('vocab size: {}'.format(len(vocab))) - - x_train = [vocab[word] for word in words] + [vocab['']] - x_train = np.array(x_train, dtype=np.int32) - - with open(dataPath.get('ptb.valid.txt')) as finp: - lines = finp.read().strip().replace('\n', '') - words = lines.split(' ') - - x_valid = [vocab[word] for word in words] + [vocab['']] - x_valid = np.array(x_valid, dtype=np.int32) - - with open(dataPath.get("ptb.test.txt")) as finp: - lines = finp.read().strip().replace('\n', '') - words = lines.split(' ') - - x_test = [vocab[word] for word in words] + [vocab['']] - x_test = np.array(x_test, dtype=np.int32) - - print('train size: {}'.format(np.size(x_train))) - print('valid size: {}'.format(np.size(x_valid))) - print('test size: {}'.format(np.size(x_test))) - - with open('ptb/ptb.pkl', 'wb') as fout: - pickle.dump((x_train, x_valid, x_test, vocab, index), fout, protocol=2) - - -if __name__ == '__main__': - main() diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py deleted file mode 100644 index 4d73e2b37..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.py +++ /dev/null @@ -1,288 +0,0 @@ -# 
coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Entry point for AWD ENAS search process.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * -from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig - -import os -import pickle -import sys -import time - -sys.path.append("/home/ma-user/modelarts/user-job-dir/") - -import numpy as np -import tensorflow.compat.v1 as tf - -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import child -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import controller -from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils -from tensorflow.contrib import training as contrib_training - - -flags = tf.app.flags -gfile = tf.gfile -FLAGS = flags.FLAGS - -## Required parameters -flags.DEFINE_string('output_dir', None, '') -flags.DEFINE_string('data_path', None, '') -flags.DEFINE_string("obs_dir", "obs://rstg/log", "obs result path, not need on gpu and apulis platform") - - -## Other parametersresult -flags.DEFINE_boolean('reset_output_dir', False, '') -flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. 
Modelarts Platform has some extra data copy operations") - -flags.DEFINE_integer('log_every', 20, '') - - - -def get_ops(params, x_train, x_valid): - """Build [train, valid, test] graphs.""" - - ct = controller.Controller(params=params) - lm = child.LM(params, ct, x_train, x_valid) - ct.build_trainer(lm) - params.add_hparam('num_train_batches', lm.num_train_batches) - ops = { - 'train_op': lm.train_op, - 'learning_rate': lm.learning_rate, - 'grad_norm': lm.grad_norm, - 'train_loss': lm.train_loss, - 'l2_reg_loss': lm.l2_reg_loss, - 'global_step': tf.train.get_or_create_global_step(), - 'reset_batch_states': lm.batch_init_states['reset'], - 'eval_valid': lm.eval_valid, - - 'reset_start_idx': lm.reset_start_idx, - 'should_reset': lm.should_reset, - 'bptt_rate': lm.bptt_rate, - - 'controller_train_op': ct.train_op, - 'controller_grad_norm': ct.train_op, - 'controller_sample_arc': ct.sample_arc, - 'controller_entropy': ct.sample_entropy, - 'controller_reward': ct.reward, - 'controller_baseline': ct.baseline, - 'controller_optimizer': ct.optimizer, - 'controller_train_fn': ct.train, - - } - print('-' * 80) - print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) - - return ops - -def load_ckpt_model(sess, save_path): - print("reload model from:{}".format(save_path)) - checkpoint = tf.train.get_checkpoint_state(save_path) # read the checkpoint state from the checkpoint file - input_checkpoint = checkpoint.model_checkpoint_path - saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # load the saved graph structure - saver.restore(sess, input_checkpoint) # restore the latest weights - sess.run(tf.global_variables_initializer())# initialize all variables - -def train(params): - """Entry train function.""" - print("data_path:{}".format(params.data_path)) - print("output_dir:{}".format(params.output_dir)) - with gfile.GFile(params.data_path, 'rb') as finp: - x_train, x_valid, _, _, _ = pickle.load(finp) - print('-' * 80) - print('train_size: {0}'.format(np.size(x_train))) - print('valid_size: {0}'.format(np.size(x_valid))) - - - g = tf.Graph() - with g.as_default(): - tf.random.set_random_seed(2126) - ops = get_ops(params, x_train, x_valid) - run_ops = [ - ops['train_loss'], - ops['l2_reg_loss'], - ops['grad_norm'], - ops['learning_rate'], - ops['should_reset'], - ops['train_op'], - ] - - saver = tf.train.Saver(max_to_keep=5) - checkpoint_saver_hook = tf.train.CheckpointSaverHook( - params.output_dir, save_steps=params.num_train_batches, saver=saver) - hooks = [checkpoint_saver_hook] - hooks.append(ops['controller_optimizer'].make_session_run_hook(True)) - - # >>> add code >> - # create the session - config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) - custom_op = config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - custom_op.parameter_map["use_off_line"].b = True # run training on the Ascend AI processor - custom_op.parameter_map["mix_compile_mode"].b = False # mixed CPU/NPU compilation; configure as needed, off by default - # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # enable mixed precision - custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fusion_switch.cfg") - # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("/home/ma-user/modelarts/inputs/data_url_0") - # - # custom_op.parameter_map["enable_dump_debug"].b = True - # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") - # # custom_op.parameter_map["enable_data_pre_proc"].b = True # sinking the getnext operator to the device is a prerequisite for loop sinking - #
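# NOTE (condensed recap, not additional code from the patch): with the optional
# dump/profiling switches stripped out, the NPU session setup in this function
# reduces to the calls already used above and just below:
#
#   config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
#   custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
#   custom_op.name = "NpuOptimizer"
#   custom_op.parameter_map["use_off_line"].b = True               # execute on the Ascend device
#   config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
#   config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
#   sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks,
#                                            checkpoint_dir=params.output_dir)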
# # custom_op.parameter_map[ - # # "iterations_per_loop"].i = 10 # keep this equal to the iterations_per_loop value set via set_iteration_per_loop; used to decide whether training iterations are sunk to the device - # - config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # must be disabled explicitly - config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF # must be disabled explicitly - # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir) - # >>> add code >> - - sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, - checkpoint_dir=params.output_dir) - # reload model - if len(gfile.ListDirectory(params.output_dir)): - last_checkpoint = tf.train.latest_checkpoint(params.output_dir) - print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) - saver.restore(sess, last_checkpoint) - - accum_loss = 0 - accum_step = 0 - epoch = sess.run(ops['global_step']) // params.num_train_batches - best_valid_ppl = [] - start_time = time.time() - last_mins = (time.time() - start_time) / 60 - accum_rate = 0. - # sess.run(tf.global_variables_initializer()) - while True: - try: - # run_ops = [ - # ops['train_loss'], - # ops['l2_reg_loss'], - # ops['grad_norm'], - # ops['learning_rate'], - # ops['should_reset'], - # ops['train_op'], - # ] - # changed here: the fused sess.run above is split into per-op runs below - # loss, l2_reg, gn, lr, should_reset, _ = sess.run(run_ops) - loss = sess.run(ops['train_loss']) - # print("loss_OK:loss:{}".format(loss)) - l2_reg = sess.run(ops['l2_reg_loss']) - # print("l2_reg_OK:l2_reg:{}".format(l2_reg)) - gn = sess.run(ops['grad_norm']) - # gn = -111111 - # print("gn_OK:gn:{}".format(gn)) - lr = sess.run(ops['learning_rate']) - # print("lr_OK:le:{}".format(lr)) - should_reset = sess.run(ops['should_reset']) - _ = sess.run(ops["train_op"]) - - bptt_rate = sess.run(ops['bptt_rate']) - # print("should_reset_OK:should_reset:{}".format(should_reset)) - # if not should_not_train : - # _ = sess.run(ops["train_op"]) - - accum_loss += loss - accum_step += 1 - accum_rate += bptt_rate - step = sess.run(ops['global_step']) - if step % params.log_every == 0: - train_ppl = np.exp(accum_loss / accum_step) - mins_so_far = (time.time() - start_time) / 60. - mins_pices = mins_so_far - last_mins - last_mins = mins_so_far - log_string = 'epoch={0:<5d}'.format(epoch) - log_string += ' step={0:<7d}/{1:<6d}'.format(step, params.num_train_steps) - log_string += ' ppl={0:<9.2f}'.format(train_ppl) - log_string += ' lr={0:<7.2f}'.format(lr) - log_string += ' |w|={0:<6.2f}'.format(l2_reg) - log_string += ' |g|={0:<6.2f}'.format(gn) - log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, mins_pices/params.log_every) - # log_string += ' accum_rate(rate of an epoch)={0:<4.4f}'.format(accum_rate) - # log_string += ' should_reset:{}'.format(should_reset) - print(log_string) - - if should_reset: - accum_rate=0.
- print("should_reset:{}".format(should_reset)) - ops['controller_train_fn'](sess, ops['reset_batch_states']) - epoch += 1 - accum_loss = 0 - accum_step = 0 - valid_ppl = ops['eval_valid'](sess) - sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) - best_valid_ppl.append(valid_ppl) - - if step % (params.num_train_batches * 10) == 0: - if FLAGS.platform.lower() == 'modelarts': - from help_modelarts import modelarts_result2obs - modelarts_result2obs(FLAGS) - if step >= params.num_train_steps: - if FLAGS.platform.lower() == 'modelarts': - from help_modelarts import modelarts_result2obs - modelarts_result2obs(FLAGS) - break - except tf.errors.InvalidArgumentError: - if FLAGS.platform.lower() == 'modelarts': - from help_modelarts import modelarts_result2obs - modelarts_result2obs(FLAGS) - last_checkpoint = tf.train.latest_checkpoint(params.output_dir) - print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) - saver.restore(sess, last_checkpoint) - sess.close() - - -def main(unused_args): - - tf.logging.set_verbosity(tf.logging.INFO) - tf.logging.info("**********") - print("===>>>data_path:{}".format(FLAGS.data_path)) - print("===>>>output_dir:{}".format(FLAGS.output_dir)) - print("===>>>obs_dir:{}".format(FLAGS.obs_dir)) - print("===>>>train_step:{}".format(FLAGS.num_train_epochs)) - - np.set_printoptions(precision=3, suppress=True, threshold=int(1e9), - linewidth=80) - - print('-' * 80) - if not gfile.IsDirectory(FLAGS.output_dir): - print('Path {} does not exist. Creating'.format(FLAGS.output_dir)) - gfile.MakeDirs(FLAGS.output_dir) - elif FLAGS.reset_output_dir: - print('Path {} exists. Reseting'.format(FLAGS.output_dir)) - gfile.DeleteRecursively(FLAGS.output_dir) - gfile.MakeDirs(FLAGS.output_dir) - - print('-' * 80) - log_file = os.path.join(FLAGS.output_dir, 'stdout') - print('Logging to {}'.format(log_file)) - sys.stdout = utils.Logger(log_file) - - params = contrib_training.HParams( - data_path=FLAGS.data_path, - log_every=FLAGS.log_every, - output_dir=FLAGS.output_dir, - ) - train(params) - - - -if __name__ == '__main__': - flags.mark_flag_as_required("data_path") - flags.mark_flag_as_required("output_dir") - flags.mark_flag_as_required("obs_dir") - tf.app.run() diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh deleted file mode 100644 index b5df7f14d..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/search.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. 
-### Modelarts Platform command for train - -#export ASCEND_GLOBAL_LOG_LEVEL=1 # log level: debug=0; info=1; warning=2; error=4 -export ASCEND_SLOG_PRINT_TO_STDOUT=0 # whether plog output is printed to the screen -#export ASCEND_GLOBAL_EVENT_ENABLE=0 # event logging: 0=disabled; 1=enabled - -export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config -#export ENABLE_FORCE_V2_CONTROL=1 - -code_dir=${1} -data_path=${2} -output_dir=${3} -obs_url=${4} - -current_time=`date "+%Y-%m-%d-%H-%M-%S"` - -python ${code_dir}/search.py \ - --data_path=${data_path}/ptb.pkl \ - --output_dir=${output_dir} \ - --obs_dir=${obs_url} \ - --platform='modelarts' \ - 2>&1 | tee ${output_dir}/${current_time}_train_npu.log - - -#BASE_PATH='/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' -# -#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/search' -# -#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' -# -#args="--output_dir=$OUTPUT_DIR --data_path=$DATA_PATH" -# -##run search -#python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py $args diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh deleted file mode 100644 index b3ed57170..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/test-npu.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already. -### Modelarts Platform command for train - -#export ASCEND_GLOBAL_LOG_LEVEL=4 # log level: debug=0; info=1; warning=2; error=3; null=4 -#export ASCEND_SLOG_PRINT_TO_STDOUT=1 # whether plog output is printed to the screen -#export ASCEND_HOST_LOG_FILE_NUM=1000 -#export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=0 -#export ASCEND_GLOBAL_EVENT_ENABLE=0 # event logging: 0=disabled; 1=enabled -#export ASCEND_GLOBAL_TRACE_ENABLE=0 -#export PROFILING_MODE=false -#export PROFILING_OPTIONS='{"output":"/tmp/profiling","training_trace":"off","task_trace":"off","aicpu":"on","fp_point":"resnet_model/conv2d/Conv2Dresnet_model/batch_normalization/FusedBatchNormV3_Reduce","bp_point":"gradients/AddN_70","aic_metrics":"PipeUtilization"}' - -export TF_CPP_MIN_LOG_LEVEL=2 ## Tensorflow api print Log Config -#export ENABLE_FORCE_V2_CONTROL=1 - -code_dir=${1} -data_path=${2} -output_dir=${3} -ckp_path=${4} - -current_time=`date "+%Y-%m-%d-%H-%M-%S"` -FIXED_ARC='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' - -nohup python3 ${code_dir}/fixed.py \ - --data_path=${data_path}/ptb.pkl \ - --output_dir=${output_dir} \ - --fixed_arc='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' \ - --ckp_path=${ckp_path} \ - --platform='modelarts' \ - > nohup1.out 2>&1 & - - -#FIXED_ARC='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' -# -#BASE_PATH = '/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow' -# -#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/test' -# -#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl' -# -#args ='--fixed_arc=FIXED_ARC --output_dir=$OUTPUT_DIR --data_path=$DATA_PATH' -# -##run test -#python3 /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py $args diff --git a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py b/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py deleted file mode 100644 index 7b59aec44..000000000 --- a/contrib_old/TensorFlow/Research/nlp/enas/ENAS_ID2053_for_TensorFlow/utils.py +++
/dev/null @@ -1,67 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Common utils.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import re -import sys -import tensorflow.compat.v1 as tf - -gfile = tf.gfile - - -class Logger(object): - """Prints to both STDOUT and a file.""" - - def __init__(self, filepath): - self.terminal = sys.stdout - self.log = gfile.GFile(filepath, 'a+') - - def write(self, message): - self.terminal.write(message) - self.terminal.flush() - self.log.write(message) - self.log.flush() - - def flush(self): - self.terminal.flush() - self.log.flush() - - -def get_lr(curr_step, params, lr_decay_rate): - """Compute learning rate at step depends on `params`.""" - lr = tf.constant(params.learning_rate, dtype=tf.float32) - if 'num_warmup_steps' in params and params.num_warmup_steps > 0: - num_warmup_steps = tf.cast(params.num_warmup_steps, dtype=tf.float32) - step = tf.cast(curr_step, dtype=tf.float32) - warmup_lr = params.learning_rate * step / num_warmup_steps - lr = tf.cond(tf.less(step, num_warmup_steps), lambda: warmup_lr, lambda: lr) - return lr * lr_decay_rate - - -def strip_var_name(var_name): - """Strips variable name of sub-strings blocking variable name matching.""" - # Strip trailing number, e.g. convert - # 'lstm/W_0:0' to 'lstm/W_0'. - var_name = re.sub(r':\d+$', '', var_name) - # Strip partitioning info, e.g. convert - # 'W_0/part_3/Adagrad' to 'W_0/Adagrad'. - var_name = re.sub(r'/part_\d+', '', var_name) - return var_name -- Gitee
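The utils.py module removed above is small enough to sanity-check in isolation. The snippet below is an illustrative sketch (the function body is copied from the deleted file; the sample inputs come from its own comments) showing what strip_var_name() does to the two documented cases:

    import re

    def strip_var_name(var_name):
        var_name = re.sub(r':\d+$', '', var_name)      # drop a trailing ':0'-style suffix
        var_name = re.sub(r'/part_\d+', '', var_name)  # drop partitioning segments
        return var_name

    print(strip_var_name('lstm/W_0:0'))          # -> lstm/W_0
    print(strip_var_name('W_0/part_3/Adagrad'))  # -> W_0/Adagrad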