diff --git a/contrib/.keep b/contrib/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/contrib/TensorFlow/.keep b/contrib/TensorFlow/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/contrib/TensorFlow/Research/.keep b/contrib/TensorFlow/Research/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/contrib/TensorFlow/Research/nlp/.keep b/contrib/TensorFlow/Research/nlp/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..051fc7c8ffa92af932f37bb85aedb0b52905ed29
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/README.md
@@ -0,0 +1,211 @@
+### Basic Information
+#### Publisher: Huawei
+#### Application Domain: NLP
+#### Modified: 2018.
+#### Framework: TensorFlow 1.15.0
+#### Model Format: ckpt
+#### Precision: Mixed
+#### Processor: Ascend 910
+#### Categories: Research
+#### Description: ENAS model for neural architecture search on the PTB dataset
+
+### Overview
+ENAS is a fast and efficient neural architecture search method. By sampling subgraphs and sharing weights among them, it greatly improves the efficiency of architecture search; it discovered new network architectures on the PTB and CIFAR-10 datasets and reached new state-of-the-art results.
+
+#### Reference paper: [Efficient Neural Architecture Search via Parameter Sharing](http://proceedings.mlr.press/v80/pham18a/pham18a.pdf)
+#### Reference code: [enas](https://github.com/melodyguan/enas)
+
+### Default Configuration
+#### Data preprocessing
+
+ - Input data is text
+ - Text input format: id [int]
+#### Training hyperparameters
+ - ##### search
+   - controller baseline decay : 0.999
+   - controller entropy weight : 1e-5
+   - controller temperature : 5
+   - controller learning rate : 5e-5
+   - controller num layers : 9
+   - controller hidden size : 64
+   - controller num functions : 4
+   - child batch size : 128
+   - child bptt steps : 35
+   - num train epochs : 600
+ - ##### test
+
+   - child grad bound : 0.25
+   - child weight decay : 2e-6
+   - child num train epochs : 3000
+   - child hidden size : 800
+   - learning_rate : 20.
+
+### Supported Features
+
+| Feature | Supported |
+|------|------|
+| Mixed precision | Yes |
+
+### Mixed Precision Training
+
+The Ascend 910 AI processor provides automatic mixed precision. Following a built-in optimization policy, it automatically lowers selected float32 operators in the network to float16, which improves performance and reduces memory usage with very little loss of accuracy.
+
+### Quick Start
+Both the search stage and the test stage of the model use the PTB dataset. The raw data must be preprocessed with the process.py script; the processed data can also be obtained from obs://rstg/Dataset/ptb.
+
+### Code Structure
+
+|— search.py architecture search code\
+|— child.py child (subgraph) model code\
+|— fixed.py fixed-architecture validation code\
+|— fixed_lib.py\
+|— data_utils.py data processing code\
+|— controller.py performance evaluation (controller) model code\
+|— boot_modelarts.py model launch code\
+|— ...
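+The processed ptb.pkl used by the scripts above is a Python pickle holding the corpus as flat arrays of integer word ids. As a quick sanity check, the sketch below reads it the same way search.py and fixed.py do (the pickle stores five objects; the training scripts only use the first three splits and ignore the remaining two):
+
+```python
+import pickle
+
+import numpy as np
+
+# Load the preprocessed PTB corpus (integer word ids, not raw text).
+with open('ptb/ptb.pkl', 'rb') as finp:
+    x_train, x_valid, x_test, _, _ = pickle.load(finp)
+
+print('train_size:', np.size(x_train))
+print('valid_size:', np.size(x_valid))
+print(' test_size:', np.size(x_test))
+```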
+
+### Script Parameters
+
+- search:\
+--data_path\
+--output_dir\
+--obs_dir
+- test:\
+--data_path\
+--output_dir\
+--fixed_arc\
+--ckp_path
+
+
+
+### Training Process
+With the parameter settings from the paper, training on GPU meets the accuracy and speed requirements;
+training on NPU does not yet meet the targets.
+
+- #### GPU
+  - ##### search
+epoch=0 step=200 /124200 ppl=1950.47 lr=17.14 |w|=0.44 |g|=2.84 mins=0.53\
+valid_ppl=1800.73\
+epoch=1 step=400 /124200 ppl=1187.87 lr=22.86 |w|=0.64 |g|=0.81 mins=1.46\
+valid_ppl=892.87\
+epoch=2 step=600 /124200 ppl=1065.44 lr=18.29 |w|=0.82 |g|=0.35 mins=2.36\
+valid_ppl=843.70\
+epoch=3 step=800 /124200 ppl=953.38 lr=14.86 |w|=1.14 |g|=0.31 mins=3.25\
+valid_ppl=898.45\
+epoch=4 step=1000 /124200 ppl=949.04 lr=20.57 |w|=1.72 |g|=0.31 mins=4.15\
+valid_ppl=774.25\
+epoch=5 step=1200 /124200 ppl=876.15 lr=20.00 |w|=3.69 |g|=0.30 mins=5.04\
+valid_ppl=622.82\
+epoch=6 step=1400 /124200 ppl=838.09 lr=24.00 |w|=6.94 |g|=0.67 mins=5.92\
+valid_ppl=606.77\
+epoch=7 step=1600 /124200 ppl=764.65 lr=21.14 |w|=11.46 |g|=0.36 mins=6.81\
+valid_ppl=579.69\
+epoch=8 step=1800 /124200 ppl=762.31 lr=20.00 |w|=17.41 |g|=0.29 mins=7.71\
+valid_ppl=520.63\
+epoch=9 step=2000 /124200 ppl=695.99 lr=24.00 |w|=27.42 |g|=0.20 mins=8.61\
+...\
+valid_ppl=162.39\
+epoch=574 step=124200 /124200 ppl=244.80 lr=21.71 |w|=6348.55 |g|=0.15 mins=536.70
+- ##### test
+epoch=0 step=200 ppl=1779.60 lr=20.000 |g|=0.234 avg=0 mins=0.34\
+epoch=0 step=400 ppl=1223.42 lr=20.571 |g|=0.407 avg=0 mins=0.64\
+valid_ppl=463.03\
+epoch=1 step=600 ppl=595.22 lr=9.714 |g|=0.483 avg=0 mins=0.98\
+epoch=1 step=800 ppl=545.60 lr=24.000 |g|=0.223 avg=0 mins=1.28\
+valid_ppl=339.76\
+epoch=2 step=1000 ppl=436.82 lr=21.714 |g|=0.332 avg=0 mins=1.61\
+epoch=2 step=1200 ppl=411.70 lr=14.286 |g|=0.274 avg=0 mins=1.91\
+valid_ppl=271.71\
+epoch=3 step=1400 ppl=365.17 lr=18.857 |g|=0.291 avg=0 mins=2.24\
+epoch=3 step=1600 ppl=347.84 lr=14.857 |g|=0.247 avg=0 mins=2.54\
+valid_ppl=245.00\
+epoch=4 step=1800 ppl=321.47 lr=17.143 |g|=0.238 avg=0 mins=2.87\
+epoch=4 step=2000 ppl=307.67 lr=18.286 |g|=0.237 avg=0 mins=3.18\
+valid_ppl=213.10\
+epoch=5 step=2200 ppl=296.59 lr=17.714 |g|=0.259 avg=0 mins=3.51\
+epoch=5 step=2400 ppl=281.99 lr=15.429 |g|=0.263 avg=0 mins=3.81\
+epoch=6 step=2600 ppl=280.63 lr=22.857 |g|=0.234 avg=0 mins=4.12\
+valid_ppl=209.90\
+epoch=6 step=2800 ppl=261.67 lr=20.000 |g|=0.232 avg=0 mins=4.44\
+epoch=7 step=3000 ppl=262.83 lr=16.000 |g|=0.313 avg=0 mins=4.75\
+valid_ppl=181.99\
+epoch=7 step=3200 ppl=249.74 lr=8.571 |g|=0.367 avg=0 mins=5.07\
+epoch=8 step=3400 ppl=248.14 lr=17.714 |g|=0.248 avg=0 mins=5.37\
+valid_ppl=176.79\
+epoch=8 step=3600 ppl=243.44 lr=17.714 |g|=0.260 avg=0 mins=5.69\
+epoch=9 step=3800 ppl=236.51 lr=17.143 |g|=0.299 avg=0 mins=6.00\
+valid_ppl=166.62\
+...\
+epoch=2997 step=1241000 ppl=51.39 lr=21.714 |g|=0.333 avg=1 mins=2160.67\
+epoch=2998 step=1241200 ppl=48.44 lr=21.714 |g|=0.336 avg=1 mins=2161.02\
+valid_ppl=61.17\
+epoch=2998 step=1241400 ppl=54.42 lr=22.857 |g|=0.322 avg=1 mins=2161.37\
+epoch=2999 step=1241600 ppl=48.16 lr=21.714 |g|=0.339 avg=1 mins=2161.70\
+epoch=2999 step=1241800 ppl=49.21 lr=21.714 |g|=0.340 avg=1 mins=2162.04\
+valid_ppl=61.17\
+epoch=3000 step=1242000 ppl=48.24 lr=22.286 |g|=0.332 avg=1 mins=2162.40\
+...\
+step=70000 test_ppl=59.15\
+step=71000 test_ppl=59.03\
+step=72000 test_ppl=59.06\
+step=73000 test_ppl=58.41\
+step=74000 test_ppl=58.24\
+step=75000 test_ppl=58.12\
+step=76000 test_ppl=58.15\
+step=77000 test_ppl=58.29\
+step=78000 test_ppl=58.36\
+step=79000 test_ppl=58.50\
+step=80000 test_ppl=58.43\
+step=81000
test_ppl=58.72\ +step=82000 test_ppl=58.52\ +step=82429 test_ppl=58.64 + +- #### NPU + - ##### test +epoch=0 step=200/453000 ppl=6106.61 lr=46.250 |g|=0.171 avg=0 mins=5.33-min/step=0.0211\ +epoch=0 step=400/453000 ppl=1966.93 lr=40.000 |g|=0.193 avg=0 mins=9.44-min/step=0.0204\ +valid_ppl=389.49\ +epoch=1 step=600/453000 ppl=405.67 lr=42.500 |g|=0.195 avg=0 mins=14.69-min/step=0.0208\ +epoch=1 step=800/453000 ppl=369.30 lr=38.750 |g|=0.207 avg=0 mins=18.93-min/step=0.0212\ +valid_ppl=298.25\ +epoch=2 step=1000/453000 ppl=299.71 lr=38.750 |g|=0.222 avg=0 mins=23.45-min/step=0.0243\ +epoch=2 step=1200/453000 ppl=281.29 lr=45.000 |g|=0.177 avg=0 mins=27.68-min/step=0.0210\ +epoch=2 step=1400/453000 ppl=274.65 lr=43.750 |g|=0.270 avg=0 mins=31.83-min/step=0.0211\ +valid_ppl=236.61\ +epoch=3 step=1600/453000 ppl=243.76 lr=33.750 |g|=0.209 avg=0 mins=36.26-min/step=0.0208\ +epoch=3 step=1800/453000 ppl=240.20 lr=33.750 |g|=0.222 avg=0 mins=40.45-min/step=0.0211\ +valid_ppl=252.75\ +epoch=4 step=2000/453000 ppl=228.79 lr=40.000 |g|=0.214 avg=0 mins=44.94-min/step=0.0205\ +epoch=4 step=2200/453000 ppl=222.90 lr=40.000 |g|=0.211 avg=0 mins=49.15-min/step=0.0210\ +valid_ppl=197.03\ +epoch=5 step=2400/453000 ppl=219.08 lr=40.000 |g|=0.199 avg=0 mins=53.66-min/step=0.0245\ +epoch=5 step=2600/453000 ppl=204.19 lr=32.500 |g|=0.219 avg=0 mins=57.78-min/step=0.0209\ +epoch=5 step=2800/453000 ppl=206.65 lr=33.750 |g|=0.225 avg=0 mins=61.98-min/step=0.0210\ +valid_ppl=191.64\ +epoch=6 step=3000/453000 ppl=197.33 lr=45.000 |g|=0.201 avg=0 mins=66.49-min/step=0.0207\ +epoch=6 step=3200/453000 ppl=194.74 lr=38.750 |g|=0.212 avg=0 mins=70.64-min/step=0.0211\ +valid_ppl=200.02\ +epoch=7 step=3400/453000 ppl=191.74 lr=35.000 |g|=0.208 avg=0 mins=75.13-min/step=0.0240\ +epoch=7 step=3600/453000 ppl=186.42 lr=41.250 |g|=0.185 avg=0 mins=79.25-min/step=0.0205\ +valid_ppl=201.46\ +epoch=8 step=3800/453000 ppl=204.60 lr=46.250 |g|=0.225 avg=0 mins=83.78-min/step=0.0243\ +epoch=8 step=4000/453000 ppl=177.41 lr=32.500 |g|=0.236 avg=0 mins=87.95-min/step=0.0208\ +epoch=8 step=4200/453000 ppl=180.42 lr=36.250 |g|=0.207 avg=0 mins=92.05-min/step=0.0207\ +valid_ppl=175.82\ +epoch=9 step=4400/453000 ppl=180.36 lr=35.000 |g|=0.350 avg=0 mins=96.54-min/step=0.0208\ +epoch=9 step=4600/453000 ppl=173.57 lr=42.500 |g|=0.188 avg=0 mins=100.67-min/step=0.0206\ +valid_ppl=209.94\ +epoch=10 step=4800/453000 ppl=170.76 lr=38.750 |g|=0.207 avg=0 mins=105.17-min/step=0.0243\ +epoch=10 step=5000/453000 ppl=167.46 lr=32.500 |g|=0.244 avg=0 mins=109.31-min/step=0.0207\ +epoch=10 step=5200/453000 ppl=169.23 lr=43.750 |g|=0.235 avg=0 mins=113.42-min/step=0.0203\ +valid_ppl=167.50\ +...\ +valid_ppl=112.40\ +epoch=270 step=128000/453000 ppl=98.60 lr=31.389 |g|=0.316 avg=1 mins=2925.00-min/step=0.0222\ +epoch=270 step=128200/453000 ppl=95.14 lr=26.773 |g|=0.556 avg=1 mins=2929.33-min/step=0.0211\ +valid_ppl=113.40\ +epoch=271 step=128400/453000 ppl=98.80 lr=32.312 |g|=0.319 avg=1 mins=2934.05-min/step=0.0257\ +epoch=271 step=128600/453000 ppl=92.38 lr=28.620 |g|=0.328 avg=1 mins=2938.40-min/step=0.0214\ +epoch=271 step=128800/453000 ppl=96.70 lr=28.620 |g|=0.350 avg=1 mins=2942.81-min/step=0.0218\ +valid_ppl=113.22\ +epoch=272 step=129000/453000 ppl=96.46 lr=29.543 |g|=0.316 avg=1 mins=2947.51-min/step=0.0218 \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py new file mode 100644 index 
0000000000000000000000000000000000000000..4291e14e43acd7ac936f698adf2b834e3c003470 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/RUN_SEARCH.py @@ -0,0 +1,36 @@ +from npu_bridge.npu_init import * +import os +import sys, getopt + + +def main(argv): + # print(argv) + # argv_ = ['-t', 'search'] + runType = "" + try: + opts, args = getopt.getopt(argv, "ht:", ["trun="]) + except getopt.GetoptError: + print("getopt.GetoptError!!") + print("useage: (sudo) python(3) pythonFileName.py -t ") + sys.exit(2) + for opt, arg in opts: + if opt == '-h': + print("useage: pythonFileName.py -t ") + sys.exit() + elif opt in ("-t", "--trun"): + runType = arg + if runType == "search": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/search --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + elif runType == "test-npu": + print(f'runType={runType}!\n') + os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/output/test --data_path=/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl") + # os.system("python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py --fixed_arc = '0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2' --output_dir=$(pwd)/output/test --data_path=$(pwd)/ptb/ptb.pkl") + # print("this part is writing...") + # pass + else: + print("This runType is invaild!!!") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py new file mode 100644 index 0000000000000000000000000000000000000000..eaf741434e81a5783cd7f14b269bbc8e916459cd --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/bash.py @@ -0,0 +1,4 @@ +from npu_bridge.npu_init import * +import os + +os.system("bash /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.sh") \ No newline at end of file diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py new file mode 100644 index 0000000000000000000000000000000000000000..d65dfdded4cb00d21c8be6e2aef84cc12b0248ce --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/boot_modelarts.py @@ -0,0 +1,73 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This is the boot file for ModelArts platform. +Firstly, the train datasets are copyed from obs to ModelArts. 
+Then, the string of train shell command is concated and using 'os.system()' to execute +""" +import os +import time +import numpy as np +import argparse +from help_modelarts import obs_data2modelarts +# import moxing as mox +print(os.system('env')) +print(os.system("python3 --version")) +#print(os.system("pip install dlib")) +print("===>>>hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh") +os.environ['ASCEND_GLOBAL_LOG_LEVEL'] = '4' + +#data_dir = "/root/.keras/models/" +if __name__ == '__main__': + ## Note: the code dir is not the same as work dir on ModelArts Platform!!! + code_dir = '.' + work_dir = os.getcwd() + print("===>>>code_dir:{}, work_dir:{}".format(code_dir, work_dir)) + output_path = "./output/test/" + str(time.strftime('%Y%m%d_%H%M%S')) + parser = argparse.ArgumentParser() + parser.add_argument("--train_url", type=str, default=output_path) + parser.add_argument("--data_url", type=str, default="./ptb") + parser.add_argument("--ckp_path", type=str, default="./output/test/20220715_182127/") + # parser.add_argument("--ckp_path", type=str, default="obs://rstg/workplace_ENAS/lm-train/MA-new-enas-05-23-19-34/output/result/") + # parser.add_argument("--modelarts_data_dir", type=str, default="/cache/ptb-dataset") + # parser.add_argument("--modelarts_result_dir", type=str, default="/cache/result") + config = parser.parse_args() + #if not os.path.exists(data_dir): + # os.makedirs(data_dir) + # print("=nvvvvvvvvvvvvvfdsfdsfdvnn") + + #os.system("pip install -i http://repo.myhuaweicloud.com/repository/pypi/simple pexpect==4.2.1") + #os.system("pip install torch") + #os.system("pip install absl-py") + print("--------config---------hhhhhhhhhhhggggggggggggggggkkkkkkkkkkkkkkkkkkkkkkkkkgg-") + for k in list(vars(config).keys()): + print("key:{}: value:{}".format(k, vars(config)[k])) + print("--------config----------") + + ## copy dataset from obs to modelarts + # obs_data2modelarts(config) + # ret = mox.file.exists('obs://rstg/MA-new-p/') + # retm = mox.file.make_dirs('obs://rstg/MA-new-p/') + # print("bbbbbbbbbbbbbbbbbbbbbbbbb ",retm) + # print("config.modelarts_result_dir ", config.modelarts_result_dir) + ## start to train on Modelarts platform + # if not os.path.exists(config.modelarts_result_dir): + # os.makedirs(config.modelarts_result_dir) + # print("6666666666666666666666666666666666666666 ", config.modelarts_result_dir) + bash_header = os.path.join(code_dir, 'test-npu.sh') + # bash_header = os.path.join(code_dir, 'search.sh') + arg_url = '%s %s %s %s' % (code_dir, config.data_url, config.train_url, config.ckp_path) + bash_command = 'bash %s %s' % (bash_header, arg_url) + print("bash command:", bash_command) + os.system(bash_command) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py new file mode 100644 index 0000000000000000000000000000000000000000..09b6d878d9cf151d46999ce9fd57b46c9353feac --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/child.py @@ -0,0 +1,440 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + + +import numpy as np +import tensorflow.compat.v1 as tf +import tensorflow.keras as keras + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + + +flags = tf.app.flags +FLAGS = flags.FLAGS + + +flags.DEFINE_integer('child_batch_size', 128, '') +flags.DEFINE_integer('child_bptt_steps', 35, '') +flags.DEFINE_integer('num_train_epochs', 600, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(sample_arc, x, prev_s, w_prev, w_skip, input_mask, layer_mask, + params): + """Multi-layer LSTM. + + Args: + sample_arc: [num_layers * 2], sequence of tokens representing architecture. + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: `[batch_size, hidden_size]`. + layer_mask: `[batch_size, hidden_size]`. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. + """ + batch_size = params.batch_size + # num_steps = 35 + num_steps = tf.shape(x)[1] + print("num_steps:{}/{}".format(num_steps, num_steps)) + + num_layers = len(sample_arc) // 2 + set_shape = x.get_shape().as_list() + print("set_shape:{}".format(set_shape)) + # 修改点 + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=True) + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + # extract the relevant variables, so that you only do L2-reg on them. 
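+  # sample_arc packs one (prev_idx, func_idx) pair per layer: prev_idx selects which
+  # earlier node feeds the layer and func_idx selects its activation (tanh / ReLU /
+  # sigmoid / identity, see _select_function below). The loop that follows keeps only
+  # the w_skip slice chosen by the sampled pair before building var_s, the variable
+  # list that the L2 weight penalty in _forward is applied to.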
+ u_skip = [] + start_idx = 0 + + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + u_skip.append(w_skip[layer_id][func_idx, prev_idx]) + start_idx += 2 + w_skip = u_skip + var_s = [w_prev] + w_skip[1:] + + def _select_function(h, function_id): + h = tf.stack([tf.tanh(h), tf.nn.relu(h), tf.sigmoid(h), h], axis=0) + h = h[function_id] + return h + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s): + """Body function.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + + # important change: first input uses a tanh() + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul(tf.concat([inp * input_mask, prev_s * layer_mask], + axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("ht:{}".format(ht)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + # print("layer:{}".format(layers)) + + start_idx = 0 + used = [] + for layer_id in range(num_layers): + prev_idx = sample_arc[start_idx] + func_idx = sample_arc[start_idx + 1] + # print("layer_id/[prev_idx, func_idx]:{}/[{}, {}]".format(layer_id, prev_idx, func_idx)) + used.append(tf.one_hot(prev_idx, depth=num_layers, dtype=tf.int32)) + prev_s = tf.stack(layers, axis=0)[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("s before set_shape:{}".format(s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("step:{}\nnext_s:{}".format(step, next_s)) + # all_s = all_s.write(step, next_s) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s = tf.concat([all_s, t], 0) + # print("step:{}-all_s:{}".format(step, all_s)) + # all_s_my[step] = next_s + + return step + 1, next_s, all_s + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # all_s_my.set_shape([_, batch_size, params.hidden_size]) + # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s, var_s + + +def _set_default_params(params): + """Set default hyper-parameters.""" + params.add_hparam('alpha', 0.0) # activation L2 reg + params.add_hparam('beta', 1.) 
# activation slowness reg + params.add_hparam('best_valid_ppl_threshold', 5) + + params.add_hparam('batch_size', FLAGS.child_batch_size) + params.add_hparam('bptt_steps', FLAGS.child_bptt_steps) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.20) # embeddings + params.add_hparam('drop_x', 0.75) # input to RNN cells + params.add_hparam('drop_l', 0.25) # between layers + params.add_hparam('drop_o', 0.75) # output + params.add_hparam('drop_w', 0.00) # weight + + params.add_hparam('grad_bound', 0.1) + params.add_hparam('hidden_size', 200) + params.add_hparam('init_range', 0.04) + params.add_hparam('learning_rate', 20.) + params.add_hparam('num_train_epochs', FLAGS.num_train_epochs) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 8e-7) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, controller, x_train, x_valid, name='child'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.controller = controller + self.sample_arc = tf.unstack(controller.sample_arc) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer(x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam('num_train_steps', self.num_train_batches * params.num_train_epochs) + # self.x_train.set_shape([params.batch_size, self.base_bptt]) + # print("self.x_train:{}".format(self.x_train.get_shape().as_list())) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer(x_valid, params.batch_size, params.bptt_steps) + # with tf.control_dependencies([self.base_bptt]): + self._build_params() + self._build_train() + self._build_valid() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + num_functions = self.params.controller_num_functions + num_layers = self.params.controller_num_layers + hidden_size = self.params.hidden_size + # >>> add code >>> + with tf.variable_scope(self.name, initializer=initializer): + # >>> add code >>> + with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE): + w_emb = tf.get_variable('w', [self.params.vocab_size, hidden_size]) + # >>> add code >>> + # 修改点 + dropped_w_emb = npu_ops.dropout(w_emb, 1-self.params.drop_e) + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + with tf.variable_scope('rnn_cell', reuse=tf.AUTO_REUSE): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + w_skip, dropped_w_skip = [], [] + for layer_id in range(1, num_layers + 1): + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable( + 'w', [num_functions, layer_id, hidden_size, 2 * hidden_size]) + mask = _gen_mask([1, 1, hidden_size, 2 * hidden_size], + self.params.drop_w) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + with tf.variable_scope('init_states', reuse=tf.AUTO_REUSE): + with 
tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + + self.num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) # .value + print('All children have {} params'.format(self.num_params)) + + num_params_per_child = 0 + for v in tf.trainable_variables(): + if v.name.startswith(self.name): + if 'rnn_cell' in v.name: + num_params_per_child += v.shape[-2] * v.shape[-1] + else: + num_params_per_child += np.prod([d for d in v.shape]) + print('Each child has {0} params'.format(num_params_per_child)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + # bug点 + # + print("before [embedding_lookup], x={}".format(x)) + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + sample_arc = self.sample_arc + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + # >>> add code >>> + # 修改点 + # emb = tf.layers.dropout( + # emb, self.params.drop_i, [batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s, var_s = _rnn_fn(sample_arc, emb, prev_s, w_prev, w_skip, + input_mask, layer_mask, params=self.params) + + top_s = all_s + if is_training: + # >>> add code >>> + # 修改点 + + top_s = npu_ops.dropout(top_s, 1-self.params.drop_o) # ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + # top_s = tf.layers.dropout( + # top_s, self.params.drop_o, + # [self.params.batch_size, 1, self.params.hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + top_s_shape = top_s.get_shape().as_list() + # print("top_s_shape:{}".format(top_s_shape)) + # print("w_soft:{}".format(w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # logits = tf.matmul(top_s, tf.transpose(w_soft)) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + # print("loss:{}".format(loss)) + loss = tf.reduce_mean(loss) + # print("_forward/loss:{}".format(loss)) + reg_loss = loss # `loss + regularization_terms` is for training only + if is_training: + # L2 weight reg + self.l2_reg_loss = tf.add_n([tf.nn.l2_loss(w ** 2) for w in var_s]) + reg_loss += self.params.weight_decay * self.l2_reg_loss + + # activation L2 reg + 
reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + # activation slowness reg + reg_loss += self.params.beta * tf.reduce_mean( + (all_s[:, 1:, :] - all_s[:, :-1, :]) ** 2) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss/loss:{}/{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = [v for v in tf.trainable_variables() + if v.name.startswith(self.name)] + # print("reg_loss:{}".format(reg_loss)) + # print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + if self.params.grad_bound: + # grads = tf.gradients(reg_loss, tf_vars) + # clipped_grads, _ = tf.clip_by_global_norm(grads, self.params.grad_bound) + # clipped_grads, grad_norm = tf.clip_by_global_norm(grads, self.params.grad_bound) + # print("clipped_grads:{}".format(clipped_grads)) + + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + # print("optimizer:{}".format(optimizer)) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + # print("train_op:{}".format(train_op)) + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + self.rl_loss = loss + + def eval_valid(self, sess): + + """Eval 1 round on valid set.""" + total_loss = 0 + for _ in range(self.num_valid_batches): + sess.run(self.batch_init_states['reset']) + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + + return valid_ppl diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py new file mode 100644 index 0000000000000000000000000000000000000000..2367ea1f0a34122ed396c069a1a5447e2b641f4a --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/ckpt2pb.py @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- +import tensorflow.compat.v1 as tf + +# from create_tf_record import * +from tensorflow.python.framework import graph_util +from tensorflow.python.tools import freeze_graph + +from npu_bridge.npu_init import * + +def freeze_graph(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "output" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) 
# 恢复图并得到数据 + output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=sess.graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in sess.graph.get_operations(): + # print(op.name, op.values()) + + +def freeze_graph2(input_checkpoint, output_graph): + ''' + :param input_checkpoint: + :param output_graph: PB模型保存路径 + :return: + ''' + # checkpoint = tf.train.get_checkpoint_state(model_folder) #检查目录下ckpt文件状态是否可用 + # input_checkpoint = checkpoint.model_checkpoint_path #得ckpt文件路径 + + # 指定输出的节点名称,该节点名称必须是原模型中存在的节点 + output_node_names = "InceptionV3/Logits/SpatialSqueeze" + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) + graph = tf.get_default_graph() # 获得默认的图 + input_graph_def = graph.as_graph_def() # 返回一个序列化的图代表当前的图 + + with tf.Session() as sess: + saver.restore(sess, input_checkpoint) # 恢复图并得到数据 + output_graph_def = graph_util.convert_variables_to_constants( # 模型持久化,将变量值固定 + sess=sess, + input_graph_def=input_graph_def, # 等于:sess.graph_def + output_node_names=output_node_names.split(",")) # 如果有多个输出节点,以逗号隔开 + + with tf.gfile.GFile(output_graph, "wb") as f: # 保存模型 + f.write(output_graph_def.SerializeToString()) # 序列化输出 + print("%d ops in the final graph." % len(output_graph_def.node)) # 得到当前图有几个操作节点 + + # for op in graph.get_operations(): + # print(op.name, op.values()) + + +if __name__ == '__main__': + # 输入ckpt模型路径 + input_checkpoint = './output/test/20220709_185707/model.ckpt-181200' + # 输出pb模型的路径 + out_pb_path = "models_pb/enas-lm-infer2.pb" + # 调用freeze_graph将ckpt转为pb + freeze_graph(input_checkpoint, out_pb_path) + print("Done pb!") + + # 测试pb模型 + image_path = 'test_image/animal.jpg' + # freeze_graph_test(pb_path=out_pb_path, image_path=image_path) + + diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..cb13d49ab1520a757ef5add25b1ec16cf35b83b1 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/controller.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ENAS controller.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_float('controller_baseline_dec', 0.999, '') +flags.DEFINE_float('controller_entropy_weight', 1e-5, '') +flags.DEFINE_float('controller_temperature', 5., '') +flags.DEFINE_float('controller_tanh_constant', 2.25, '') +flags.DEFINE_float('controller_learning_rate', 5e-5, '') +flags.DEFINE_integer('controller_num_layers', 9, '') + +REWARD_CONSTANT = 80.0 + + +def _build_train_op(loss, tf_vars, learning_rate, train_step, num_aggregate): + """Build training ops from `loss` tensor.""" + optim = tf.train.AdamOptimizer(learning_rate) + optim = tf.train.SyncReplicasOptimizer( + optim, replicas_to_aggregate=num_aggregate, total_num_replicas=1, use_locking=True) + grads = tf.gradients(loss, tf_vars) + train_op = optim.apply_gradients(zip(grads, tf_vars), global_step=train_step) + grad_norm = tf.global_norm(grads) + return train_op, optim, grad_norm + + +def _lstm(x, prev_c, prev_h, w_lstm): + """LSTM subgraph.""" + ifog = tf.matmul(tf.concat([x, prev_h], axis=1), w_lstm) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + next_c = i * g + f * prev_c + next_h = o * tf.tanh(next_c) + return next_c, next_h + + +def _set_default_params(params): + """Add controller's default params.""" + params.add_hparam('controller_hidden_size', 64) + params.add_hparam('controller_num_layers', FLAGS.controller_num_layers) + params.add_hparam('controller_num_functions', 4) # tanh, relu, sigmoid, iden + + params.add_hparam('controller_baseline_dec', FLAGS.controller_baseline_dec) + params.add_hparam('controller_entropy_weight', + FLAGS.controller_entropy_weight) + params.add_hparam('controller_temperature', FLAGS.controller_temperature) + params.add_hparam('controller_tanh_constant', FLAGS.controller_tanh_constant) + params.add_hparam('controller_learning_rate', FLAGS.controller_learning_rate) + params.add_hparam('controller_num_aggregate', 10) + params.add_hparam('controller_num_train_steps', 25) + + return params + + +class Controller(object): + """ENAS controller. 
Samples architectures and creates training ops.""" + + def __init__(self, params, name='controller'): + print('-' * 80) + print('Create a controller') + self.params = _set_default_params(params) + self.name = name + self._build_params() + self._build_sampler() + + def _build_params(self): + """Create TF parameters.""" + initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01) + num_funcs = self.params.controller_num_functions # 4 + hidden_size = self.params.controller_hidden_size # 64 + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('lstm'): + self.w_lstm = tf.get_variable('w', [2 * hidden_size, 4 * hidden_size]) + + with tf.variable_scope('embedding'): + self.g_emb = tf.get_variable('g', [1, hidden_size]) + self.w_emb = tf.get_variable('w', [num_funcs, hidden_size]) + + with tf.variable_scope('attention'): + self.attn_w_1 = tf.get_variable('w_1', [hidden_size, hidden_size]) + self.attn_w_2 = tf.get_variable('w_2', [hidden_size, hidden_size]) + self.attn_v = tf.get_variable('v', [hidden_size, 1]) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables() + if v.name.startswith(self.name)]) + print('Controller has {0} params'.format(num_params)) + + def _build_sampler(self): + """Build the sampler ops and the log_prob ops.""" + hidden_size = self.params.controller_hidden_size + num_layers = self.params.controller_num_layers + + arc_seq = [] + sample_log_probs = [] + sample_entropy = [] + all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)] + all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)] + + # sampler ops + inputs = self.g_emb # ??? + prev_c = tf.zeros([1, hidden_size], dtype=tf.float32) + prev_h = tf.zeros([1, hidden_size], dtype=tf.float32) + + inputs = self.g_emb + for layer_id in range(1, num_layers + 1): + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + all_h.append(next_h) + all_h_w.append(tf.matmul(next_h, self.attn_w_1)) + + query = tf.matmul(next_h, self.attn_w_2) + query = query + tf.concat(all_h_w[:-1], axis=0) + query = tf.tanh(query) + logits = tf.matmul(query, self.attn_v) + logits = tf.reshape(logits, [1, layer_id]) + + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + diff = tf.cast(layer_id - tf.range(0, layer_id), tf.float32) ** 2 + logits -= tf.reshape(diff, [1, layer_id]) / 6.0 + skip_index = tf.random.categorical(logits, 1) + skip_index = tf.cast(skip_index, tf.int32) + skip_index = tf.reshape(skip_index, [1]) + arc_seq.append(skip_index) + + log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=skip_index) + sample_log_probs.append(log_prob) + + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + + inputs = tf.nn.embedding_lookup( + tf.concat(all_h[:-1], axis=0), skip_index) + inputs /= (0.1 + tf.to_float(layer_id - skip_index)) + + next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm) + prev_c, prev_h = next_c, next_h + logits = tf.matmul(next_h, self.w_emb, transpose_b=True) + if self.params.controller_temperature: + logits /= self.params.controller_temperature + if self.params.controller_tanh_constant: + logits = self.params.controller_tanh_constant * tf.tanh(logits) + func = tf.multinomial(logits, 1) + func = tf.to_int32(func) + func = tf.reshape(func, [1]) + arc_seq.append(func) + log_prob = 
tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=func) + sample_log_probs.append(log_prob) + entropy = log_prob * tf.exp(-log_prob) + sample_entropy.append(tf.stop_gradient(entropy)) + inputs = tf.nn.embedding_lookup(self.w_emb, func) + + arc_seq = tf.concat(arc_seq, axis=0) + self.sample_arc = arc_seq + + self.sample_log_probs = tf.concat(sample_log_probs, axis=0) + self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs)) + + sample_entropy = tf.concat(sample_entropy, axis=0) + self.sample_entropy = tf.reduce_sum(sample_entropy) + + self.all_h = all_h + + def build_trainer(self, child_model): + """Build the train ops by connecting Controller with a Child.""" + # actor + self.valid_loss = tf.to_float(child_model.rl_loss) + self.valid_loss = tf.stop_gradient(self.valid_loss) + self.valid_ppl = tf.exp(self.valid_loss) + self.reward = REWARD_CONSTANT / self.valid_ppl + + if self.params.controller_entropy_weight: + self.reward += self.params.controller_entropy_weight * self.sample_entropy + + # or baseline + self.sample_log_probs = tf.reduce_sum(self.sample_log_probs) + self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False) + baseline_update = tf.assign_sub(self.baseline, + ((1 - self.params.controller_baseline_dec) * + (self.baseline - self.reward))) + + with tf.control_dependencies([baseline_update]): + self.reward = tf.identity(self.reward) + self.loss = self.sample_log_probs * (self.reward - self.baseline) + + self.train_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name='train_step') + tf_vars = [var for var in tf.trainable_variables() + if var.name.startswith(self.name)] + + self.train_op, self.optimizer, self.grad_norm = _build_train_op( + loss=self.loss, + tf_vars=tf_vars, + learning_rate=self.params.controller_learning_rate, + train_step=self.train_step, + num_aggregate=self.params.controller_num_aggregate) + + def train(self, sess, reset_op, log_every=10): + """Train the controller for `num_steps`.""" + print('-' * 80) + print('Training controller') + num_steps = (self.params.controller_num_aggregate * + self.params.controller_num_train_steps) + run_ops = [self.sample_arc, + self.sample_entropy, + self.reward, + self.baseline, + self.train_op] + + for step in range(num_steps): + arc, ent, reward, baseline, _ = sess.run(run_ops) + sess.run(reset_op) + if step % log_every == 0: + log_string = 'step={0:<5d}'.format(step) + log_string += ' ent={0:<7.3f}'.format(ent) + log_string += ' ppl={0:<7.2f}'.format(REWARD_CONSTANT / reward) + log_string += ' rw={0:<7.4f}'.format(reward) + log_string += ' bl={0:<7.4f}'.format(baseline) + log_string += ' arc=[{0}]'.format(' '.join([str(v) for v in arc])) + print(log_string) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6d767073c3a3afd3755efb5165522bbb33e70ca9 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/data_utils.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Load picked Penn Treebank data.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +# from npu_bridge.npu_init import * + +import numpy as np +import tensorflow.compat.v1 as tf + + +def input_producer(raw_data, batch_size, num_steps, shuffle=False, + randomize=False, random_len=False): + """Produces graph-based input for Penn Treebank. + + Args: + raw_data: np tensor of size [num_words]. + batch_size: self-explained. + num_steps: number of BPTT steps. + shuffle: whether to shuffle sentences. + randomize: use random segments instead of the continuous corpus. + random_len: random sequence len. + + Returns: + If `random_len` is set, return op that represents whether we have reached + the end of a sequence. + Otherwise, return number of batches in an epoch. + """ + print("raw_data_size:{}".format(np.size(raw_data))) + print("num_steps:{}".format(num_steps)) + batch_len = np.size(raw_data) // batch_size + num_batches_per_epoch = ((np.size(raw_data) // batch_size) - 1) // num_steps + print("num_batches_per_epoch:{}".format(num_batches_per_epoch)) + raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32) + + # data_len = tf.size(raw_data) + + + print("batch_len:{}".format(batch_len)) + data = tf.reshape(raw_data[0: batch_size * batch_len], + [batch_size, batch_len]) + + epoch_size = (batch_len - 1) // num_steps + with tf.device('/cpu:0'): + epoch_size = tf.identity(epoch_size, name='epoch_size') + + if random_len: + start_idx = tf.Variable(0, name='start_idx', dtype=tf.int32,trainable=False) + # start_idx = 0 + base_bptt = tf.cond( + tf.random_uniform(shape=(), minval=0., maxval=1.) < 0.95, + lambda: tf.cast(num_steps, dtype=tf.float32), + lambda: tf.cast(num_steps, dtype=tf.float32) / 2.) 
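+    # Variable-length BPTT (the AWD-LSTM-style trick): base_bptt is the full num_steps
+    # with probability 0.95 and num_steps / 2 otherwise; the segment length drawn from
+    # the truncated normal below is then capped at num_steps + 20 and at the remaining
+    # length of the batch, so a slice never runs past the end of the data.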
+ # base_bptt = int(tf.cond( + # tf.greater_equal(0.95, np.random.uniform(100)/100), + # lambda:num_steps / 1., + # lambda:num_steps / 2.).item()) + # base_bptt = 35 + seq_len = tf.random.truncated_normal(shape=(), mean=base_bptt, stddev=5., + dtype=tf.float32) + # seq_len = int(np.random.normal(num_steps, 5)) + # seq_len = 35 + seq_len = tf.cast(seq_len, dtype=tf.int32) + seq_len = tf.minimum(seq_len, num_steps + 20) # seq_len <= bptt + 40 + seq_len = tf.minimum(seq_len, batch_len - start_idx - 1) + + # seq_len = tf.cond(tf.greater_equal(seq_len, num_steps + 20), lambda: num_steps + 20, lambda: seq_len).item() + # seq_len = tf.cond(tf.greater_equal(seq_len, int(batch_len - start_idx - 1)), lambda: int(batch_len - start_idx - 1), lambda: seq_len).item() + # seq_len = min(seq_len, num_steps + 20, batch_len - start_idx - 1) + print("seq_len:{}, type:{}".format(seq_len, type(seq_len))) + + end_idx = start_idx + seq_len + + x = data[:, start_idx: end_idx] + # x = tf.reshape(x, [batch_size, seq_len]) + # print("xshape:{}".format(x.get_shape().as_list())) + y = data[:, start_idx + 1: end_idx + 1] + # y = tf.reshape(y, [batch_size, seq_len]) + # print("yshape:{}".format(y.get_shape().as_list())) + + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx, end_idx)]): + should_reset = tf.greater_equal(end_idx, batch_len - 3) + reset_start_idx = tf.assign(start_idx, 0) + # reset_start_idx = tf.assign(tf.Variable(start_idx, name='reset_start_idx', dtype=tf.int32, trainable=False), 0) + return (x, y, num_batches_per_epoch, reset_start_idx, should_reset, + base_bptt, seq_len / batch_len) + + if randomize: + i = tf.random_uniform([1], minval=0, maxval=batch_len - num_steps,dtype=tf.int32)[0] + x = tf.strided_slice(data, [0, i], [batch_size, i + num_steps]) + y = tf.strided_slice(data, [0, i + 1], [batch_size, i + num_steps + 1]) + else: + # """ + # 修改点 + start_idx_eval = tf.Variable(0, name='start_idx', dtype=tf.int32, + trainable=False) + seq_len = num_steps + seq_len = tf.cast(seq_len, dtype=tf.int32) + end_idx = start_idx_eval + seq_len + x = data[:, start_idx_eval: end_idx] + y = data[:, start_idx_eval + 1: end_idx + 1] + with tf.control_dependencies([x, y]): + with tf.control_dependencies([tf.assign(start_idx_eval, end_idx)]): + should_reset_eval = tf.greater_equal(end_idx, batch_len - num_steps - 3) + reset_start_idx_eval = tf.assign(start_idx_eval, 0) + x.set_shape([batch_size, num_steps]) + y.set_shape([batch_size, num_steps]) + + return x, y, num_batches_per_epoch, reset_start_idx_eval, should_reset_eval diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py new file mode 100644 index 0000000000000000000000000000000000000000..51ed715f5b75c6701b78e4ec77d7e6a3bc393707 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Entry point for AWD ENAS with a fixed architecture.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from tensorflow.python.tools import freeze_graph + +import os +import pickle +import sys + +# TODO:change path +# sys.path.append("/home/test_user06/AscendZhongzhi_NJU/") +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +import fixed_lib +import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +## Required parameters +subfolder = str(time.strftime('%Y%m%d_%H%M%S')) +flags.DEFINE_string('output_dir', "./output/infer0/" + subfolder, '') +flags.DEFINE_string('data_path', './ptb/ptb.pkl', '') +flags.DEFINE_string("ckp_path", '', "checkpoint path") + +## Other parametersresult +flags.DEFINE_boolean('reload_model', True, '') +flags.DEFINE_boolean('reset_output_dir', True, '') +flags.DEFINE_boolean('is_training', False, '') +flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts Platform has some extra data copy operations") + +flags.DEFINE_integer('log_every', 100, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + lm = fixed_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.do_infer, + 'bptt_rate': lm.bptt_rate, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + 'end_moving_avg': lm.end_moving_avg_op, + 'reset_avg': lm.restart_avg, + 'set_lr_decay': lm.set_lr_decay, + 'reset_start_idx_eval': lm.reset_start_idx_eval, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def load_ckpt_model(sess, save_path): + print("reload model from:{}".format(save_path)) + checkpoint = tf.train.get_checkpoint_state(save_path) # 从checkpoint文件中读取checkpoint对象 + input_checkpoint = checkpoint.model_checkpoint_path + saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True) # 加载模型结构 + saver.restore(sess, input_checkpoint) # 使用最新模型 + sess.run(tf.global_variables_initializer())# 初始化所有变量 + + +def train(params, is_training=True): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + tf.random.set_random_seed(2126) + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=2) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, 
save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + + # >>> add code >> + # 创建session + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) + custom_op = config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["use_off_line"].b = True # 在昇腾AI处理器执行训练 + custom_op.parameter_map["mix_compile_mode"].b = False # 关闭混合计算,根据实际情况配置,默认关闭 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") # 设置混合精度 + custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") + # # custom_op.parameter_map["enable_data_pre_proc"].b = True # getnext算子下沉是迭代循环下沉的必要条件 + # # custom_op.parameter_map[ + # # "iterations_per_loop"].i = 10 # 此处设置的值和set_iteration_per_loop设置的iterations_per_loop值保持一致,用于判断是否进行训练迭代下沉 + # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump/") + # custom_op.parameter_map["enable_dump_debug"].b = True + # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all") + config.graph_options.rewrite_options.remapping = RewriterConfig.OFF # 必须显式关闭 + config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF # 必须显式关闭 + # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir) + # >>> add code >> + + + # config = tf.ConfigProto() + # config.gpu_options.allow_growth = True + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + # reload model + if params.ckp_path is not "" and FLAGS.reload_model: + last_checkpoint = tf.train.latest_checkpoint(params.ckp_path) + print('rolling back to previous checkpoint {0}'.format(last_checkpoint)) + saver.restore(sess, last_checkpoint) + + accum_loss = 0. + accum_step = 0 + epoch = sess.run(ops['global_step']) // params.num_train_batches + best_valid_ppl = [] + accum_rate = 0. + start_time = time.time() + last_min = (time.time() - start_time) / 60 + cleaned = True + print('Starting moving_avg') + sess.run(ops['start_moving_avg']) + avg_flag = "no_null" + while True and is_training: + try: + loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) + # bptt_rate = sess.run(ops['bptt_rate']) + # accum_rate += bptt_rate + + accum_loss += loss + accum_step += 1 + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + # epoch = step // params.num_train_batches + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. 
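+          # Minutes elapsed since the previous log line; dividing by log_every below
+          # gives the average minutes per training step reported as min/step.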
+          mins_since_log = mins_so_far - last_min
+          last_min = mins_so_far
+          log_string = 'epoch={0:<5d}'.format(epoch)
+          log_string += ' step={0}/{1:<6d}'.format(step, params.num_train_steps)
+          log_string += ' ppl={0:<10.2f}'.format(train_ppl)
+          log_string += ' lr={0:<6.3f}'.format(lr)
+          log_string += ' |g|={0:<6.3f}'.format(gn)
+          log_string += ' avg={0:<2d}'.format(moving_avg_started)
+          log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, mins_since_log / params.log_every)
+          # log_string += ' accum_rate(rate of a epoch)={0:<4.6f}'.format(accum_rate)
+          # log_string += ' should_reset:{}'.format(should_reset)
+          print(log_string)
+
+        if moving_avg_started:
+          if avg_flag == "":
+            sess.run(ops['end_moving_avg'])
+            sess.run(ops['reset_avg'])
+            avg_flag = "restart_avg"
+          else:
+            sess.run(ops['update_moving_avg'])
+          # ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+
+        if step <= (300 * params.num_train_batches):
+          if step % (10 * params.num_train_batches) == 0:
+            print('Start learning decay ...')
+            sess.run(ops['set_lr_decay'])
+        if moving_avg_started and (step + 5) % (10 * params.num_train_batches) == 0 and len(best_valid_ppl) > params.best_valid_ppl_threshold and valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold]):
+          print('Start learning decay ...')
+          sess.run(ops['set_lr_decay'])
+        if should_reset:
+          accum_rate = 0.
+          print("should_reset:{}".format(should_reset))
+          sess.run(ops['reset_batch_states'])
+          epoch += 1
+          accum_loss = 0
+          accum_step = 0
+          valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started)
+          # reset the validation-set start index
+          sess.run(ops['reset_start_idx_eval'])
+          # reset the training-set batch states and start index
+          sess.run([ops['reset_batch_states'], ops['reset_start_idx']])
+          # note: once the current valid ppl is no longer among the best
+          # `best_valid_ppl_threshold` values, switch to the moving-average weights.
+          if (not moving_avg_started and
+              len(best_valid_ppl) > params.best_valid_ppl_threshold and
+              valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])
+             ):
+            print('Starting moving_avg')
+            sess.run(ops['start_moving_avg'])
+            # print('Start learning decay ...')
+            # sess.run(ops['set_lr_decay'])
+
+          if valid_ppl > 15.:
+            best_valid_ppl.append(valid_ppl)
+            if not cleaned:
+              best_valid_ppl = [p for p in best_valid_ppl if p < 40.]
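+              # one-time cleanup: drop the very large perplexities recorded early in training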
+              cleaned = True
+          # ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+          if step % (1 * params.num_train_batches) == 0:
+            test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+            print("test_ppl:{}".format(test_ppl))
+            sess.run(ops['reset_start_idx_eval'])
+        if step >= params.num_train_steps:
+          # inference
+          test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+          print("final_test_ppl:{}".format(test_ppl))
+          break
+      except tf.errors.InvalidArgumentError:
+        last_checkpoint = tf.train.latest_checkpoint(params.output_dir)
+        print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+        saver.restore(sess, last_checkpoint)
+        accum_loss, accum_step = 0., 0
+    if not is_training:
+      moving_avg_started = sess.run(ops['moving_avg_started'])
+      test_ppl = ops['eval_test'](sess, use_moving_avg=moving_avg_started)
+      sess.close()
+      # infer_loss = ops['inference']()
+      with tf.Session() as sess:
+        print("test_ppl:{}".format(test_ppl))
+        # Save the graph: model3.pb is written under ./models_pb and is then
+        # passed as input_graph to the freeze_graph call below.
+        tf.train.write_graph(sess.graph_def, './models_pb', 'model3.pb')  # generate the graph definition with write_graph
+        freeze_graph.freeze_graph(
+            input_graph='./models_pb/model3.pb',  # graph file produced by write_graph above
+            input_saver='',
+            input_binary=False,
+            input_checkpoint=params.ckp_path + 'model.ckpt-906',  # checkpoint produced by training
+            output_node_names='output',  # must match the output node of the inference graph
+            restore_op_name='save/restore_all',
+            filename_tensor_name='save/Const:0',
+            output_graph='./models_pb/enas_lm3.pb',  # name of the frozen inference graph to generate
+            clear_devices=False,
+            initializer_nodes='')
+        print("done pb!")
+    else:
+      sess.close()
+    """
+    if not is_training:
+        return infer_loss
+    else:
+        return -1
+    """
+
+def main(unused_args):
+  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.logging.info("**********")
+  print("===>>>data_path:{}".format(FLAGS.data_path))
+  print("===>>>output_dir:{}".format(FLAGS.output_dir))
+  print("===>>>ckp_path:{}".format(FLAGS.ckp_path))
+
+  print('-' * 80)
+  output_dir = FLAGS.output_dir
+
+  print('-' * 80)
+  if not gfile.IsDirectory(output_dir):
+    print('Path {} does not exist. Creating'.format(output_dir))
+    gfile.MakeDirs(output_dir)
+  elif FLAGS.reset_output_dir:
+    print('Path {} exists. Resetting'.format(output_dir))
+    gfile.DeleteRecursively(output_dir)
+    gfile.MakeDirs(output_dir)
+
+  print('-' * 80)
+  log_file = os.path.join(output_dir, 'stdout')
+  print('Logging to {}'.format(log_file))
+  sys.stdout = utils.Logger(log_file)
+
+  params = contrib_training.HParams(
+      data_path=FLAGS.data_path,
+      log_every=FLAGS.log_every,
+      output_dir=FLAGS.output_dir,
+      ckp_path=FLAGS.ckp_path,
+  )
+
+  train(params, is_training=FLAGS.is_training)
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..49659f7066f259780283e01cab0af42b853899f2
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/fixed_lib.py
@@ -0,0 +1,652 @@
+# coding=utf-8
+# Copyright 2021 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD ENAS fixed model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +import data_utils +import utils + +flags = tf.app.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string('fixed_arc', '0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0', '') +flags.DEFINE_float('child_alpha', 0.7, 'activation L2 reg') +flags.DEFINE_float('child_drop_e', 0.125, 'drop rate words') +flags.DEFINE_float('child_drop_i', 0.175, 'drop rate embeddings') +flags.DEFINE_float('child_drop_l', 0.225, 'drop rate between layers') +flags.DEFINE_float('child_drop_o', 0.75, 'drop rate output') +flags.DEFINE_float('child_drop_w', 0.00, 'drop rate weight') +flags.DEFINE_float('child_drop_x', 0.725, 'drop rate at input of RNN cells') +flags.DEFINE_float('child_init_range', 0.05, '') +flags.DEFINE_float('child_grad_bound', 0.25, '') +flags.DEFINE_float('child_weight_decay', 2e-6, '') +flags.DEFINE_integer('child_num_train_epochs', 2, '') +flags.DEFINE_integer('child_hidden_size', 800, '') + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. - drop_prob + mask = tf.random_uniform(shape, minval=0., maxval=1., dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _rnn_fn(x, prev_s, w_prev, w_skip, input_mask, layer_mask, params): + """Multi-layer LSTM. + + Args: + x: [batch_size, num_steps, hidden_size]. + prev_s: [batch_size, hidden_size]. + w_prev: [2 * hidden_size, 2 * hidden_size]. + w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)]. + input_mask: [batch_size, hidden_size]. + layer_mask: [batch_size, hidden_size]. + params: hyper-params object. + + Returns: + next_s: [batch_size, hidden_size]. + all_s: [[batch_size, num_steps, hidden_size] * num_layers]. 
+ """ + batch_size = x.get_shape()[0].value + print("batch_size:{}".format(batch_size)) + # batch_size = params.batch_size + num_steps = tf.shape(x)[1] + fixed_arc = params.fixed_arc + num_layers = len(fixed_arc) // 2 + set_shape = x.get_shape().as_list() + print("x.set_shape:{}".format(set_shape)) + + # all_s = tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + # all_s_my = [] + all_s_my = tf.zeros([1, batch_size, params.hidden_size], dtype=tf.float32) + + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, prev_s, all_s_my): + """Body fn for `tf.while_loop`.""" + inp = x[:, step, :] + # print("inp:{}".format(inp)) + if layer_mask is not None: + assert input_mask is not None + ht = tf.matmul( + tf.concat([inp * input_mask, prev_s * layer_mask], axis=1), w_prev) + else: + ht = tf.matmul(tf.concat([inp, prev_s], axis=1), w_prev) + # print("w_prev:{}".format(w_prev)) + h, t = tf.split(ht, 2, axis=1) + h = tf.tanh(h) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + layers = [s] + + def _select_function(h, function_id): + if function_id == 0: + return tf.tanh(h) + elif function_id == 1: + return tf.nn.relu(h) + elif function_id == 2: + return tf.sigmoid(h) + elif function_id == 3: + return h + raise ValueError('Unknown func_idx {0}'.format(function_id)) + + start_idx = 0 + for layer_id in range(num_layers): + prev_idx = fixed_arc[start_idx] + func_idx = fixed_arc[start_idx + 1] + prev_s = layers[prev_idx] + if layer_mask is not None: + ht = tf.matmul(prev_s * layer_mask, w_skip[layer_id]) + else: + ht = tf.matmul(prev_s, w_skip[layer_id]) + h, t = tf.split(ht, 2, axis=1) + + h = _select_function(h, func_idx) + t = tf.sigmoid(t) + s = prev_s + t * (h - prev_s) + # print("layers_id:{}\ns before set_shape:{}".format(layer_id, s)) + s.set_shape([batch_size, params.hidden_size]) + # print("s after set_shape:{}".format(s)) + layers.append(s) + start_idx += 2 + # print("layers:{}\ns:{}".format(layers, s)) + next_s = tf.add_n(layers[1:]) / tf.cast(num_layers, dtype=tf.float32) + # print("next_s:{}".format(next_s)) + t = tf.stack([next_s]) + # print("t:{}".format(t)) + all_s_my = tf.concat([all_s_my, t], 0) + # print("all_s_my:{}".format(all_s_my)) + # all_s.append(next_s) + return step + 1, next_s, all_s_my + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_s, all_s_my] + _, next_s, all_s_my = tf.while_loop(_condition, _body, loop_inps, shape_invariants=[loop_inps[0].get_shape(), loop_inps[1].get_shape(), tf.TensorShape([None, batch_size, params.hidden_size])]) + # >>> add code >>> + # all_s_my = tf.reshape(all_s_my, [set_shape[1]+1, set_shape[0], params.hidden_size]) + # print("all_s_my(list):{}".format(all_s_my)) + # tmp = all_s_my[1:, :, :] + # # tmp = tf.reshape(tmp, [set_shape[1], set_shape[0], params.hidden_size]) + # print("stack_all_s:{}".format(tmp)) + # all_s = tf.transpose(tmp, perm=[1, 0, 2]) + # # all_s.set_shape([set_shape[0], set_shape[1], params.hidden_size]) + # all_s = tf.reshape(all_s, [set_shape[0], set_shape[1], params.hidden_size]) + # print("all_s:{}".format(all_s)) + all_s_my = tf.strided_slice(all_s_my, [1, 0, 0], [num_steps + 1, batch_size, params.hidden_size]) + # print("stack_all_s:{}".format(all_s_my)) + + all_s = tf.transpose(all_s_my, perm=[1, 0, 2]) + # print("all_s:{}".format(all_s)) + + return next_s, all_s + + +def _set_default_params(params): + """Set default values for the hparams.""" + params.add_hparam('alpha', FLAGS.child_alpha) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 
10) + + params.add_hparam('batch_size', 64) + params.add_hparam('bptt_steps', 32) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', FLAGS.child_drop_e) # word + params.add_hparam('drop_i', FLAGS.child_drop_i) # embeddings + params.add_hparam('drop_l', FLAGS.child_drop_l) # between RNN nodes + params.add_hparam('drop_o', FLAGS.child_drop_o) # output + params.add_hparam('drop_w', FLAGS.child_drop_w) # weight + params.add_hparam('drop_x', FLAGS.child_drop_x) # input to RNN layers + + assert FLAGS.fixed_arc is not None + print(FLAGS.fixed_arc) + L_arc = FLAGS.fixed_arc.split(' ') + print("L_arc:{}".format(L_arc)) + params.add_hparam('fixed_arc', [int(d) for d in L_arc]) + + params.add_hparam('grad_bound', FLAGS.child_grad_bound) + params.add_hparam('hidden_size', FLAGS.child_hidden_size) + params.add_hparam('init_range', FLAGS.child_init_range) + params.add_hparam('learning_rate', 40.) + params.add_hparam('num_train_epochs', FLAGS.child_num_train_epochs) + params.add_hparam('num_warmup_epochs', 0.0) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', FLAGS.child_weight_decay) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, + self.base_bptt, self.bptt_rate) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches, self.reset_start_idx_eval, self.should_reset_eval) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('num_warmup_steps', + params.num_warmup_epochs * self.num_train_batches) + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + self._build_infer() + self._build_avg_infer() + + def _build_params(self): + """Create model parameters.""" + + print('-' * 80) + print('Building model params') + initializer = tf.initializers.random_uniform(minval=-self.params.init_range, + maxval=self.params.init_range) + with tf.variable_scope(self.name, initializer=initializer): + with tf.variable_scope('embedding'): + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.hidden_size], + initializer=initializer) + # >>> add code >>> + dropped_w_emb = npu_ops.dropout(w_emb, 1 - self.params.drop_e) + # >>> add code >>> + # dropped_w_emb = tf.layers.dropout( + # w_emb, self.params.drop_e, [self.params.vocab_size, 1], + # training=True) + + hidden_size = self.params.hidden_size + fixed_arc = self.params.fixed_arc + num_layers = len(fixed_arc) // 2 + with tf.variable_scope('rnn_cell'): + w_prev = tf.get_variable('w_prev', [2 * hidden_size, 2 * hidden_size]) + i_mask = tf.ones([hidden_size, 2 * hidden_size], dtype=tf.float32) + h_mask = _gen_mask([hidden_size, 2 * hidden_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w_prev = w_prev * mask + + w_skip, dropped_w_skip = [], [] + for layer_id in range(num_layers): + mask = _gen_mask([hidden_size, 2 * 
hidden_size], self.params.drop_w) + with tf.variable_scope('layer_{}'.format(layer_id)): + w = tf.get_variable('w', [hidden_size, 2 * hidden_size]) + dropped_w = w * mask + w_skip.append(w) + dropped_w_skip.append(dropped_w) + + with tf.variable_scope('init_states'): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hidden_size] + batch_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset = tf.assign(batch_prev_s, zeros) + with tf.variable_scope('test'): + init_shape = [1, hidden_size] + test_prev_s = tf.get_variable( + 's', init_shape, dtype=tf.float32, trainable=False) + zeros = tf.zeros(init_shape, dtype=tf.float32) + test_reset = tf.assign(test_prev_s, zeros) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 's': batch_prev_s, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_prev': dropped_w_prev, + 'w_skip': dropped_w_skip, + 'w_soft': w_emb, + } + self.test_init_states = { + 's': test_prev_s, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_prev': w_prev, + 'w_skip': w_skip, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_prev = model_params['w_prev'] + w_skip = model_params['w_skip'] + w_soft = model_params['w_soft'] + prev_s = init_states['s'] + + emb = tf.nn.embedding_lookup(w_emb, x) + batch_size = self.params.batch_size + hidden_size = self.params.hidden_size + if is_training: + # >>> add code >>> + emb = npu_ops.dropout(emb, 1-self.params.drop_i) # , [batch_size, 1, hidden_size]) # , training=True) + + # >>> add code >>> + # emb = tf.layers.dropout( + # emb, self.params.drop_i, + # [self.params.batch_size, 1, hidden_size], training=True) + + input_mask = _gen_mask([batch_size, hidden_size], self.params.drop_x) + layer_mask = _gen_mask([batch_size, hidden_size], self.params.drop_l) + else: + input_mask = None + layer_mask = None + + out_s, all_s = _rnn_fn(emb, prev_s, w_prev, w_skip, input_mask, layer_mask, + self.params) + top_s = all_s + if is_training: + # >>> add code >>> + top_s = npu_ops.dropout(top_s, + 1 - self.params.drop_o)# ,[self.params.batch_size, 1, self.params.hidden_size]) # , training=True) + # >>> add code >>> + + # top_s = tf.layers.dropout(top_s, self.params.drop_o, + # [batch_size, 1, hidden_size], training=True) + + carry_on = [tf.assign(prev_s, out_s)] + # print("top_s:{}\nw_soft:{}".format(top_s, w_soft)) + logits = tf.einsum('bnh,vh->bnv', top_s, w_soft) + # print("logits:{}".format(logits)) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) + + reg_loss = loss # loss + regularization_terms, for training only + # print("_forward/loss:{}".format(loss)) + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.reduce_mean(all_s ** 2) + + with 
tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + # print("reg_loss:{}\nloss:{}".format(reg_loss, loss)) + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + # print("reg_loss:{}".format(reg_loss)) + print("tf_vars:{}".format(tf_vars)) + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + with tf.variable_scope('HParam'): + lr_decay = tf.get_variable('learning_rate_decay', [], initializer=tf.constant_initializer(1.), dtype=tf.float32, trainable=False) + self.set_lr_decay = tf.assign_sub(lr_decay, 0.02*lr_decay) + learning_rate = utils.get_lr(global_step, self.params, lr_decay) * lr_scale + grads = tf.gradients(reg_loss, tf_vars) + # print("grads:{}".format(grads)) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + # def _EMA(self): + # """Build moving average ops.""" + # print('Creating moving average ops') + # + # with tf.variable_scope('moving_avg_flag'): + # self.moving_avg_started = tf.get_variable( + # 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + # trainable=False) + # self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + # self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + # all_vars = tf.trainable_variables() + # + # ema = tf.train.ExponentialMovingAverage(0.99) + # + # average_op = ema.apply(all_vars) + # back_up_v = tf.identity(all_vars) + # use_average_op = tf.assign(all_vars, ema.average(all_vars)) + # ema.average_name() + # reverse_average_op = tf.assign(all_vars, back_up_v) + + + + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + self.end_moving_avg_op = tf.assign(self.moving_avg_started, 0) + + all_vars = tf.trainable_variables() + print('all_vars:{}'.format(all_vars)) + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + # 原作者手动实现的Moving Average ::当eval_valid_ppl退化到一定阈值(退步10名)后启动 + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], initializer=tf.constant_initializer(0.), dtype=tf.float32, trainable=False) + tmp1 = [] + 
tmp2 = [] + tmp3 = [] + self.restart_avg = tf.assign(avg_step, 0.) + with tf.control_dependencies([tf.assign_add(avg_step, 1.)]): + average_op = [] + for v, avg_v in average_pairs: + # v_curr = tf.Variable(tf.cast(tf.identity(v), tf.float32), dtype=tf.float32, trainable=False) + # avg_v_curr = tf.Variable(tf.cast(tf.identity(avg_v), tf.float32), dtype=tf.float32, trainable=False) + # mu = 1. / avg_step + mu = tf.cond(tf.cast(0.999 < (1. + avg_step) / (10. + avg_step), tf.bool), + lambda: tf.cast(tf.constant(0.99), dtype=tf.float32), + lambda: tf.cast((1. + avg_step) / (10. + avg_step), dtype=tf.float32)) + + new_avg = mu * tf.cast(avg_v, tf.float32) + (1. - mu) * tf.cast(v, tf.float32) + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, tf.cast(new_avg, avg_v.dtype))) + # 追踪变量 + tmp1.append(v) + tmp2.append(new_avg) + tmp3.append([avg_step, mu, tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(new_avg ** 2)]) + + self.p1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp1]) + self.p2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tmp2]) + self.p3 = tmp3 + # # 使用官方API + # with tf.variable_scope('avg_step'): + # avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + # + # ema = tf.train.ExponentialMovingAverage(0.99, avg_step) + # with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + # average_op = [] + # for v, avg_v in average_pairs: + # v = tf.Variable(tf.cast(v, tf.float32), dtype=tf.float32, trainable=False) + # avg_v = tf.Variable(tf.cast(avg_v, tf.float32), dtype=tf.float32, trainable=False) + # print('v:{}'.format(v)) + # ema.apply([v]) + # new_avg = ema.average(v) + # print('new_avg:{}'.format(new_avg)) + # with tf.control_dependencies([new_avg]): + # print('avg_v:'.format(avg_v)) + # average_op.append(tf.assign(avg_v, new_avg)) + # # average_op = tf.group(*average_op) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + + new_tmp1 = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + new_tmp1.append([tf.reduce_sum(v ** 2), tf.reduce_sum(avg_v ** 2), tf.reduce_sum(backup_v ** 2)]) + use_average_op.append(tf.assign(v, avg_v)) + self.p4 = new_tmp1 + + use_average_op = tf.group(*use_average_op) + # with tf.control_dependencies([use_average_op]): + self.p3_1 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + reverse_average_op = [] + new_tmp2 = [] + for v, backup_v in backup_pairs: + # with tf.control_dependencies([use_average_op]): + new_tmp2.append([tf.reduce_sum(v ** 2), tf.reduce_sum(backup_v ** 2)]) + reverse_average_op.append(tf.assign(v, backup_v)) + self.p5 = new_tmp2 + reverse_average_op = tf.group(*reverse_average_op) + # with tf.control_dependencies([reverse_average_op]): + self.p3_2 = tf.add_n([tf.reduce_sum(tf.cast(w, tf.float32) ** 2) for w in tf.trainable_variables()]) + + return average_op, use_average_op, reverse_average_op + + def _eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + print('v:{}'.format(tf.trainable_variables())) + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + print('v_avg:{}'.format(tf.trainable_variables())) + for step in range(int(self.num_test_batches)): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = 
np.exp(total_loss / (step + 1)) + log_string = 'step={0:<6d}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if sess.run(self.should_reset_eval): + break + # test_ppl = np.exp(total_loss / self.num_test_batches) + + # log_string = 'step={0:<6d}'.format(self.num_test_batches) + # log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + # print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + # test_ppl = tf.math.exp(total_loss/ self.num_test_batches, name='output') + # print("test_ppl:{}".format(test_ppl)) + # loss_assign_op = tf.assign(self.tt_loss, tf.Variable(total_loss, name='total_loss', dtype=tf.float32,trainable=False)) + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def _build_infer(self): + print("Building infer graph") + tt_loss = tf.Variable(0, name="total_loss", dtype=tf.float32, trainable=False) + def _condition(step, *unused_args): + return tf.less(step, self.num_test_batches-3) + def _body(step, tt_loss): + with tf.control_dependencies([self.test_loss]): + tt_loss += self.test_loss + return step+1, tt_loss + loop_inps = [tf.constant(0, dtype=tf.int32), tt_loss] + _, tt_loss = tf.while_loop(_condition, _body, loop_inps) + test_ppl = tf.math.exp(tt_loss/ self.num_test_batches, name='test_ppl') + print("test_ppl:{}".format(test_ppl)) + self.infer_ppl = test_ppl + + def _build_avg_infer(self): + print("Build avg_infer graph") + def _fp(): + with tf.control_dependencies([self.use_moving_avg_vars, self.test_init_states['reset']]): + avg_infer_ppl = self.infer_ppl + with tf.control_dependencies([avg_infer_ppl, self.restore_normal_vars]): + return avg_infer_ppl + def _fn(): + return self.infer_ppl + + with tf.control_dependencies([self.moving_avg_started]): + avg_infer_ppl = tf.cond(tf.greater_equal(self.moving_avg_started, 1), _fp, _fn) + self.avg_infer_ppl = tf.identity(avg_infer_ppl, name="output") + print("self.avg_infer_ppl:{}".format(self.avg_infer_ppl)) + + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + + if use_moving_avg: + # print('sum_v:{}'.format(sess.run(self.p1))) + # print('new_sum_v:{}'.format(sess.run(self.p2))) + # print('[[step, mu, v, v_avg, new_v_avg]]={}'.format(sess.run(self.p3))) + # self.use_moving_avg_vars ===>影子权重暂时替代当前权重 + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + # print('v_avg:{}\n[[v, avg_v, backup_v]]={}'.format(sess.run(self.p3_1), sess.run(self.p4))) + + valid_loss = [] + for _ in range(self.num_valid_batches): + loss = sess.run(self.valid_loss) + total_loss += loss + valid_loss.append(loss) + if sess.run(self.should_reset_eval): + break + print("valid_loss={}, self.num_valid_batches={}".format(valid_loss, self.num_valid_batches)) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + # print('v:{}\n[[v, backup_v]]={} \n============================================================'.format( + # sess.run(self.p3_2), sess.run(self.p5))) + + return valid_ppl + + def do_infer(self, sess, use_moving_avg=False): + # self._eval_test(sess, use_moving_avg) + return 
sess.run(self.avg_infer_ppl) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py new file mode 100644 index 0000000000000000000000000000000000000000..5985dd014aeb4f2bef3aec64ed82326cf36e180b --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/help_modelarts.py @@ -0,0 +1,93 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import datetime +# import moxing as mox +import tensorflow.compat.v1 as tf +gfile = tf.gfile + +def obs_data2modelarts(config): + """ + Copy train data from obs to modelarts by using moxing api. + """ + start = datetime.datetime.now() + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.data_url, config.modelarts_data_dir)) + mox.file.copy_parallel(src_url=config.data_url, dst_url=config.modelarts_data_dir) + print("===>>>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbCopy files from obs:{} to modelarts dir:{}".format(config.ckp_path, config.modelarts_result_dir)) + output_dir = config.modelarts_result_dir + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + mox.file.copy_parallel(src_url=config.ckp_path, dst_url=config.modelarts_result_dir) + end = datetime.datetime.now() + files = os.listdir(config.modelarts_data_dir) + print("===>>>Files:", files) + files2 = os.listdir(config.modelarts_result_dir) + print("===>>>Files2:", files2) + + +def modelarts_result2obs(FLAGS): + """ + Copy debug data from modelarts to obs. + According to the swich flags, the debug data may contains auto tune repository, + dump data for precision comparision, even the computation graph and profiling data. + """ + work_dir = os.getcwd() + print("start op: modelarts_result2obs..........") + + ## copy result from modelarts to obs + obs_result_dir = os.path.join(FLAGS.obs_dir, 'result') + if not mox.file.exists(obs_result_dir): + mox.file.make_dirs(obs_result_dir) + else: + mox.file.remove(obs_result_dir, recursive=True) + mox.file.make_dirs(obs_result_dir) + mox.file.copy_parallel(src_url=FLAGS.output_dir, dst_url=obs_result_dir) + print("===>>>Copy Event or Checkpoint from modelarts dir:{} to obs:{}".format(FLAGS.output_dir, obs_result_dir)) + + ## Copy auto tune repository. Comment this snippets if npu_auto_tune is off. + # if FLAGS.npu_auto_tune: + # modelarts_auto_tune_dir = os.path.join(work_dir, "npu_auto_tune") + # obs_auto_tune_dir = os.path.join(FLAGS.obs_dir, 'npu_auto_tune') + # if not mox.file.exists(obs_auto_tune_dir): + # mox.file.make_dirs(obs_auto_tune_dir) + # mox.file.copy_parallel(modelarts_auto_tune_dir, obs_auto_tune_dir) + # print("===>>>Auto tune:{} on OBS dir:{}".format(mox.file.list_directory(obs_auto_tune_dir), obs_auto_tune_dir)) + # + # ## Copy dump data. Comment this snippets if npu_dump_data is off. 
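+    # (These commented-out snippets rely on the `moxing` (mox) module and are only
+    #  meant for ModelArts runs; the corresponding import at the top of this file is
+    #  commented out as well.)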
+ # if FLAGS.npu_dump_data: + # modelarts_dump_data_dir = os.path.join(work_dir, "npu_dump_data") + # obs_dump_data_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_data') + # if not mox.file.exists(obs_dump_data_dir): + # mox.file.make_dirs(obs_dump_data_dir) + # mox.file.copy_parallel(modelarts_dump_data_dir, obs_dump_data_dir) + # print("===>>>Dumped graph:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_data_dir), obs_dump_data_dir)) + # + # ## Copy compute graph. Comment this snippets if npu_dump_graph is off. + # if FLAGS.npu_dump_graph: + # modelarts_dump_graph_dir = os.path.join(work_dir, "npu_dump_graph") + # obs_dump_graph_dir = os.path.join(FLAGS.obs_dir, 'npu_dump_graph') + # if not mox.file.exists(obs_dump_graph_dir): + # mox.file.make_dirs(obs_dump_graph_dir) + # mox.file.copy_parallel(modelarts_dump_graph_dir, obs_dump_graph_dir) + # print("===>>>Dumped data:{} on OBS dir:{}".format(mox.file.list_directory(obs_dump_graph_dir), obs_dump_graph_dir)) + # + # ## Copy profiling data. Comment this snippets if npu_profiling is off. + # if FLAGS.npu_profiling: + # modelarts_profiling_dir = os.path.join(work_dir, "npu_profiling") + # obs_profiling_dir = os.path.join(FLAGS.obs_dir, 'npu_profiling') + # if not mox.file.exists(obs_profiling_dir): + # mox.file.make_dirs(obs_profiling_dir) + # mox.file.copy_parallel(modelarts_profiling_dir, obs_profiling_dir) + # print("===>>>Profiling data:{} on OBS dir:{}".format(mox.file.list_directory(obs_profiling_dir), obs_profiling_dir)) diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..2a1816ac38916bba11b1ddc92fa984c70d2d847f --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Entry point for AWD LSTM.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import os +import pickle +import sys +import time + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import lstm_lib +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils +from tensorflow.contrib import training as contrib_training + +flags = tf.app.flags +gfile = tf.gfile +FLAGS = flags.FLAGS + +flags.DEFINE_boolean('reset_output_dir', False, '') +flags.DEFINE_string('output_dir', None, '') +flags.DEFINE_string('data_path', None, '') + +flags.DEFINE_integer('log_every', 200, '') + + +def get_ops(params, x_train, x_valid, x_test): + """Build [train, valid, test] graphs.""" + + lm = lstm_lib.LM(params, x_train, x_valid, x_test) + params.add_hparam('num_train_batches', lm.num_train_batches) + ops = { + 'train_op': lm.train_op, + 'learning_rate': lm.learning_rate, + 'grad_norm': lm.grad_norm, + 'train_loss': lm.train_loss, + 'global_step': tf.train.get_or_create_global_step(), + 'reset_batch_states': lm.batch_init_states['reset'], + 'eval_valid': lm.eval_valid, + 'eval_test': lm.eval_test, + + 'reset_start_idx': lm.reset_start_idx, + 'should_reset': lm.should_reset, + 'moving_avg_started': lm.moving_avg_started, + 'update_moving_avg': lm.update_moving_avg_ops, + 'start_moving_avg': lm.start_moving_avg_op, + } + print('-' * 80) + print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True))) + + return ops + + +def train(params): + """Entry point for training.""" + with gfile.GFile(params.data_path, 'rb') as finp: + x_train, x_valid, x_test, _, _ = pickle.load(finp) + print('-' * 80) + print('train_size: {0}'.format(np.size(x_train))) + print('valid_size: {0}'.format(np.size(x_valid))) + print(' test_size: {0}'.format(np.size(x_test))) + + g = tf.Graph() + with g.as_default(): + ops = get_ops(params, x_train, x_valid, x_test) + run_ops = [ + ops['train_loss'], + ops['grad_norm'], + ops['learning_rate'], + ops['should_reset'], + ops['moving_avg_started'], + ops['train_op'], + ] + + saver = tf.train.Saver(max_to_keep=5) + checkpoint_saver_hook = tf.train.CheckpointSaverHook( + params.output_dir, save_steps=params.num_train_batches, saver=saver) + hooks = [checkpoint_saver_hook] + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, + checkpoint_dir=params.output_dir) + accum_loss = 0 + accum_step = 0 + epoch = 0 + best_valid_ppl = [] + start_time = time.time() + while True: + sess.run(ops['reset_batch_states']) + loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(run_ops) + accum_loss += loss + accum_step += 1 + step = sess.run(ops['global_step']) + if step % params.log_every == 0: + train_ppl = np.exp(accum_loss / accum_step) + mins_so_far = (time.time() - start_time) / 60. 
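+        # assemble one human-readable log line every `log_every` steps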
+ log_string = 'epoch={0:<5d}'.format(epoch) + log_string += ' step={0:<7d}'.format(step) + log_string += ' ppl={0:<9.2f}'.format(train_ppl) + log_string += ' lr={0:<10.7f}'.format(lr) + log_string += ' |g|={0:<5.2f}'.format(gn) + log_string += ' avg={0:<2d}'.format(moving_avg_started) + log_string += ' mins={0:<.2f}'.format(mins_so_far) + print(log_string) + + if moving_avg_started: + sess.run(ops['update_moving_avg']) + + # if step % params.num_train_batches == 0: + if should_reset: + epoch += 1 + accum_loss = 0 + accum_step = 0 + valid_ppl = ops['eval_valid'](sess, use_moving_avg=moving_avg_started) + sess.run([ops['reset_batch_states'], ops['reset_start_idx']]) + if (not moving_avg_started and + len(best_valid_ppl) > params.best_valid_ppl_threshold and + valid_ppl > min(best_valid_ppl[:-params.best_valid_ppl_threshold])): + print('Starting moving_avg') + sess.run(ops['start_moving_avg']) + best_valid_ppl.append(valid_ppl) + + if step >= params.num_train_steps: + ops['eval_test'](sess, use_moving_avg=moving_avg_started) + break + sess.close() + + +def main(unused_args): + output_dir = FLAGS.output_dir + print('-' * 80) + if not gfile.IsDirectory(output_dir): + print('Path {} does not exist. Creating'.format(output_dir)) + gfile.MakeDirs(output_dir) + elif FLAGS.reset_output_dir: + print('Path {} exists. Reseting'.format(output_dir)) + gfile.DeleteRecursively(output_dir) + gfile.MakeDirs(output_dir) + + print('-' * 80) + log_file = os.path.join(output_dir, 'stdout') + print('Logging to {}'.format(log_file)) + sys.stdout = utils.Logger(log_file) + + params = contrib_training.HParams( + data_path=FLAGS.data_path, + log_every=FLAGS.log_every, + output_dir=FLAGS.output_dir, + ) + + train(params) + + +if __name__ == '__main__': + tf.app.run() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..576b6f2e2f8f65242e04bc9289110cbc17357229 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/lstm_lib.py @@ -0,0 +1,458 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AWD LSTM model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu import npu_convert_dropout + +import numpy as np +import tensorflow.compat.v1 as tf + +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import data_utils +from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils + +MOVING_AVERAGE_DECAY = 0.9995 + +MOVING_AVERAGE_DECAY = 0.9995 + + +def _gen_mask(shape, drop_prob): + """Generate a droppout mask.""" + keep_prob = 1. 
- drop_prob + mask = tf.random_uniform(shape, dtype=tf.float32) + mask = tf.floor(mask + keep_prob) / keep_prob + return mask + + +def _lstm(x, prev_c, prev_h, w_lstm, layer_masks): + """Multi-layer LSTM. + + Args: + x: [batch_size, num_steps, hidden_size]. + prev_c: [[batch_size, hidden_size] * num_layers]. + prev_h: [[batch_size, hidden_size] * num_layers]. + w_lstm: [[2 * hidden_size, 4 * hidden_size] * num_layers]. + layer_masks: [([hidden_size, hidden_size] or None)* num_layers]. + + Returns: + next_c: [[batch_size, hidden_size] * num_layers]. + next_h: [[batch_size, hidden_size] * num_layers]. + all_h: [batch_size, num_steps, hidden_size]. + """ + _, num_steps, _ = tf.unstack(tf.shape(x)) + num_layers = len(w_lstm) + + all_h = [tf.TensorArray(dtype=tf.float32, size=num_steps, infer_shape=False) + for _ in range(num_layers)] + + def _condition(step, *unused_args): + return tf.less(step, num_steps) + + def _body(step, pprev_c, pprev_h, all_h): + """Apply LSTM at each step.""" + next_c, next_h = [], [] + for layer_id, (p_c, p_h, w, m) in enumerate(zip( + pprev_c, pprev_h, w_lstm, layer_masks)): + inp = x[:, step, :] if layer_id == 0 else next_h[-1] + if m is not None: + inp *= m + ifog = tf.matmul(tf.concat([inp, p_h], axis=1), w) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + c = i * g + f * p_c + h = o * tf.tanh(c) + all_h[layer_id] = all_h[layer_id].write(step, h) + next_c.append(c) + next_h.append(h) + return step + 1, next_c, next_h, all_h + + loop_inps = [tf.constant(0, dtype=tf.int32), prev_c, prev_h, all_h] + _, next_c, next_h, all_h = tf.while_loop(_condition, _body, loop_inps, + parallel_iterations=1) + all_h = [tf.transpose(h.stack(), [1, 0, 2]) + for h in all_h] + + return next_c, next_h, all_h + + +def _set_default_params(params): + """Set default parameters.""" + params.add_hparam('alpha', 2.) # activation L2 reg + params.add_hparam('best_valid_ppl_threshold', 7) + params.add_hparam('beta', 1.) # activation slowness reg + + params.add_hparam('batch_size', 12) + params.add_hparam('bptt_steps', 70) + + # for dropouts: dropping rate, NOT keeping rate + params.add_hparam('drop_e', 0.10) # word + params.add_hparam('drop_i', 0.65) # embeddings + params.add_hparam('drop_l', 0.30) # between layers + params.add_hparam('drop_o', 0.40) # output + params.add_hparam('drop_w', 0.50) # weight + + params.add_hparam('emb_size', 400) + params.add_hparam('start_decay_epoch', 14) + params.add_hparam('decay_every_epoch', 1) + params.add_hparam('decay_rate', 0.98) + params.add_hparam('grad_bound', 0.25) + params.add_hparam('hidden_size', 1100) + params.add_hparam('init_range', 0.1) + params.add_hparam('learning_rate', 20.) 
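+  # Model-size and schedule defaults below roughly follow the AWD-LSTM PTB setup.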
+ params.add_hparam('num_layers', 3) + params.add_hparam('num_train_epochs', 500) + params.add_hparam('vocab_size', 10000) + + params.add_hparam('weight_decay', 1.2e-6) + return params + + +class LM(object): + """Language model.""" + + def __init__(self, params, x_train, x_valid, x_test, name='language_model'): + print('-' * 80) + print('Building LM') + + self.params = _set_default_params(params) + self.name = name + + # train data + (self.x_train, self.y_train, + self.num_train_batches, self.reset_start_idx, + self.should_reset, self.base_bptt) = data_utils.input_producer( + x_train, params.batch_size, params.bptt_steps, random_len=True) + params.add_hparam( + 'num_train_steps', self.num_train_batches * params.num_train_epochs) + + # valid data + (self.x_valid, self.y_valid, + self.num_valid_batches) = data_utils.input_producer( + x_valid, params.batch_size, params.bptt_steps) + + # test data + (self.x_test, self.y_test, + self.num_test_batches) = data_utils.input_producer(x_test, 1, 1) + + params.add_hparam('start_decay_step', + params.start_decay_epoch * self.num_train_batches) + params.add_hparam('decay_every_step', + params.decay_every_epoch * self.num_train_batches) + + self._build_params() + self._build_train() + self._build_valid() + self._build_test() + + def _build_params(self): + """Create and count model parameters.""" + print('-' * 80) + print('Building model params') + with tf.variable_scope(self.name): + with tf.variable_scope('embedding'): + initializer = tf.initializers.random_uniform( + -self.params.init_range, self.params.init_range) + w_emb = tf.get_variable( + 'w', [self.params.vocab_size, self.params.emb_size], + initializer=initializer) + dropped_w_emb = tf.layers.dropout( + w_emb, self.params.drop_e, [self.params.vocab_size, 1], + training=True) + + w_lstm = [] + dropped_w_lstm = [] + with tf.variable_scope('lstm'): + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + init_range = 1.0 / np.sqrt(hid_size) + initializer = tf.initializers.random_uniform(-init_range, init_range) + with tf.variable_scope('layer_{0}'.format(i)): + w = tf.get_variable('w', [inp_size + hid_size, 4 * hid_size], + initializer=initializer) + i_mask = tf.ones([inp_size, 4 * hid_size], dtype=tf.float32) + h_mask = _gen_mask([hid_size, 4 * hid_size], self.params.drop_w) + mask = tf.concat([i_mask, h_mask], axis=0) + dropped_w = w * mask + w_lstm.append(w) + dropped_w_lstm.append(dropped_w) + + with tf.variable_scope('init_states'): + batch_prev_c, batch_prev_h, batch_reset = [], [], [] + test_prev_c, test_prev_h, test_reset = [], [], [] + for i in range(self.params.num_layers): + inp_size = self.params.emb_size if i == 0 else self.params.hidden_size + hid_size = (self.params.emb_size if i == self.params.num_layers - 1 + else self.params.hidden_size) + + with tf.variable_scope('layer_{0}'.format(i)): + with tf.variable_scope('batch'): + init_shape = [self.params.batch_size, hid_size] + batch_prev_c.append(tf.get_variable( + 'c', init_shape, dtype=tf.float32, trainable=False)) + batch_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + batch_reset.append(tf.assign(batch_prev_c[-1], zeros)) + batch_reset.append(tf.assign(batch_prev_h[-1], zeros)) + with tf.variable_scope('test'): + init_shape = [1, hid_size] + test_prev_c.append(tf.get_variable( + 'c', 
init_shape, dtype=tf.float32, trainable=False)) + test_prev_h.append(tf.get_variable( + 'h', init_shape, dtype=tf.float32, trainable=False)) + zeros = np.zeros(init_shape, dtype=np.float32) + test_reset.append(tf.assign(test_prev_c[-1], zeros)) + test_reset.append(tf.assign(test_prev_h[-1], zeros)) + + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + print('Model has {0} params'.format(num_params)) + + self.batch_init_states = { + 'c': batch_prev_c, + 'h': batch_prev_h, + 'reset': batch_reset, + } + self.train_params = { + 'w_emb': dropped_w_emb, + 'w_lstm': dropped_w_lstm, + 'w_soft': w_emb, + } + self.test_init_states = { + 'c': test_prev_c, + 'h': test_prev_h, + 'reset': test_reset, + } + self.eval_params = { + 'w_emb': w_emb, + 'w_lstm': w_lstm, + 'w_soft': w_emb, + } + + def _forward(self, x, y, model_params, init_states, is_training=False): + """Computes the logits. + + Args: + x: [batch_size, num_steps], input batch. + y: [batch_size, num_steps], output batch. + model_params: a `dict` of params to use. + init_states: a `dict` of params to use. + is_training: if `True`, will apply regularizations. + + Returns: + loss: scalar, cross-entropy loss + """ + w_emb = model_params['w_emb'] + w_lstm = model_params['w_lstm'] + w_soft = model_params['w_soft'] + prev_c = init_states['c'] + prev_h = init_states['h'] + + emb = tf.nn.embedding_lookup(w_emb, x) + if is_training: + emb = tf.layers.dropout( + emb, self.params.drop_i, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + layer_masks = [None] + for _ in range(1, self.params.num_layers - 1): + mask = _gen_mask([self.params.batch_size, self.params.hidden_size], + self.params.drop_l) + layer_masks.append(mask) + layer_masks.append(None) + else: + layer_masks = [None] * self.params.num_layers + + out_c, out_h, all_h = _lstm(emb, prev_c, prev_h, w_lstm, layer_masks) + top_h = all_h[-1] + if is_training: + top_h = tf.layers.dropout( + top_h, self.params.drop_o, + [self.params.batch_size, 1, self.params.emb_size], training=True) + + carry_on = [] + for var, val in zip(prev_c + prev_h, out_c + out_h): + carry_on.append(tf.assign(var, val)) + + logits = tf.einsum('bnh,vh->bnv', top_h, w_soft) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, + logits=logits) + loss = tf.reduce_mean(loss) # TODO(hyhieu): watch for num_steps + + reg_loss = loss # loss + regularization_terms, for training only + if is_training: + # L2 weight reg + reg_loss += self.params.weight_decay * tf.add_n( + [tf.reduce_sum(w ** 2) for w in tf.trainable_variables()]) + + # activation L2 reg + reg_loss += self.params.alpha * tf.add_n( + [tf.reduce_mean(h ** 2) for h in all_h[:-1]]) + + # activation slowness L2 reg + reg_loss += self.params.beta * tf.add_n( + [tf.reduce_mean((h[:, 1:, :] - h[:, :-1, :]) ** 2) + for h in all_h[:-1]]) + + with tf.control_dependencies(carry_on): + loss = tf.identity(loss) + if is_training: + reg_loss = tf.identity(reg_loss) + + return reg_loss, loss + + def _build_train(self): + """Build training ops.""" + print('-' * 80) + print('Building train graph') + reg_loss, loss = self._forward(self.x_train, self.y_train, + self.train_params, self.batch_init_states, + is_training=True) + + tf_vars = tf.trainable_variables() + global_step = tf.train.get_or_create_global_step() + lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) / + tf.cast(self.params.bptt_steps, dtype=tf.float32)) + learning_rate = utils.get_lr(global_step, self.params) * lr_scale + # learning_rate = tf.Print( + # 
learning_rate, + # [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)], + # message='lr: ', summarize=3) + grads = tf.gradients(reg_loss, tf_vars) + clipped_grads, grad_norm = tf.clip_by_global_norm(grads, + self.params.grad_bound) + + (self.update_moving_avg_ops, self.use_moving_avg_vars, + self.restore_normal_vars) = self._create_average_ops() + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars), + global_step=global_step) + + self.train_loss = loss + self.train_op = train_op + self.grad_norm = grad_norm + self.learning_rate = learning_rate + + def _create_average_ops(self): + """Build moving average ops.""" + print('Creating moving average ops') + + with tf.variable_scope('moving_avg_flag'): + self.moving_avg_started = tf.get_variable( + 'flag', [], tf.int32, initializer=tf.initializers.zeros(), + trainable=False) + self.start_moving_avg_op = tf.assign(self.moving_avg_started, 1) + + all_vars = tf.trainable_variables() + average_pairs = [] + var_cnt = 0 + with tf.variable_scope('average'): + for v in all_vars: + avg_v = tf.get_variable( + str(var_cnt), shape=v.shape, dtype=v.dtype, + initializer=tf.zeros_initializer, trainable=False) + var_cnt += 1 + average_pairs.append([v, avg_v]) + backup_pairs = [] + var_cnt = 0 + with tf.variable_scope('backup'): + for v in all_vars: + backup_v = tf.get_variable(str(var_cnt), shape=v.shape, dtype=v.dtype, + trainable=False) + var_cnt += 1 + backup_pairs.append([v, backup_v]) + + with tf.variable_scope('avg_step'): + avg_step = tf.get_variable('step', [], dtype=tf.float32, trainable=False) + + with tf.control_dependencies([tf.assign_add(avg_step, 1.0)]): + average_op = [] + for v, avg_v in average_pairs: + mu = 1 / avg_step + new_avg = mu * v + (1 - mu) * avg_v + with tf.control_dependencies([new_avg]): + average_op.append(tf.assign(avg_v, new_avg)) + + assert len(average_pairs) == len(all_vars) + assert len(average_pairs) == len(backup_pairs) + use_average_op = [] + for i in range(len(average_pairs)): + v, avg_v = average_pairs[i] + _, backup_v = backup_pairs[i] + with tf.control_dependencies([tf.assign(backup_v, v)]): + use_average_op.append(tf.assign(v, avg_v)) + use_average_op = tf.group(*use_average_op) + + reverse_average_op = [] + for v, backup_v in backup_pairs: + reverse_average_op.append(tf.assign(v, backup_v)) + reverse_average_op = tf.group(*reverse_average_op) + + return average_op, use_average_op, reverse_average_op + + def _build_valid(self): + print('Building valid graph') + _, loss = self._forward(self.x_valid, self.y_valid, + self.eval_params, self.batch_init_states) + self.valid_loss = loss + + def _build_test(self): + print('Building test graph') + _, loss = self._forward(self.x_test, self.y_test, + self.eval_params, self.test_init_states) + self.test_loss = loss + + def eval_valid(self, sess, use_moving_avg=False): + """Eval 1 round on valid set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.batch_init_states['reset']]) + for _ in range(self.num_valid_batches): + total_loss += sess.run(self.valid_loss) + valid_ppl = np.exp(total_loss / self.num_valid_batches) + print('valid_ppl={0:<.2f}'.format(valid_ppl)) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return valid_ppl + + def eval_test(self, sess, use_moving_avg=False): + """Eval 1 round on test set.""" + total_loss = 0 + if use_moving_avg: + sess.run([self.use_moving_avg_vars, self.test_init_states['reset']]) + for step in 
range(self.num_test_batches): + total_loss += sess.run(self.test_loss) + if (step + 1) % 1000 == 0: + test_ppl = np.exp(total_loss / (step + 1)) + log_string = 'step={0}'.format(step + 1) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + test_ppl = np.exp(total_loss / self.num_valid_batches) + log_string = 'step={0}'.format(self.num_test_batches) + log_string += ' test_ppl={0:<.2f}'.format(test_ppl) + print(log_string) + if use_moving_avg: + sess.run(self.restore_normal_vars) + + return test_ppl diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py new file mode 100644 index 0000000000000000000000000000000000000000..9a880431333b12118e42952c9908ce2af5ac3229 --- /dev/null +++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/process.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Preprocess Penn-Treebank dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import pickle +import numpy as np +import os + + +def main(): + dataFolder = "/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/dataset/Penn_Treebank_dataset" + dataList = os.listdir(dataFolder) + dataPath = {} + for dataName in dataList: + dataPath[dataName] = os.path.join(dataFolder, dataName) + + with open(dataPath.get("ptb.train.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + vocab, index = {}, {} + for word in sorted(words): + if word not in vocab: + index[len(vocab)] = word + vocab[word] = len(vocab) + print('vocab size: {}'.format(len(vocab))) + + x_train = [vocab[word] for word in words] + [vocab['']] + x_train = np.array(x_train, dtype=np.int32) + + with open(dataPath.get('ptb.valid.txt')) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_valid = [vocab[word] for word in words] + [vocab['']] + x_valid = np.array(x_valid, dtype=np.int32) + + with open(dataPath.get("ptb.test.txt")) as finp: + lines = finp.read().strip().replace('\n', '') + words = lines.split(' ') + + x_test = [vocab[word] for word in words] + [vocab['']] + x_test = np.array(x_test, dtype=np.int32) + + print('train size: {}'.format(np.size(x_train))) + print('valid size: {}'.format(np.size(x_valid))) + print('test size: {}'.format(np.size(x_test))) + + with open('ptb/ptb.pkl', 'wb') as fout: + pickle.dump((x_train, x_valid, x_test, vocab, index), fout, protocol=2) + + +if __name__ == '__main__': + main() diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py new file mode 100644 index 0000000000000000000000000000000000000000..4d73e2b37b89396e6ec485d6863ea8acfe074a48 --- /dev/null +++ 
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d73e2b37b89396e6ec485d6863ea8acfe074a48
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.py
@@ -0,0 +1,288 @@
+# coding=utf-8
+# Copyright 2021 The Google Research Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Entry point for AWD ENAS search process."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from npu_bridge.npu_init import *
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+
+import os
+import pickle
+import sys
+import time
+
+sys.path.append("/home/ma-user/modelarts/user-job-dir/")
+
+import numpy as np
+import tensorflow.compat.v1 as tf
+
+from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import child
+from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import controller
+from enas_lm_npu_for_TensorFlow.enas_lm_npu_20211114162907.src import utils
+from tensorflow.contrib import training as contrib_training
+
+
+flags = tf.app.flags
+gfile = tf.gfile
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string('output_dir', None, '')
+flags.DEFINE_string('data_path', None, '')
+flags.DEFINE_string("obs_dir", "obs://rstg/log", "OBS result path; not needed on GPU or Apulis platforms")
+
+
+## Other parameters
+flags.DEFINE_boolean('reset_output_dir', False, '')
+flags.DEFINE_string("platform", "apulis", "Run on apulis/modelarts platform. Modelarts platform has some extra data copy operations")
+
+flags.DEFINE_integer('log_every', 20, '')
+
+
+
+def get_ops(params, x_train, x_valid):
+  """Build [train, valid, test] graphs."""
+
+  ct = controller.Controller(params=params)
+  lm = child.LM(params, ct, x_train, x_valid)
+  ct.build_trainer(lm)
+  params.add_hparam('num_train_batches', lm.num_train_batches)
+  ops = {
+      'train_op': lm.train_op,
+      'learning_rate': lm.learning_rate,
+      'grad_norm': lm.grad_norm,
+      'train_loss': lm.train_loss,
+      'l2_reg_loss': lm.l2_reg_loss,
+      'global_step': tf.train.get_or_create_global_step(),
+      'reset_batch_states': lm.batch_init_states['reset'],
+      'eval_valid': lm.eval_valid,
+
+      'reset_start_idx': lm.reset_start_idx,
+      'should_reset': lm.should_reset,
+      'bptt_rate': lm.bptt_rate,
+
+      'controller_train_op': ct.train_op,
+      'controller_grad_norm': ct.train_op,
+      'controller_sample_arc': ct.sample_arc,
+      'controller_entropy': ct.sample_entropy,
+      'controller_reward': ct.reward,
+      'controller_baseline': ct.baseline,
+      'controller_optimizer': ct.optimizer,
+      'controller_train_fn': ct.train,
+
+  }
+  print('-' * 80)
+  print('HParams:\n{0}'.format(params.to_json(indent=2, sort_keys=True)))
+
+  return ops
+
+def load_ckpt_model(sess, save_path):
+  print("reload model from:{}".format(save_path))
+  checkpoint = tf.train.get_checkpoint_state(save_path)  # read the checkpoint state from the checkpoint file
+  input_checkpoint = checkpoint.model_checkpoint_path
+  saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)  # load the graph structure
+  saver.restore(sess, input_checkpoint)  # restore the latest weights
+  sess.run(tf.global_variables_initializer())  # initialize all variables (note: this also re-initializes the restored weights)
+
+def train(params):
+  """Entry train function."""
+  print("data_path:{}".format(params.data_path))
+  print("output_dir:{}".format(params.output_dir))
+  with gfile.GFile(params.data_path, 'rb') as finp:
+    x_train, x_valid, _, _, _ = pickle.load(finp)
+    print('-' * 80)
+    print('train_size: {0}'.format(np.size(x_train)))
+    print('valid_size: {0}'.format(np.size(x_valid)))
+
+
+  g = tf.Graph()
+  with g.as_default():
+    tf.random.set_random_seed(2126)
+    ops = get_ops(params, x_train, x_valid)
+    run_ops = [
+        ops['train_loss'],
+        ops['l2_reg_loss'],
+        ops['grad_norm'],
+        ops['learning_rate'],
+        ops['should_reset'],
+        ops['train_op'],
+    ]
+
+    saver = tf.train.Saver(max_to_keep=5)
+    checkpoint_saver_hook = tf.train.CheckpointSaverHook(
+        params.output_dir, save_steps=params.num_train_batches, saver=saver)
+    hooks = [checkpoint_saver_hook]
+    hooks.append(ops['controller_optimizer'].make_session_run_hook(True))
+
+    # >>> add code >>
+    # create the session config
+    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
+    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True  # run training on the Ascend AI processor
+    custom_op.parameter_map["mix_compile_mode"].b = False  # mixed device/host compilation; configure as needed, off by default
+    # custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")  # enable mixed precision
+    custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fusion_switch.cfg")
+    # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("/home/ma-user/modelarts/inputs/data_url_0")
+    #
+    # custom_op.parameter_map["enable_dump_debug"].b = True
+    # custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
+    # # custom_op.parameter_map["enable_data_pre_proc"].b = True  # sinking the GetNext op to the device is required for loop sinking
+    # # custom_op.parameter_map[
+    # #     "iterations_per_loop"].i = 10  # keep consistent with the value passed to set_iteration_per_loop; used to decide whether training iterations are sunk to the device
+    #
+    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF  # must be disabled explicitly
+    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF  # must be disabled explicitly
+    # sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks, checkpoint_dir=params.output_dir)
+    # >>> add code >>
+
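The block above is the NPU-specific core of the script. As an aside, the same settings can be factored into a small helper; this is a sketch, not part of the original code, and `npu_session_config` is a name introduced here for illustration.

```python
# Sketch: the NPU session configuration used above, factored into a reusable helper.
# Requires npu_bridge (imported at the top of this file) so that "NpuOptimizer" is registered.
import tensorflow.compat.v1 as tf
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig


def npu_session_config(mix_precision=False):
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    custom_op.parameter_map["use_off_line"].b = True       # run training on the Ascend device
    custom_op.parameter_map["mix_compile_mode"].b = False  # no mixed device/host compilation
    if mix_precision:
        custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
    # Both graph rewriters must be switched off explicitly when NpuOptimizer is used.
    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
    return config
```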
+    sess = tf.train.SingularMonitoredSession(config=config, hooks=hooks,
+                                              checkpoint_dir=params.output_dir)
+    # reload model
+    if len(gfile.ListDirectory(params.output_dir)):
+      last_checkpoint = tf.train.latest_checkpoint(params.output_dir)
+      print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+      saver.restore(sess, last_checkpoint)
+
+    accum_loss = 0
+    accum_step = 0
+    epoch = sess.run(ops['global_step']) // params.num_train_batches
+    best_valid_ppl = []
+    start_time = time.time()
+    last_mins = (time.time() - start_time) / 60
+    accum_rate = 0.
+    # sess.run(tf.global_variables_initializer())
+    while True:
+      try:
+        # run_ops = [
+        #     ops['train_loss'],
+        #     ops['l2_reg_loss'],
+        #     ops['grad_norm'],
+        #     ops['learning_rate'],
+        #     ops['should_reset'],
+        #     ops['train_op'],
+        # ]
+        # modification point: the combined fetch above was split into separate sess.run calls
+        # loss, l2_reg, gn, lr, should_reset, _ = sess.run(run_ops)
+        loss = sess.run(ops['train_loss'])
+        # print("loss_OK:loss:{}".format(loss))
+        l2_reg = sess.run(ops['l2_reg_loss'])
+        # print("l2_reg_OK:l2_reg:{}".format(l2_reg))
+        gn = sess.run(ops['grad_norm'])
+        # gn = -111111
+        # print("gn_OK:gn:{}".format(gn))
+        lr = sess.run(ops['learning_rate'])
+        # print("lr_OK:le:{}".format(lr))
+        should_reset = sess.run(ops['should_reset'])
+        _ = sess.run(ops["train_op"])
+
+        bptt_rate = sess.run(ops['bptt_rate'])
+        # print("should_reset_OK:should_reset:{}".format(should_reset))
+        # if not should_not_train :
+        #   _ = sess.run(ops["train_op"])
+
+        accum_loss += loss
+        accum_step += 1
+        accum_rate += bptt_rate
+        step = sess.run(ops['global_step'])
+        if step % params.log_every == 0:
+          train_ppl = np.exp(accum_loss / accum_step)
+          mins_so_far = (time.time() - start_time) / 60.
+          mins_pices = mins_so_far - last_mins
+          last_mins = mins_so_far
+          log_string = 'epoch={0:<5d}'.format(epoch)
+          log_string += ' step={0:<7d}/{1:<6d}'.format(step, params.num_train_steps)
+          log_string += ' ppl={0:<9.2f}'.format(train_ppl)
+          log_string += ' lr={0:<7.2f}'.format(lr)
+          log_string += ' |w|={0:<6.2f}'.format(l2_reg)
+          log_string += ' |g|={0:<6.2f}'.format(gn)
+          log_string += ' mins={0:<.2f}-min/step={1:<.4f}'.format(mins_so_far, mins_pices/params.log_every)
+          # log_string += ' accum_rate(rate of a epoch)={0:<4.4f}'.format(accum_rate)
+          # log_string += ' should_reset:{}'.format(should_reset)
+          print(log_string)
+
+        if should_reset:
+          accum_rate = 0.
+          print("should_reset:{}".format(should_reset))
+          ops['controller_train_fn'](sess, ops['reset_batch_states'])
+          epoch += 1
+          accum_loss = 0
+          accum_step = 0
+          valid_ppl = ops['eval_valid'](sess)
+          sess.run([ops['reset_batch_states'], ops['reset_start_idx']])
+          best_valid_ppl.append(valid_ppl)
+
+        if step % (params.num_train_batches * 10) == 0:
+          if FLAGS.platform.lower() == 'modelarts':
+            from help_modelarts import modelarts_result2obs
+            modelarts_result2obs(FLAGS)
+        if step >= params.num_train_steps:
+          if FLAGS.platform.lower() == 'modelarts':
+            from help_modelarts import modelarts_result2obs
+            modelarts_result2obs(FLAGS)
+          break
+      except tf.errors.InvalidArgumentError:
+        if FLAGS.platform.lower() == 'modelarts':
+          from help_modelarts import modelarts_result2obs
+          modelarts_result2obs(FLAGS)
+        last_checkpoint = tf.train.latest_checkpoint(params.output_dir)
+        print('rolling back to previous checkpoint {0}'.format(last_checkpoint))
+        saver.restore(sess, last_checkpoint)
+    sess.close()
+
+
+def main(unused_args):
+
+  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.logging.info("**********")
+  print("===>>>data_path:{}".format(FLAGS.data_path))
+  print("===>>>output_dir:{}".format(FLAGS.output_dir))
+  print("===>>>obs_dir:{}".format(FLAGS.obs_dir))
+  print("===>>>train_step:{}".format(FLAGS.num_train_epochs))
+
+  np.set_printoptions(precision=3, suppress=True, threshold=int(1e9),
+                      linewidth=80)
+
+  print('-' * 80)
+  if not gfile.IsDirectory(FLAGS.output_dir):
+    print('Path {} does not exist. Creating'.format(FLAGS.output_dir))
+    gfile.MakeDirs(FLAGS.output_dir)
+  elif FLAGS.reset_output_dir:
+    print('Path {} exists. Resetting'.format(FLAGS.output_dir))
+    gfile.DeleteRecursively(FLAGS.output_dir)
+    gfile.MakeDirs(FLAGS.output_dir)
+
+  print('-' * 80)
+  log_file = os.path.join(FLAGS.output_dir, 'stdout')
+  print('Logging to {}'.format(log_file))
+  sys.stdout = utils.Logger(log_file)
+
+  params = contrib_training.HParams(
+      data_path=FLAGS.data_path,
+      log_every=FLAGS.log_every,
+      output_dir=FLAGS.output_dir,
+  )
+  train(params)
+
+
+
+if __name__ == '__main__':
+  flags.mark_flag_as_required("data_path")
+  flags.mark_flag_as_required("output_dir")
+  flags.mark_flag_as_required("obs_dir")
+  tf.app.run()
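`modelarts_result2obs` is imported from `help_modelarts`, which is not included in this diff. Below is a hypothetical sketch of what such a helper might look like, assuming the standard ModelArts `moxing` file API (`mox.file.copy_parallel`) and the flags defined in this file; the destination sub-directory name is made up.

```python
# Hypothetical sketch of the OBS-sync helper imported in the training loop above;
# help_modelarts.py is not part of this diff.
import os


def modelarts_result2obs(flags):
    """Copy the local training output directory back to the OBS path given by --obs_dir."""
    import moxing as mox  # available inside ModelArts jobs
    dst = os.path.join(flags.obs_dir, 'result')
    mox.file.copy_parallel(flags.output_dir, dst)
    print('Copied {} to {}'.format(flags.output_dir, dst))
```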
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a70f6b23e1d098caccb6d3edd028a5fabc8769cf
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/search.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already.
+### Modelarts Platform command for train
+
+#export ASCEND_GLOBAL_LOG_LEVEL=1   # log level: 0=debug, 1=info, 2=warning, 3=error, 4=null
+export ASCEND_SLOG_PRINT_TO_STDOUT=0   # whether plog output is also printed to stdout
+#export ASCEND_GLOBAL_EVENT_ENABLE=0   # event logging: 0=disabled, 1=enabled
+
+export TF_CPP_MIN_LOG_LEVEL=2   ## TensorFlow API log level
+#export ENABLE_FORCE_V2_CONTROL=1
+
+code_dir=${1}
+data_path=${2}
+output_dir=${3}
+obs_url=${4}
+
+current_time=`date "+%Y-%m-%d-%H-%M-%S"`
+
+python ${code_dir}/search.py \
+        --data_path=${data_path}/ptb.pkl \
+        --output_dir=${output_dir} \
+        --obs_dir=${obs_url} \
+        --platform='modelarts' \
+        2>&1 | tee ${output_dir}/${current_time}_train_npu.log
+
+
+#BASE_PATH='/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow'
+#
+#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/search'
+#
+#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl'
+#
+#args="--output_dir=$OUTPUT_DIR --data_path=$DATA_PATH"
+#
+##run search
+#python /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/search.py $args
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2697bfd90afbe121750ecf6c474e9b9e59c7beca
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/test-npu.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+### Do not need to Configure CANN Environment on Modelarts Platform, because it has been set already.
+### Modelarts Platform command for test
+
+#export ASCEND_GLOBAL_LOG_LEVEL=4   # log level: 0=debug, 1=info, 2=warning, 3=error, 4=null
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1   # whether plog output is also printed to stdout
+#export ASCEND_HOST_LOG_FILE_NUM=1000
+#export ASCEND_LOG_DEVICE_FLUSH_TIMEOUT=0
+#export ASCEND_GLOBAL_EVENT_ENABLE=0   # event logging: 0=disabled, 1=enabled
+#export ASCEND_GLOBAL_TRACE_ENABLE=0
+#export PROFILING_MODE=false
+#export PROFILING_OPTIONS='{"output":"/tmp/profiling","training_trace":"off","task_trace":"off","aicpu":"on","fp_point":"resnet_model/conv2d/Conv2Dresnet_model/batch_normalization/FusedBatchNormV3_Reduce","bp_point":"gradients/AddN_70","aic_metrics":"PipeUtilization"}'
+
+export TF_CPP_MIN_LOG_LEVEL=2   ## TensorFlow API log level
+#export ENABLE_FORCE_V2_CONTROL=1
+
+code_dir=${1}
+data_path=${2}
+output_dir=${3}
+ckp_path=${4}
+
+current_time=`date "+%Y-%m-%d-%H-%M-%S"`
+FIXED_ARC='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0'
+
+nohup python3 ${code_dir}/fixed.py \
+        --data_path=${data_path}/ptb.pkl \
+        --output_dir=${output_dir} \
+        --fixed_arc='0 2 1 0 2 0 3 0 4 2 5 3 5 0 6 0 7 0' \
+        --ckp_path=${ckp_path} \
+        --platform='modelarts' \
+        > nohup1.out 2>&1 &
+
+
+#FIXED_ARC='0 2 1 0 2 1 2 2 4 0 5 0 3 2 6 2'
+#
+#BASE_PATH = '/home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow'
+#
+#OUTPUT_DIR=$BASE_PATH'/enas_lm_npu_20211114162907/src/output/test'
+#
+#DATA_PATH='/home/ma-user/modelarts/inputs/data_url_0/ptb/ptb.pkl'
+#
+#args ='--fixed_arc=FIXED_ARC --output_dir=$OUTPUT_DIR --data_path=$DATA_PATH'
+#
+##run test
+#python3 /home/ma-user/modelarts/user-job-dir/enas_lm_npu_for_TensorFlow/enas_lm_npu_20211114162907/src/fixed.py $args
diff --git a/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b59aec44ac46f74a34d1db8de30cc4340b5be44
--- /dev/null
+++ b/contrib/TensorFlow/Research/nlp/ENAS_ID2053_for_TensorFlow/utils.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from npu_bridge.npu_init import * + +import re +import sys +import tensorflow.compat.v1 as tf + +gfile = tf.gfile + + +class Logger(object): + """Prints to both STDOUT and a file.""" + + def __init__(self, filepath): + self.terminal = sys.stdout + self.log = gfile.GFile(filepath, 'a+') + + def write(self, message): + self.terminal.write(message) + self.terminal.flush() + self.log.write(message) + self.log.flush() + + def flush(self): + self.terminal.flush() + self.log.flush() + + +def get_lr(curr_step, params, lr_decay_rate): + """Compute learning rate at step depends on `params`.""" + lr = tf.constant(params.learning_rate, dtype=tf.float32) + if 'num_warmup_steps' in params and params.num_warmup_steps > 0: + num_warmup_steps = tf.cast(params.num_warmup_steps, dtype=tf.float32) + step = tf.cast(curr_step, dtype=tf.float32) + warmup_lr = params.learning_rate * step / num_warmup_steps + lr = tf.cond(tf.less(step, num_warmup_steps), lambda: warmup_lr, lambda: lr) + return lr * lr_decay_rate + + +def strip_var_name(var_name): + """Strips variable name of sub-strings blocking variable name matching.""" + # Strip trailing number, e.g. convert + # 'lstm/W_0:0' to 'lstm/W_0'. + var_name = re.sub(r':\d+$', '', var_name) + # Strip partitioning info, e.g. convert + # 'W_0/part_3/Adagrad' to 'W_0/Adagrad'. + var_name = re.sub(r'/part_\d+', '', var_name) + return var_name
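As a quick illustration of the two helpers defined in `utils.py` above (illustrative only; the import path depends on where the package is installed):

```python
# Illustration only: behaviour of strip_var_name and get_lr defined above.
from utils import strip_var_name  # adjust the import path to wherever utils.py lives

print(strip_var_name('lstm/W_0:0'))          # -> 'lstm/W_0'
print(strip_var_name('W_0/part_3/Adagrad'))  # -> 'W_0/Adagrad'

# get_lr scales params.learning_rate linearly by curr_step / num_warmup_steps while
# curr_step < params.num_warmup_steps (warm-up), and uses params.learning_rate
# afterwards; in both cases the returned tensor is multiplied by lr_decay_rate.
```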