From 4b82fe576ce1f8a09efafcbc78b5b18977c1de2a Mon Sep 17 00:00:00 2001 From: yys Date: Mon, 28 Nov 2022 16:40:21 +0800 Subject: [PATCH] add code --- .../fasttext_ftodft_for_Tensorflow/README.md | 70 +++ .../__init__.py | 0 .../fasttext_ftodft_for_Tensorflow/cli.py | 214 +++++++++ .../customize_dtypes.cfg | 3 + .../fasttext_ftodft_for_Tensorflow/export.py | 49 ++ .../inference.py | 86 ++++ .../fasttext_ftodft_for_Tensorflow/input.py | 443 ++++++++++++++++++ .../fasttext_ftodft_for_Tensorflow/model.py | 265 +++++++++++ .../requirements.txt | 6 + .../settings.py | 302 ++++++++++++ .../test/train_full_1p.sh | 157 +++++++ .../test/train_performance_1p.sh | 156 ++++++ .../training.py | 168 +++++++ 13 files changed, 1919 insertions(+) create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/README.md create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/__init__.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/cli.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/customize_dtypes.cfg create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/export.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/inference.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/input.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/model.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/requirements.txt create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/settings.py create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_full_1p.sh create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_performance_1p.sh create mode 100644 TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/training.py diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/README.md b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/README.md new file mode 100644 index 000000000..026634421 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/README.md @@ -0,0 +1,70 @@ +# FToDTF - FastText on Distributed TensorFlow + +This software uses unsupervised machine-learning to calculate vector-representation of words. These vector representations can then be used for things like computing the similarity of words to each other or association-rules (e.g. paris is to france like X to germany). + +This software is an implementation of https://arxiv.org/abs/1607.04606 (facebook's fasttext) in tensorflow on Ascend 910 environment. + +In contrast to the original implementation of fasttext (https://github.com/facebookresearch/fastText) this implementation can use GPUs to accelerate the training and the training can be distributed across multiple nodes. + +## Datasets +Any language corpus according to the supported languages mentioned in the fasttext original paper +Dataset has to be preprocessed before training + +## Running +``` +python3 cli.py preprocess --corpus_path +python3 cli.py train (use `--` to add training parameters) +``` +in your console. +This will run the training and will periodically store checkpoints of the current model into the ./log folder. 
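+For example, a complete run with explicit hyperparameters could look like this (the corpus filename and the flag values below are only placeholders; every flag corresponds to an attribute of FasttextSettings in settings.py):
+```
+python3 cli.py preprocess --corpus_path my_corpus.txt
+python3 cli.py train --steps=100000 --batch_size=128 --embedding_size=300 --log_dir=./log
+```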
+After you have trained for some time you can try out the trained word-vectors:
+```
+python3 cli.py infer similarities i you one two
+```
+This will load the latest model stored in ./log and use it to calculate and print the similarity between the words i, you, one and two. If everything works out, "i" should be similar to "you" and "one" should be similar to "two", while all other combinations should be comparatively dissimilar.
+
+## Docker
+This application is also available as a pre-built Docker image (https://hub.docker.com/r/dbaumgarten/ftodtf/)
+```
+sudo docker run --rm -it -v `pwd`:/data dbaumgarten/ftodtf train
+```
+
+## Distributed Setup
+### Docker
+There is a docker-compose file demonstrating the distributed setup of this program. To run a cluster on your local machine:
+- go to the directory of the docker-compose file
+- preprocess your data using `python3 cli.py preprocess --corpus_path `
+- run:
+```
+sudo docker-compose up
+```
+This will start a cluster consisting of two workers and two parameter servers on your machine.
+Each time you restart the cluster it will continue from the last checkpoint. If you want to start from scratch, delete the contents of ./log/distributed on the server of worker0.
+Please note that running a cluster on a single machine is slower than running a single instance directly on that machine. To see a speedup you will need to use multiple independent machines.
+### Slurm
+There is also an example of how to use Slurm to set up distributed training (slurmjob.sh). You will probably have to modify the script to work on your specific cluster. Please note that the Slurm script currently only handles training. You will have to create training batches (fasttext preprocess) and copy the created batch files to the cluster nodes manually before starting training.
+
+## Training-data
+The input for the preprocess step is a raw text file containing many sentences of the language for which you want to compute word-embeddings.
+
+## Hyperparameters and Quality
+The quality of the calculated word-vectors depends heavily on the training corpus and the hyperparameters (training steps, embedding dimension etc.). If you don't get useful results, try changing the default hyperparameters (especially the number of training steps can have a big influence) or use other training data.
+
+We got very good results for German with 81MB of training data and the parameters --num_buckets=2000000 --vocabulary_size=200000 --steps=10000000, but the resulting model is quite large (2.5GB) and it took >10 hours to train.
+
+## Known Bugs and Limitations
+- When supplying input text that does not contain sentences (but instead just a bunch of words without punctuation), ```fasttext preprocess``` will hang indefinitely.
+ +## 训练结果 + +- 精度结果比对 + +| 精度指标项 | GPU/论文实测 | NPU实测 | +| ---- | ---- | ---- | +| LOSS | 2.5644302148967983 | 2.5319034051969647 | + +- 性能结果比对 + +| 性能指标项 | GPU/论文实测 | NPU实测 | +| ---- | ---- | ---- | +| Steptime | 0.004117727279663086 | 0.0055217742919921875 | diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/__init__.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/cli.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/cli.py new file mode 100644 index 000000000..655b3eae2 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/cli.py @@ -0,0 +1,214 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" This module handles parsing of cli-flags and then calls the needed function +from the library""" +import sys +import argparse +from multiprocessing import Process +from multiprocessing.queues import Empty + +from tqdm import tqdm + +import model +import training +import input +import inference +import export +from settings import FasttextSettings + +PREPROCESS_REQUIRED_PARAMETERS = ["corpus_path"] +TRAIN_REQUIRED_PARAMETERS = [] +SETTINGS = FasttextSettings() +PARSER = argparse.ArgumentParser( + description="Unsupervised training of word-vector-embeddings.") + +SUBPARSER = PARSER.add_subparsers(dest="command") +PREPROCESS_PARSER = SUBPARSER.add_parser( + "preprocess", help="Convert raw text to training-data for use in the train step.") +TRAIN_PARSER = SUBPARSER.add_parser( + "train", help="Train word-embeddings using previously created training-data.") + +INFER_PARSER = SUBPARSER.add_parser("infer", help="Use trained embeddings.") +INFER_SUBPARSER = INFER_PARSER.add_subparsers(dest="subcommand") +INFER_SIMILARITIES = INFER_SUBPARSER.add_parser( + "similarities", help="Compute the similarities between given words.") + +EXPORT_PARSER = SUBPARSER.add_parser( + "export", help="Export trained embeddings.") +EXPORT_SUBPARSER = EXPORT_PARSER.add_subparsers(dest="subcommand") +EXPORT_EMBEDDINGS = EXPORT_SUBPARSER.add_parser( + "embeddings", help="Export the embeddings in tensorflows checkpoint-format. 
Can be used for fasttext infer and is smaller than regular training-chekpoints") + + +def add_arguments_to_parser(arglist, parser, required, group=None): + """ Adds arguments (obtained from the settings-class) to an agrparse-parser + + :param list(str) arglist: A list of strings representing the names of the flags to add + :param argparse.ArgumentParser parser: The parser to add the arguments to + :param list(str) required: A list of argument-names that are required for the command + :param str group: If set place the arguments in an argument-group of the specified name + """ + if group: + parser = parser.add_argument_group(group) + for parameter, default in filter(lambda x: x[0] in arglist, vars(SETTINGS).items()): + parser.add_argument("--"+parameter, type=type(default), + help=SETTINGS.attribute_docstring(parameter), + required=parameter in required, default=default) + + +add_arguments_to_parser(SETTINGS.preprocessing_settings(), + PREPROCESS_PARSER, + PREPROCESS_REQUIRED_PARAMETERS) + +add_arguments_to_parser(SETTINGS.training_settings(), + TRAIN_PARSER, + TRAIN_REQUIRED_PARAMETERS) + +add_arguments_to_parser(SETTINGS.distribution_settings(), + TRAIN_PARSER, + [], + "Distribution settings") + +add_arguments_to_parser(SETTINGS.inference_settings(), + INFER_PARSER, + []) + +INFER_SIMILARITIES.add_argument("words", type=str, nargs="+", + help=" A list of words of which the similarities to each other should be computed.") + +# Export uses the same parameters as settigns, because both build an InferenceModel (and therefore need exactly the same settings). +add_arguments_to_parser(SETTINGS.inference_settings(), + EXPORT_PARSER, + []) + +EXPORT_PARSER.add_argument("-outdir", type=str, default="./export", + help="The directory to store the exports in. Default ./export") + + +def spawn_progress_bar(): + """ This function will spawn a new process using multiprocessing module. + + :return: A child process. + """ + p = Process(target=show_prog, args=(input.QUEUE, )) + p.daemon = True + return p + + +def show_prog(q): + """ Show progressbar, converges against the next max progress_bar.n and + finishes only when the function "write_batches_to_file" ends. + + :param q: Process which handles the progressbar. + """ + proggress_bar = tqdm(total=100, desc="Segmen./Cleaning", + bar_format='{desc}:{percentage:3.0f}%|{bar}[{elapsed}]') + n = 40 + j = 1 + while True: + try: + finished_function = q.get(timeout=1) + if finished_function == "_process_text": + proggress_bar.n = 66 + n, j = 10, 1 + proggress_bar.desc = "Writing Batches" + elif finished_function == "write_batches_to_file": + proggress_bar.n = 100 + proggress_bar.close() + return 0 + except (TimeoutError, Empty): + if n <= 0: + j *= 10 + n = j + proggress_bar.update(1/j) + n -= 1 + continue + + +def cli_main(): + """ Program entry point. """ + flags, unknown = PARSER.parse_known_args() + if unknown: + print( + "Unknown flag '{}'. 
Run --help for a list of all possible " + "flags".format(unknown[0])) + sys.exit(1) + # copy specified arguments over to the SETTINGS object + for k, v in vars(flags).items(): + SETTINGS.__dict__[k] = v + + if flags.command == "preprocess": + try: + SETTINGS.validate_preprocess() + except Exception as e: + print(": ".join(["ERROR", e.__str__()])) + sys.exit(1) + else: + p = spawn_progress_bar() + p.start() + ipp = input.InputProcessor(SETTINGS) + ipp.preprocess() + try: + input.write_batches_to_file(ipp.batches(), + SETTINGS.batches_file, + SETTINGS.num_batch_files) + except Warning as w: + # write_batches_to_file will raise a warning if there is not enough input-data + print(w) + sys.exit(1) + p.join() + elif flags.command == "train": + try: + SETTINGS.validate_train() + except Exception as e: + print(": ".join(["ERROR", e.__str__()])) + sys.exit(1) + else: + training.train(SETTINGS) + elif flags.command == "infer" and flags.subcommand == "similarities": + inference.compute_similarities(flags.words, SETTINGS) + elif flags.command == "export" and flags.subcommand == "embeddings": + export.export_embeddings(SETTINGS, flags.outdir) + else: + PARSER.print_help() + + +if __name__ == "__main__": + try: + cli_main() + except KeyboardInterrupt as e: + + # Kill all subprocess + import psutil + current_process = psutil.Process() + children = current_process.children(recursive=True) + for child in children: + child.kill() + + print("Program interrupted!") diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/customize_dtypes.cfg b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/customize_dtypes.cfg new file mode 100644 index 000000000..adfbda71c --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/customize_dtypes.cfg @@ -0,0 +1,3 @@ +OpType::MatMulV2:InputDtype:float16,float16,float32,OutputDtype:float32 +OpType::BatchMatMul:InputDtype:float16,float16,OutputDtype:float32 +OpType::BatchMatMulV2:InputDtype:float16,float16,OutputDtype:float32 diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/export.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/export.py new file mode 100644 index 000000000..0a5f30115 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/export.py @@ -0,0 +1,49 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" This module handles the expporting of trained models""" +import os.path +import tensorflow as tf +import model + + +def export_embeddings(settings, outputdir): + """ Builds an model using the given settings, loads the last checkpoint and saves only the embedding-variable to a new checkpoint inside outputdir, + leaving out all the other weights. The new checkpoint is much smaller then the original. + This new Checkpoint can be used for inference but not to continue training. + + :param ftodtf.settings.FasttextSettings settings: The settings for the model + :param str outputdir: The directory to store the new checkpoint to. + """ + m = model.InferenceModel(settings) + sess = tf.Session(graph=m.graph) + m.load(settings.log_dir, sess) + with m.graph.as_default(): + exporter = tf.train.Saver( + save_relative_paths=True, var_list=m.embeddings, filename="embeddings") + exporter.save(sess, os.path.join(outputdir, "embeddings")) diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/inference.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/inference.py new file mode 100644 index 000000000..31ed74a50 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/inference.py @@ -0,0 +1,86 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" This module contains functions to use trained word-embeddings to do usefull things + Currently the only implemented thing is to compute the similarities between words. 
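+    Similarities are cosine similarities: the word-vectors are L2-normalized before the dot-product in compute_word_similarities().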
+""" +import tensorflow as tf + +import model +import input as inp + + +def compute_similarities(words, settings): + """ Use trained embeddigs to compute the similarity between given input-words + + :param list(str) words: A list of words to compare to each other + :param ftodtf.settings.FastTextSettings settings: The settings for the fasttext-model + """ + m = model.InferenceModel(settings) + sess = tf.Session(graph=m.graph) + m.load(settings.log_dir, sess) + ngrammatrix = inp.words_to_ngramhashes(words, settings.num_buckets) + sims = sess.run([m.similarities], feed_dict={ + m.words_to_compare: ngrammatrix + })[0] + print_similarity(sims, words) + + +def print_similarity(similarity, words): + """ Print similarity between given words + :param similarity: A matrix of format len(words)xlen(words) containing the similarity between words + :param list(str) words: Words to print the similarity for + """ + for i, _ in enumerate(words): + for j, _ in enumerate(words): + print("Similarity between {} and {}: {:.2f}".format( + words[i], words[j], similarity[i][j])) + + +class PrintSimilarityHook(tf.train.StepCounterHook): + """ Implements a Hook that computes and printes the similarity between given words every x-steps. + To be used with tf.train.MonitoredTrainingSession + """ + + def __init__(self, every_n_steps, similarityop, words): + self.similarityop = similarityop + self.every_n_steps = every_n_steps + self.stepcounter = 0 + self.words = words + super().__init__(self) + + def before_run(self, run_context): + self.stepcounter += 1 + if self.stepcounter % self.every_n_steps == 0: + self.stepcounter = 0 + return tf.train.SessionRunArgs([self.similarityop]) + + def after_run(self, run_context, run_values): + results = run_values.results + if results: + print_similarity(results[0], self.words) diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/input.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/input.py new file mode 100644 index 000000000..f48595585 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/input.py @@ -0,0 +1,443 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""This module handles all the input-relatet tasks like loading, pre-processing +and batching""" +import os +import re +import random +import collections +import multiprocessing as mp + +import fnvhash +import numpy as np +import tensorflow as tf +import nltk +from nltk import ngrams +from nltk.tokenize import sent_tokenize, word_tokenize + +longest = 46 + +try: + nltk.data.find('tokenizers/punkt') +except LookupError: + nltk.download('punkt') + +# function names will be put there to show the progress of the preprocessing +QUEUE = mp.Manager().Queue() + + +def generate_ngram_per_word(word, ngram_window=2): + """ + Generates ngram strings of the specified size for a given word. + Before processing beginning and end of the word will be marked with "*". + The ngrams will also include the full word (including the added *s). + This is the same process as described in the fasttext paper. + + :param str word: The token string which represents a word. + :param int ngram_window: The size of the ngrams + :returns: A generator which yields ngrams. + """ + word = "*"+word+"*" + ngs = ngrams(word, ngram_window) + ngstrings = ["".join(x) for x in ngs] + ngstrings.append(word) + return ngstrings + + +def pad_to_length(li, length, pad=""): + """ Pads a given list to a given length with a given padding-element + + :param list() li: The list to be padded + :param int length: The length to pad the list to + :param object pad: The element to add to the list until the desired length is reached + """ + li += [pad]*(length-len(li)) + return li + + +def hash_string_list(strings, buckets, offset=0): + """ Hashes each element in a list of strings using the FNVa1 algorithm. + + :param list(str) strings: A list of strings to hash. + :param int buckets: How many different hash-values to produce maximally. (all Hashes are mod buckets) + :param int offset: The smallest possible hash value. Can be used to make hashvalues start at an other number then 0 + """ + return [(fnvhash.fnv1a_64(x.encode('UTF-8')) % (buckets))+offset for x in strings] + + +def inform_progressbar(func): + """Decorator used to put the function names into the QUEUE for showing the + progress in the progressbar + :param func: The function which should be decorated.""" + def wrapper_function(*args, **kwargs): + func(*args, **kwargs) + QUEUE.put(func.__name__) + return wrapper_function + + +@inform_progressbar +def write_batches_to_file(batchgenerator, filename, num_batch_files): + """ Writes the batches obtained from batchgenerator to files. + + :param batchgenerator: A generator yielding training-batches + :param str filename: The full path of the file into which the batches should be written + :param int num_batch_files: The number of files. 
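+    If num_batch_files is greater than one, the batches are spread round-robin over the files: the writer switches to the next file after every 1000 batches.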
+ :raises Warning: If no batch could be generated because of a lack of inpput-data + """ + + writers = [] + if num_batch_files == 1: + writers.append(tf.python_io.TFRecordWriter(filename)) + else: + for k in range(0, num_batch_files): + writers.append(tf.python_io.TFRecordWriter( + 'batches_'+str(k)+'.tfrecord')) + + writer_index = 0 + batch_counter = 0 + for batch in batchgenerator: + flattened = [] + batch_counter += 1 + for x in batch[0]: + for y in x: + flattened.append(y) + + features = { + "inputs": tf.train.Feature(int64_list=tf.train.Int64List(value=flattened)), + "labels": tf.train.Feature(int64_list=tf.train.Int64List(value=batch[1])) + } + example = tf.train.Example( + features=tf.train.Features(feature=features)) + writers[writer_index].write(example.SerializeToString()) + + # Every 1000 batches change the file + if batch_counter % 1000 == 0: + writer_index = (writer_index+1) % num_batch_files + + for writer in writers: + writer.flush() + writer.close() + + if batch_counter == 0: + raise Warning( + "No batches could be generated. Please make sure you provided enough input-data to generate batches of the desired size.") + + +def words_to_ngramhashes(words, num_buckets): + """ Converts a list of words into a list of padded lists of ngrams-hashes. + The resulting matrix can then be used to compute the word-verctors for the original words + :param list(str) words: The words to convert + :param int num_buckets: The number of hash-buckets to use when hashing the ngrams + :returns: list(list(int)) + """ + + ngs = [generate_ngram_per_word(x) for x in words] + maxlen = 0 + for ng in ngs: + maxlen = max(maxlen, len(ng)) + for i, _ in enumerate(ngs): + ngs[i] = hash_string_list(ngs[i], num_buckets-1, 1) + ngs[i] = pad_to_length(ngs[i], maxlen, pad=0) + return ngs + + +def find_and_clean_sentences(corpus, language): + """ + Uses NLTK to parse the corpus and find the sentences. + :param str corpus: The corpus where the sentences should be found. + :return: A list with sentences. + """ + sentence_tokens = sent_tokenize(corpus, language=language) + for j, sentence in enumerate(sentence_tokens): + clean_sentence = "" + for word in word_tokenize(sentence): + clean_word = "".join(letter for letter in word + if letter.isalpha()) + if not clean_word.isspace(): + clean_sentence = " ".join(filter(None, [clean_sentence, + clean_word])) + + clean_sentence = re.sub("\s+", " ", clean_sentence) + sentence_tokens[j] = clean_sentence.lower() + return sentence_tokens + + +def parse_files_sequential(file_folder, language, sentences): + """ + Parse the raw data files from the training folder sequentially. + :param file_folder: The folder which contains the raw text files. + :param language: The language of the text files. + :param sentences: A reference to the sentence list. + """ + for file in os.listdir(file_folder): + if os.path.isfile(file_folder + '/' + file): + with open(file_folder + '/' + file) as f: + result_sents = find_and_clean_sentences(f.read(), language) + sentences.extend(result_sents) + + +def find_and_clean_sentences_helper(args): + """ + Auxiliary function to unwrap the arguments for multiprocessing. + :param args: Takes the corpus and specified language of the corpus. + :return: The result of the find_and_clean_sentence function. 
+ """ + return find_and_clean_sentences(*args) + + +class InputProcessor: + """Handles the creation of training-examble-batches from the raw training-text""" + + def __init__(self, settings): + """ + Constructor of InputProcessor + + :param settings: An object encapsulating all the settings for the fasttext-model + :type settings: ftodtf.settings.FasttextSettings + """ + self.settings = settings + # Will be populated by preprocess + self.wordcount = None + self.dict = None + self.drop_p_word = None + self.sentences = [] + + def preprocess(self): + """ + Do the needed proprocessing of the dataset. Count word frequencies, + create a mapping word->int + """ + # TODO: Process a folder files which where separated by user + self._process_text() + self.wordcount = collections.Counter(self._words_in_corpus()) + + # number of all words in the corpus + total_sum = sum(self.wordcount.values()) + # drop probability for a word in the corpus + self.drop_p_word = {word: 1-np.sqrt(self.settings.rejection_threshold / + (self.wordcount[word] / total_sum)) + for word in self.wordcount} + idx = 0 + self.dict = {} + # Assign a number to every word we have. 0 = the most common word + for word, _ in self.wordcount.most_common(): + # We only want vocab_size words in or dictionary. Skip the remaining uncommon words + if idx == self.settings.vocabulary_size: + break + self.dict[word] = idx + idx += 1 + + @inform_progressbar + def _process_text(self): + """ + First check if the user provided a folder with the raw text files. + If No, than check if the corpus file should be processed with multiple cores. + This will happen if it is large enough (>= 100MB). Than cut the corpus + into pieces and use multiprocessing to process the pieces simultaneously. + It could cut some words into meaningless chunks but if the corpus is + large enough than these little changes should not have a big impact on + the word vectors. + """ + + # Check if the user provided a folder with the raw text files + if os.path.isdir(self.settings.corpus_path): + parse_files_sequential(self.settings.corpus_path, + self.settings.language, + self.sentences) + # Parse the single file + else: + with open(self.settings.corpus_path) as f: + corpus = f.read() + if os.path.getsize(self.settings.corpus_path) / (1024 * 1024) < 100: + self.sentences = find_and_clean_sentences( + corpus, self.settings.language) + else: + size_per_cpu = len(corpus) // mp.cpu_count() + pool = mp.Pool(processes=mp.cpu_count() - 2) + corpus_chunks = [] + + for i in range(0, mp.cpu_count()): + corpus_chunks.append( + corpus[i * size_per_cpu:(i + 1) * size_per_cpu]) + + job_args = [(e, self.settings.language) + for e in corpus_chunks] + result = pool.map( + find_and_clean_sentences_helper, job_args) + for sentence_bundle in result: + for sentence in sentence_bundle: + self.sentences.append(sentence) + + def _words_in_corpus(self): + """ + Returns a generator over all words in the corpus written lowercase and + removes punctuation. + """ + for sentence in self.sentences: + for word in sentence.split(" "): + yield word + + def _subsample(self, gen): + """This generators checks if the target word or context word + should be ignored, based on the word-frequency of the target-word. 
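+        The drop probability for each word is the one computed in preprocess(): p(w) = 1 - sqrt(rejection_threshold / f(w)), where f(w) is the relative frequency of w in the corpus.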
+ :param gen: A generator yielding (string,string)-tuples + """ + for target_word, context_word in gen: + if random.random() < self.drop_p_word[target_word] or \ + random.random() < self.drop_p_word[context_word]: + # found frequent word, so ignore it + continue + else: + yield (target_word, context_word) + + def string_samples(self): + """ Returns a generator for samples (targetword->contextword) + :returns: A generator yielding 2-tuple consisting of a target-word and a context word. + """ + for words in self.sentences: + words = words.split() + idx = 0 + for word in words: + window = random.randint(1, self.settings.skip_window) + contextoffsets = [ + x for x in range(-window, window+1) if x != 0] + for contextoffset in contextoffsets: + contextindex = idx+contextoffset + # if selected index-offset reaches outside of the list, discard the contextword + if idx+contextoffset < 0 or idx+contextoffset >= len(words): + continue + yield (word, words[contextindex]) + idx += 1 + + def _lookup_label(self, gen): + """ Maps the second words in the input-tuple to numbers. + Conversion is done via lookup in self.dict + + :param gen: A generator yielding 2-tuples of strings + :returns: A generator yielding 2-tuples (string,int) + """ + for e in gen: + try: + yield (e[0], self.dict[e[1]]) + except KeyError: + pass + + def _hash_ngrams(self, gen): + """ Hashes the list of ngrams for each received ([str],?)-tuple and yields ([int],?) instead + + :param gen: The generator to receive the input-tuples from + """ + for targetngrams, contextword in gen: + # Hash the ngrams but reserve hashvalue 0 for the padding + hashed = hash_string_list( + targetngrams, self.settings.num_buckets-1, 1) + yield (hashed, contextword) + + @staticmethod + def _repeat(times, generator_func): + """ Repeat a given generator forever by recreating it whenever a StopIteration Exception occurs + + .param int times: How many times to repeat the input-generator after it threw it's first StopIteration. Special-cases: 0 = Never, -1=forever. + :param generator_func: A function without arguments returning a generator + :returns: A new inifinite generator + """ + iterationnr = 0 + g = generator_func() + while True: + try: + yield g.__next__() + except StopIteration: + if times < 0 or iterationnr < times: + iterationnr += 1 + g = generator_func() + else: + # raise StopIteration + return + + def _batch(self, samples): + """ Pack self.batch_size of training samples into a batch + The output is a tuple of two lists, rather then a list of tuples, because this way we can treat + the two lists as input-tensor and label-tensor. + The first List is a list of lists of ints. + The second list is al list of ints. + + :param samples: A generator yielding 2-tuples + :returns: A generator yielding 2-tuples of self.batch_size long lists. The second lists consists of 1-element-ling lists. + """ + while True: + inputs = [] + labels = [] + for _ in range(0, self.settings.batch_size): + sample = None + try: + sample = samples.__next__() + except StopIteration: + return + inputs.append(sample[0]) + labels.append(sample[1]) + yield inputs, labels + + def _ngrammize(self, gen): + """ Transforms the first entry (a string) of the tuples received from the generator gen into a list of ngrams + + :param gen: A generator yielding tuples (str,?) + :returns: A generator yielding tuples (list(str),?) 
+ """ + for entry in gen: + yield (generate_ngram_per_word(entry[0], self.settings.ngram_size), entry[1]) + + @staticmethod + def _equalize_batch(padding, gen): + """ Makes sure all n-gram arrays of a batch have the same length. + + :param padding: The string/number/object that should be used to pad the entries of a batch + :param gen: The generator to retrieve the batches from + :returns: A generator yielding batches with equal-length ngram-lists + """ + for batch in gen: + longest = 46 # npu-cann net set longest the same + for ngs in batch[0]: + longest = max(longest, len(ngs)) + for i in range(len(batch[0])): + batch[0][i] = pad_to_length(batch[0][i], longest, padding) + yield batch + + def batches(self, passes=1): + """ Returns a generator the will yield an infinite amout of training-batches ready to feed into the model + + :param int repetitions: How many passes over the input data should be done. Default: 1. 0 will repeat the input forever. + """ + return self._equalize_batch(0, + self._batch( + self._hash_ngrams( + self._ngrammize( + self._lookup_label( + self._subsample( + self._repeat(passes-1, + self.string_samples))))))) diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/model.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/model.py new file mode 100644 index 000000000..c2dedab86 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/model.py @@ -0,0 +1,265 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module handles the building of the tf execution graph""" +import math +import tensorflow as tf +import input as inp + + +def parse_batch_func(batch_size): + """ Returns a function that can parse a batch from a tfrecord-entry + + :param int batch_size: How many samples are in a batch + """ + def parse(batch): + """ Parses a tfrecord-entry into a usable batch. 
To be used with tf.data.Dataset.map + + :params batch: The tfrecord-entry to parse + :returns: A batch ready to feed into the model + """ + features = { + "inputs": tf.VarLenFeature(tf.int64), + "labels": tf.FixedLenFeature([batch_size], tf.int64) + } + parsed = tf.parse_single_example(batch, features=features) + inputs = tf.sparse_tensor_to_dense( + parsed['inputs'], default_value=0) + # inputs = tf.reshape(inputs, [batch_size, -1]) # npu-cann need set -1 to fixed + inputs = tf.reshape(inputs, [batch_size, 46]) + labels = tf.reshape(parsed["labels"], [batch_size, 1]) + return inputs, labels + return parse + + +class TrainingModel(): + """Builds and represents the tensorflow computation graph for the training of the embeddings. Exports all important operations via fields""" + + def __init__(self, settings, cluster=None): + """ + Constuctor for Model + + :param settings: An object encapsulating all the settings for the fasttext-model + :param cluster: A tf.train.ClusterSpec object describint the tf-cluster. Needed for variable and ops-placement + :type settings: ftodtf.settings.FasttextSettings + """ + self.graph = tf.Graph() + + with self.graph.as_default(): + device = None + if cluster and settings.ps_list: # if running distributed use replica_device_setter + device = tf.train.replica_device_setter( + worker_device="/job:worker/task:%d" % settings.index, cluster=cluster) + # If running distributed pin all ops and assign variables to ps-servers. Else use auto-assignment + # with tf.device(device): # npu-cann need set device to cpu + with tf.device("/cpu:0"): + + inputpipe = tf.data.TFRecordDataset( + [settings.batches_file]).repeat() + batches = inputpipe.map(parse_batch_func( + settings.batch_size), num_parallel_calls=4) + batches = batches.shuffle(1000) + batches = batches.prefetch(1) + + iterator = batches.make_initializable_iterator() + self._dataset_init = iterator.initializer + batch = iterator.get_next() + + # Input data. + with tf.name_scope('inputs'): + train_inputs = batch[0] + train_labels = batch[1] + + # Create all Weights + self.embeddings = create_embedding_weights(settings) + + nce_weights = tf.create_partitioned_variables( + shape=[settings.vocabulary_size, settings.embedding_size], + slicing=[len(settings.ps_list) + if settings.ps_list else 1, + 1], + initializer=tf.truncated_normal( + [settings.vocabulary_size, settings.embedding_size], stddev=1.0 / math.sqrt(settings.embedding_size)), + dtype=tf.float32, + trainable=True, + name="weights" + ) + + nce_biases = tf.Variable( + name="biases", + initial_value=tf.zeros([settings.vocabulary_size])) + + target_vectors = ngrams_to_vectors( + train_inputs, self.embeddings) + + with tf.name_scope('loss'): + self.loss = tf.reduce_mean( + tf.nn.nce_loss( + weights=nce_weights, + biases=nce_biases, + labels=train_labels, + inputs=target_vectors, + num_sampled=settings.num_sampled, + num_classes=settings.vocabulary_size)) + + # Add the loss value as a scalar to summary. + tf.summary.scalar('loss', self.loss) + + # Keep track of how many iterations we have already done + self.step_nr = tf.train.create_global_step(self.graph) + + # Learnrate starts at settings.learnrates and will reach ~0 when the training is finished. 
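+                # i.e. lr(step) = learnrate * (1 - step / steps): the learning rate decays linearly from its initial value towards 0 at the final step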
+ decaying_learn_rate = settings.learnrate * \ + (1 - (self.step_nr/settings.steps)) + + # Add the learnrate to the summary + tf.summary.scalar('learnrate', decaying_learn_rate) + + with tf.name_scope('optimizer'): + self.optimizer = tf.train.GradientDescentOptimizer( + decaying_learn_rate).minimize(self.loss, global_step=self.step_nr) + + # Merge all summaries. + self.merged = tf.summary.merge_all() + + # Create a saver to save the trained variables once training is over + self._saver = tf.train.Saver(save_relative_paths=True) + + if settings.validation_words_list: + ngrams = inp.words_to_ngramhashes( + settings.validation_words_list, settings.num_buckets) + ngramstensor = tf.constant(ngrams, dtype=tf.int64, shape=[ + len(ngrams), len(ngrams[0])]) + self.validation = compute_word_similarities( + ngramstensor, self.embeddings) + + def get_scaffold(self): + """ Returns a tf.train.Scaffold object describing this graph + + :returns: tf.train.Scaffold + """ + return tf.train.Scaffold( + init_op=tf.global_variables_initializer(), + local_init_op=tf.group(tf.local_variables_initializer( + ), self._dataset_init, tf.tables_initializer()), + saver=self._saver, + summary_op=self.merged + ) + + +def ngrams_to_vectors(ngrams, embeddings): + """ Create a tensorflow operation converting a batch consisting of lists of ngrams for a word to a list of vectors. One vector for each word + + :param ngrams: A batch of lists of ngrams + :param embeddings: The embeddings to use as tensorflow variable. Can also be a list of variables. + :returns: a batch of vectors + """ + + first_part_of_embeddings = embeddings + if isinstance(embeddings, list): + first_part_of_embeddings = embeddings[0] + + # Set the first enty in embeddings (or of partitioned, the first entry of the first partition) (belonging to the padding-ngram) to <0,0,...> + mask_padding_zero_op = tf.scatter_update( + first_part_of_embeddings, 0, tf.zeros([first_part_of_embeddings.shape[1]], dtype=tf.float32)) + + # Lookup the vector for each hashed value. The hash-value 0 (the value for the ngram "") will always et a 0-vector + with tf.control_dependencies([mask_padding_zero_op]): + looked_up = tf.nn.embedding_lookup(embeddings, ngrams) + # sum all ngram-vectors to get a word-vector + summed = tf.reduce_sum(looked_up, 1) + return summed + + +def compute_word_similarities(ngramhashmatrix, embeddings): + """Returns a tensorflow-operation that computes the similarities between all input-words using the given embeddings + + :param tf.Tensor ngramhashmatrix: A list of lists of ngram-hashes, each list represents the ngrams for one word. (In principle a trainings-batch without labels) + :param tf.Tensor embeddings: The embeddings to use for converting words to vectors. 
(Can be a list of tensors) + :param int num_buckets: The number of hash-buckets used when hashing ngrams + """ + + vectors = ngrams_to_vectors(ngramhashmatrix, embeddings) + + # normalize word-vectors before computing dot-product (so the results stay between -1 and 1) + norm = tf.sqrt(tf.reduce_sum( + tf.square(vectors), 1, keep_dims=True)) + normalized_embeddings = vectors / norm + + return tf.matmul(normalized_embeddings, normalized_embeddings, transpose_b=True) + + +def create_embedding_weights(settings): + """ Creates a (partitioned) tensorflow variable for the word-embeddings + Exists as seperate function to minimize code-duplication between training and inference-models + """ + + return tf.create_partitioned_variables( + shape=[settings.num_buckets, settings.embedding_size], + slicing=[len(settings.ps_list) + if settings.ps_list else 1, + 1], + # initializer= tf.random_uniform( + # [settings.num_buckets, settings.embedding_size], 0, 1.0), + initializer=tf.contrib.layers.xavier_initializer(), + dtype=tf.float32, + trainable=True, + name="embeddings" + ) + + +class InferenceModel(): + """Builds and represents the tensorflow computation graph for using the trained embeddings. Exports all important operations via fields. + An existing checkpoint must be loaded via load() before this model can be used to compute anything. + """ + + def __init__(self, settings): + """ + Constuctor for Model + + :param settings: An object encapsulating all the settings for the fasttext-model + :type settings: ftodtf.settings.FasttextSettings + """ + self.graph = tf.Graph() + + with self.graph.as_default(): + self.words_to_compare = tf.placeholder(tf.int64) + self.embeddings = create_embedding_weights(settings) + self.similarities = compute_word_similarities( + self.words_to_compare, self.embeddings) + self.saver = tf.train.Saver() + + def load(self, logdir, session): + """ Loades pre-trained embeddings from the filesystem + + :param str logdir: The path of the folder where the checkpoints created by the training were saved + :param tf.Session session: The session to restore the variables into + """ + latest = tf.train.latest_checkpoint(logdir) + print("Loading checkpoint:", latest) + self.saver.restore(session, latest) diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/requirements.txt b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/requirements.txt new file mode 100644 index 000000000..8c82c263e --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/requirements.txt @@ -0,0 +1,6 @@ +fnvhash==0.1.0 +nltk==3.7 +numpy==1.21.6 +psutil==5.9.4 +tensorflow==1.15.0 +tqdm==4.64.0 diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/settings.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/settings.py new file mode 100644 index 000000000..4eb0695ee --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/settings.py @@ -0,0 +1,302 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" This module contains the FasttextSettings class """ +import os +import re + +CURRENNT_PATH = os.getcwd() +DEFAULT_LOGPATH = os.path.join(CURRENNT_PATH, "log") +DEFAULT_BATCHES_FILE = os.path.join(CURRENNT_PATH, "batches.tfrecord") + +# pylint: disable=R0902,R0903 + + +class FasttextSettings: + """ This class contains all the settings for the fasttext-training and also handles things like validation. Use the attributes/variables of this class to set hyperparameters for the model. + + :ivar str corpus_path: Path to the file containing text for training the model. + :ivar str batches_file: The Filename for the file containing the training-batches. The file is written by the preprocess command and read by the train command. + :ivar str log_dir: Directory to write the generated files (e.g. the computed word-vectors) to and read/write checkoints from. + :ivar int steps: How many training steps to perform. + :ivar int vocabulary_size: How many words the vocabulary will have. Only the vocabulary_size most frequent words will be processed. + :ivar int batch_size: How many trainings-samples to process per batch. + :ivar int embedding_size: Dimension of the computed embedding vectors. + :ivar int skip_window: How many words to consider left and right of the target-word maximally. The actual window is randomly sampled for each word between 1 and this value + :ivar int num_sampled: Number of negative examples to sample when computing the nce_loss. + :ivar int ngram_size: How large the ngrams (in which the target words are split) should be. + :ivar int num_buckets: How many hash-buckets to use when hashing the ngrams to numbers. + :ivar str validation_words: A string of comma-seperated words. The similarity of these words to each other will be regularily computed and printed to indicade the progress of the training. + :ivar boolean profile: If set to True tensorflow will profile the graph-execution and writer results to ./profile.json. + :ivar float learnrate: The starting learnrate for the training. The actual learnrate will lineraily decrease to beyth 0 when the specified amount of training-steps is reached. + :ivar float rejection_threshold: In order to subsample the most frequent words. + :ivar string job: The role of this node in a distributed setup. Can be worker' or 'ps'. + :ivar str workers: A comma seperated list of host:port combinations representing the workers in the distributed setup. + :ivar str ps: A comma seperated list of host:port combinations representing the parameter servers in the distributed setup. If empty a non-distributed setup is assumed. + :ivar int num_batch_files: Number of batch files which should be created. 
+ :ivar int index: The of the node itself in the list of --workers (or --ps, depending on --job). + :ivar str language: The language of the corpus. + """ + + def __init__(self): + self.corpus_path = "" + self.batches_file = DEFAULT_BATCHES_FILE + self.log_dir = DEFAULT_LOGPATH + self.steps = 500001 + self.vocabulary_size = 50000 + self.batch_size = 128 + self.embedding_size = 300 + self.skip_window = 5 + self.num_sampled = 5 + self.ngram_size = 3 + self.num_buckets = 200000 # In paper 210**6, but this would lead to OOM on small GPUs + self.validation_words = "" + self.profile = False + self.learnrate = 0.1 + self.rejection_threshold = 0.0001 + self.job = "worker" + self.index = 0 + self.workers = "localhost:7777" + self.ps = "" + self.num_batch_files = 1 + self.language = 'german' + + @staticmethod + def preprocessing_settings(): + """ + Returns the names of the settings that are used for the preprocessing + command + """ + return ["corpus_path", "batches_file", "vocabulary_size", + "batch_size", "skip_window", "ngram_size", "num_buckets", + "rejection_threshold", "profile", "num_batch_files", + "language"] + + @staticmethod + def training_settings(): + """ Returns the names of the settings that are used for the training + command """ + return ["batches_file", "log_dir", "steps", "vocabulary_size", + "batch_size", "embedding_size", "num_sampled", "num_buckets", + "validation_words", "profile", "learnrate"] + + @staticmethod + def distribution_settings(): + """ Returns the names of the settings that are used for configuren the tensoflow-cluster """ + return ["job", "index", "workers", "ps"] + + @staticmethod + def inference_settings(): + """ Returns the names of the settings that are used for the infer + command """ + return ["log_dir", "embedding_size", "num_buckets"] + + @property + def validation_words_list(self): + """ Returns the validation_words as list of strings instead of a comma + seperate string like the attribute would do + :returns: A list of strings if validation_words is set and else None + """ + if self.validation_words: + return self.validation_words.split(",") + return None + + @property + def workers_list(self): + """ Returns workers as list of strings instead of a comma + seperate string like the attribute would do + :returns: A list of strings if workers is set and else None + """ + if self.workers: + return self.workers.split(",") + return [] + + @property + def ps_list(self): + """ Returns ps as list of strings instead of a comma + seperate string like the attribute would do + :returns: A list of strings if ps is set and else None + """ + if self.ps: + return self.ps.split(",") + return [] + + def validate_preprocess(self): + """ Check if the current settings are valid for pre processing. + :raises: ValueError if the validation fails""" + try: + check_corpus_path(self.corpus_path) + check_vocabulary_size(self.vocabulary_size) + check_batch_size(self.batch_size) + check_skip_window(self.skip_window) + check_ngram_size(self.ngram_size) + check_num_buckets(self.num_buckets) + check_rejection_threshold(self.rejection_threshold) + except Exception as e: + raise e + + def validate_train(self): + """Check if the current settings are valid for training. 
+ :raises: ValueError if the validation fails """ + try: + if self.job != "ps": + check_batches_file(self.batches_file) + if self.index == 0 and self.job == "worker": + check_log_dir(self.log_dir) + check_steps(self.steps) + check_vocabulary_size(self.vocabulary_size) + check_batch_size(self.batch_size) + check_embedding_size(self.embedding_size) + check_num_sampled(self.num_sampled) + check_num_buckets(self.num_buckets) + check_learn_rate(self.learnrate) + check_nodelist(self.workers) + check_nodelist(self.ps, allow_empty=True) + check_job(self.job) + check_index(self.job, self.workers, self.ps, self.index) + except Exception as e: + raise e + + def attribute_docstring(self, attribute, include_defaults=True): + """ Given the name of an attribute of this class, this function will return the docstring for the attribute. + + :param str attribute: The name of the attribute + :returns: The docstring for the attribute + """ + match = re.search("^.*:ivar \\w* "+attribute + + ": (.*)$", self.__doc__, re.MULTILINE) + if not match: + raise RuntimeError("No docstring found for: "+attribute) + docstring = match.group(1) + if include_defaults: + docstring += " Default: "+str(vars(self)[attribute]) + + return docstring + + +def check_index(job, workers, ps, index): + if job == "worker": + li = workers + else: + li = ps + if index < 0 or index >= len(li.split(",")): + raise ValueError( + "--index must be between 0 and {}".format(len(li.split(",")))) + + +def check_job(job): + if job != "worker" and job != "ps": + raise ValueError("--job can only be 'worker' or 'ps'") + + +def check_nodelist(noli, allow_empty=False): + """ Checks if the given argument is a comma seperated list of host:port strings. + + :raises: ValueError if it is not + """ + if allow_empty and noli == "": + return + hostportregex = re.compile("^[0-9a-zA-Z.\-]+:[0-9]+$") + noli = noli.split(",") + for e in noli: + if not hostportregex.match(e): + raise ValueError( + "{} is not a valid host:port combination".format(e)) + + +def check_corpus_path(corpus_path): + if not os.path.isfile(corpus_path) and not os.path.isdir(corpus_path): + raise FileNotFoundError("The specified corpus was not found!") + + +def check_vocabulary_size(vocabulary_size): + if vocabulary_size <= 0: + raise ValueError("Vocabulary size must be bigger than zero.") + elif vocabulary_size > 10251098: # Number of English words --> biggest vocab + raise ValueError("There exist no language with such a big vocabulary.") + + +def check_rejection_threshold(rejection_threshold): + if rejection_threshold <= 0 or rejection_threshold > 1: + raise ValueError("Rejection threshold must be between 0 and 1.") + + +def check_batch_size(batch_size): + if batch_size < 1: + # Practical recommendations for gradient-based training of deep architectures + # https://arxiv.org/abs/1206.5533 + raise ValueError("The batch-size must be >= 1") + + +def check_skip_window(skip_window): + if skip_window < 1: + raise ValueError("The window size must be >= 1") + + +def check_ngram_size(ngram_size): + if ngram_size < 3 or ngram_size > 6: + raise ValueError("The n-gram size must be between >= 3") + + +def check_num_buckets(number_buckets): + if number_buckets < 1: + raise ValueError("Number of Buckets must be bigger than zero.") + + +def check_batches_file(batches_file): + if not os.path.isfile(batches_file): + raise FileNotFoundError( + "The specified batches-file could not be found.") + + +def check_log_dir(log_dir): + if not os.path.exists(log_dir): + raise FileNotFoundError("Cannot find the log 
folder!") + + +def check_steps(steps): + if steps < 1: + raise ValueError("Number of steps must be bigger than 0.") + + +def check_embedding_size(embedding_size): + if embedding_size <= 0: + raise ValueError("The embedding size must be >= 1.") + + +def check_num_sampled(num_sampled): + if num_sampled <= 0: + raise ValueError("The number of negative samples should be" + ">= 1") + + +def check_learn_rate(learnrate): + if learnrate < 0.01 or learnrate > 1.0: + # https://fasttext.cc/docs/en/supervised-tutorial.html + raise ValueError("The learning rate should be between 0.01" + " and 1.0.") diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_full_1p.sh b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_full_1p.sh new file mode 100644 index 000000000..3d2e80fc1 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_full_1p.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +########################################################## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` + +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` + +export RANK_SIZE=1 +export RANK_ID=0 +export JOB_ID=10087 + +# 路径参数初始化 +data_path="" +output_path="" + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message + " + exit 1 +fi + +# 参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --output_path* ]];then + output_path=`echo ${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" +fi + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + else + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' + fi +} + +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} + +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 
您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +train_steps=100001 +batch_size=128 + +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +python3.7 ./cli.py preprocess --corpus_path ${data_path} +python3.7 ./cli.py train --steps=${train_steps} --batch_size=${batch_size} --log_dir=${output_path} 1>${print_log} 2>&1 + +# 性能相关数据计算 +StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` + +# 精度相关数据计算 +train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'` +# 提取所有loss打印信息 +grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt + + +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# 输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" + +# 输出训练精度 +echo "Final Train Accuracy : ${train_accuracy}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_performance_1p.sh b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_performance_1p.sh new file mode 100644 index 000000000..3af52a145 --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/test/train_performance_1p.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +########################################################## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` + +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` + +export RANK_SIZE=1 +export RANK_ID=0 +export 
JOB_ID=10087 + +# 路径参数初始化 +data_path="" +output_path="" + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message + " + exit 1 +fi + +# 参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --output_path* ]];then + output_path=`echo ${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi + +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" +fi + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + else + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' + fi +} + +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} + +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +#########第3行 至 90行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +train_steps=100001 +batch_size=128 + +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +python3.7 ./cli.py preprocess --corpus_path ${data_path} +python3.7 ./cli.py train --steps=${train_steps} --batch_size=${batch_size} --log_dir=${output_path} 1>${print_log} 2>&1 + +# 性能相关数据计算 +StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` + +# 精度相关数据计算 +train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'` +# 提取所有loss打印信息 +grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt + +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# 
输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" + +# 输出训练精度 +echo "Final Train Accuracy : ${train_accuracy}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/training.py b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/training.py new file mode 100644 index 000000000..27060332b --- /dev/null +++ b/TensorFlow/contrib/nlp/fasttext/fasttext_ftodft_for_Tensorflow/training.py @@ -0,0 +1,168 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" This module handles the training of the word-vectors""" +import os +import signal +import _thread + +# pylint: disable=E0611 +from tensorflow.python.client import timeline +import tensorflow as tf + +import model as model +import inference + +import time + +from npu_bridge.npu_init import * # npu-cann add migrate + +def _tfserver_from_settings(settings): + """ Given a settings-object creates and returns a tf-server with proper settings + + :param settings: An object encapsulating all the settings for the fasttext-model + :type settings: ftodtf.settings.FasttextSettings + :returns: A tf.train.Server configured and ready to run and the cluster-spec that was used to create the server + """ + spec = tf.train.ClusterSpec({ + "worker": settings.workers_list, + "ps": settings.ps_list + }) + + server = tf.train.Server( + spec, job_name=settings.job, task_index=settings.index) + return server, spec + + +class PrintLossHook(tf.train.StepCounterHook): + """ Implements a Hook that prints the current step and current average loss every x steps""" + + def __init__(self, every_n_steps, lossop, steptensor): + self.lossop = lossop + self.steptensor = steptensor + self.cumloss = 0 + self.every_n_steps = every_n_steps + self.stepcounter = 0 + self.start_time = 0 # npu-cann add step_time + super().__init__(self) + + def before_run(self, run_context): + self.start_time = time.time() + return tf.train.SessionRunArgs([self.lossop, self.steptensor]) + + def after_run(self, run_context, run_values): + step_time = time.time() - self.start_time # npu-cann add step_time + loss, step = run_values.results + if self.stepcounter == self.every_n_steps: + print("Step {}: Loss: {} Steptime: {}".format( + step, self.cumloss/(self.stepcounter), step_time)) # npu-cann add step_time + self.cumloss = 0 + self.stepcounter = 1 + else: + self.cumloss += loss + self.stepcounter += 1 + + +def train(settings): + """ Run the fasttext training. 
+
+ :param settings: An object encapsulating all the settings for the fasttext-model
+ :type settings: ftodtf.settings.FasttextSettings
+
+ """
+ if not os.path.exists(settings.log_dir):
+ os.makedirs(settings.log_dir)
+
+ server, cluster = _tfserver_from_settings(settings)
+
+ if settings.job == "ps":
+ _thread.start_new_thread(server.join, tuple())
+ signal.sigwait([signal.SIGINT, signal.SIGKILL])
+ print("Terminating...")
+ return
+
+ # Get the computation-graph and the associated operations
+ m = model.TrainingModel(settings, cluster)
+
+ hooks = [tf.train.StopAtStepHook(
+ last_step=settings.steps)]
+
+ chief_hooks = [PrintLossHook(2000, m.loss, m.step_nr)]
+
+ if settings.validation_words:
+ chief_hooks.append(inference.PrintSimilarityHook(10000, m.validation, settings.validation_words_list))
+
+ # npu-cann add migrate
+ config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
+ custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+ custom_op.name = "NpuOptimizer"
+ custom_op.parameter_map["customize_dtypes"].s = tf.compat.as_bytes("./customize_dtypes.cfg")
+ config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+ config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+
+ with m.graph.as_default():
+ with tf.train.MonitoredTrainingSession(
+ scaffold=m.get_scaffold(),
+ master="", # npu-cann remove server.target
+ # master=server.target,
+ is_chief=True,
+ # is_chief=(settings.index == 0),
+ checkpoint_dir=settings.log_dir,
+ hooks=hooks,
+ config=config, # npu-cann add config
+ save_checkpoint_steps=10000,
+ chief_only_hooks=chief_hooks) as session:
+
+ # Run training steps until a hook (e.g. StopAtStepHook) requests a stop.
+ while not session.should_stop():
+
+ # Define metadata variable.
+ run_metadata = tf.RunMetadata()
+ options = None
+ if settings.profile:
+ # pylint: disable=E1101
+ options = tf.RunOptions(
+ trace_level=tf.RunOptions.FULL_TRACE)
+
+ # We perform one update step by evaluating the optimizer op (including it
+ # in the list of returned values for session.run()).
+ # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
+ # Feed metadata variable to session for visualizing the graph in TensorBoard.
+ session.run(
+ [m.optimizer, m.merged, m.loss],
+ run_metadata=run_metadata,
+ options=options)
+
+ # Create the Timeline object, and write it to a json file
+ if settings.profile:
+ # pylint: disable=E1101
+ fetched_timeline = timeline.Timeline(
+ run_metadata.step_stats)
+ chrome_trace = fetched_timeline.generate_chrome_trace_format()
+ with open('profile.json', 'w') as f:
+ f.write(chrome_trace)
--
Gitee
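
For orientation, below is a minimal sketch of how the modules added by this patch fit together when driven from Python instead of cli.py. It is not part of the patch: the flat imports and the FasttextSettings class name are assumptions inferred from the ":type settings: ftodtf.settings.FasttextSettings" references in training.py, and the batches-file name is a placeholder for whatever `cli.py preprocess` actually writes.

```
# Minimal usage sketch (assumed API, not part of the patch).
import os

from settings import FasttextSettings  # assumed flat-module import and class name
import training

settings = FasttextSettings()
settings.batches_file = "batches_file0.tfrecord"  # placeholder for the preprocess output
settings.log_dir = "./log"
settings.steps = 100001
settings.batch_size = 128
settings.validation_words = "i,you,one,two"  # comma-separated, as validation_words_list expects

os.makedirs(settings.log_dir, exist_ok=True)  # check_log_dir requires an existing directory

# Raises ValueError / FileNotFoundError via the check_* helpers in settings.py.
settings.validate_train()

# Runs the MonitoredTrainingSession loop from training.py and writes checkpoints to log_dir.
training.train(settings)
```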