diff --git a/PyTorch/contrib/nlp/TransformerXL/Readme.md b/PyTorch/contrib/nlp/TransformerXL/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..42d50d759c327e34965a9f540a3362e815f96f98
--- /dev/null
+++ b/PyTorch/contrib/nlp/TransformerXL/Readme.md
@@ -0,0 +1,62 @@
+# Transformer-XL
+
+This implements training of Transformer-XL on the enwik8 dataset, mainly adapted from the original [Transformer-XL implementation](https://github.com/kimiyoung/transformer-xl/tree/master/pytorch).
+
+## Transformer-XL Details
+
+Currently, Ascend-PyTorch is still inefficient for contiguous operations. Therefore, Transformer-XL is re-implemented here using semantics such as custom OPs.
+
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+
+## Data Preparation
+- `bash getdata.sh`
+
+## Training and Evaluation
+
+To train a model, run `bash test/train_full_8p.sh` with the desired model architecture and the path to the enwik8 dataset:
+
+
+```bash
+# env
+cd transformer-xl
+dos2unix ./test/*.sh
+
+# 1p train perf
+bash test/train_performance_1p.sh
+
+# 8p train perf
+bash test/train_performance_8p.sh
+
+# 8p train full
+bash test/train_full_8p.sh
+
+# 1p eval
+bash test/eval_1p.sh
+
+```
+
+- Parameter description:
+```bash
+#--data          // dataset path; change it to the location of your dataset
+#--restart_dir   // path of the model checkpoint to load; change it to your checkpoint file
+#--addr          // master host address
+#--max_step      // maximum number of training steps
+#--batch-size    // training batch size
+#--lr            // initial learning rate, default: 0.00025
+#--device-list   // devices used for multi-card training, e.g. 8 cards: '0,1,2,3,4,5,6,7'
+#--amp           // whether to use automatic mixed precision
+#--loss-scale    // loss scale value
+#--opt-level     // mixed-precision optimization level
+```
+
+
+## Transformer-XL training result
+
+| bpc  | FPS   | NPU nums | Epochs | AMP Type |
+| :--: | :---: | :------: | :----: | :------: |
+| -    | 8300  | 1        | 1      | O2       |
+| 1.09 | 44500 | 8        | 50     | O2       |
\ No newline at end of file
diff --git a/PyTorch/contrib/nlp/TransformerXL/adaptive_softmax.py b/PyTorch/contrib/nlp/TransformerXL/adaptive_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a65fb1c72bd60b8c83e1c60b3e284a9ef8f34c7
--- /dev/null
+++ b/PyTorch/contrib/nlp/TransformerXL/adaptive_softmax.py
@@ -0,0 +1,97 @@
+from collections import defaultdict
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class AdaptiveLogSoftmax(nn.Module):
+    def __init__(self, in_features, n_classes, cutoffs, keep_order=False):
+        super(AdaptiveLogSoftmax, self).__init__()
+
+        cutoffs = list(cutoffs)
+
+        if (cutoffs != sorted(cutoffs)) \
+                or (min(cutoffs) <= 0) \
+                or (max(cutoffs) >= (n_classes - 1)) \
+                or (len(set(cutoffs)) != len(cutoffs)) \
+                or any([int(c) != c for c in cutoffs]):
+
+            raise ValueError("cutoffs should be a sequence of unique, positive "
+                             "integers sorted in an increasing order, where "
+                             "each value is between 1 and n_classes-1")
+
+        self.in_features = in_features
+        self.n_classes = n_classes
+        self.cutoffs = cutoffs + [n_classes]
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+
+        self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features))
+        self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+        self.keep_order = keep_order
+
+
+    def forward(self, hidden, target, weight, bias, keep_order=False):
+        if hidden.size(0) != target.size(0):
+            raise RuntimeError('Input and target should have the same size '
+                               'in the batch dimension.')
+
+        head_weight = torch.cat(
+            [weight[:self.shortlist_size], self.cluster_weight], dim=0)
+        head_bias =
torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + # aa = target_i[:, None] + # aa = aa.to('cpu') + # logprob_i = head_logprob_i[:, -i] \ + # + tail_logprob_i.gather(1, aa).squeeze(1) + + print(f'target_i[:,None]: {target_i[:, None]}') + print(f'target_i[:,None].shape: {target_i[:, None].shape}') + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/TransformerXL/data_utils.py b/PyTorch/contrib/nlp/TransformerXL/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3642f37ceff7853d0e1eecb85c1232eb5ead5bd0 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/data_utils.py @@ -0,0 +1,258 @@ +import os, sys +import glob +import numpy as np +import torch + +from utils.vocabulary import Vocab + + +class LMOrderedIterator(object): + def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None): + """ + data -- LongTensor -- the LongTensor is strictly ordered + """ + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + self.device = device + + # Work out how cleanly we can divide the dataset into bsz parts. + self.n_step = data.size(0) // bsz + + # Trim off any extra elements that wouldn't cleanly fit (remainders). + data = data.narrow(0, 0, self.n_step * bsz) + + # Evenly divide the data across the bsz batches. + self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + return data, target, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. 
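+            # With probability 0.05 the base bptt is halved; the actual segment
+            # length is then drawn from N(bptt, std) and clipped to [min_len, max_len].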
+ bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data = data.to(self.device) + target = target.to(self.device) + + yield data, target, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='npu:0', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class Corpus(object): + def __init__(self, path, dataset, *args, **kwargs): + self.dataset = dataset + self.vocab = Vocab(*args, **kwargs) + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, 
'1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test.py']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + if os.path.exists(fn): + print('Loading cached dataset...') + corpus = torch.load(fn) + else: + print('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + corpus = Corpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + return corpus + + diff --git a/PyTorch/contrib/nlp/TransformerXL/eval_npu.py b/PyTorch/contrib/nlp/TransformerXL/eval_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..29e80f07e21e84eb26f0a409d9c82e252d186382 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/eval_npu.py @@ -0,0 +1,376 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
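+
+# Evaluation entry point: loads a trained Transformer-XL checkpoint
+# (model_best_bpc.pt) and reports loss and bpc (or ppl) on the selected
+# valid / test split using a single NPU device.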
+ +import argparse +import time +import math +import os +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from apex import amp +import apex +from utils.exp_utils import get_logger + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=10, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, 
default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. 
If supplied, this argument' + ' supersedes --static-loss-scale.') +parser.add_argument('--no_log', action='store_true', + help='do not log the eval result') +parser.add_argument('--split', default='valid', + choices=['all','valid','test']) + +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + +# Get logger +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) +logging = get_logger('log.txt', log_=not args.no_log) + +loc = "npu:0" +torch.npu.set_device(loc) + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +va_iter = corpus.get_iterator('valid', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test.py', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = 
m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) +model.apply(weights_init) +model.word_emb.apply(weights_init) +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + +model = model.to(loc) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + + +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +# Load the best saved model. +with open('model_best_bpc.pt', 'rb') as f: + model.load_state_dict(torch.load(f, map_location=loc)) + +logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + +model.reset_length(args.tgt_len, args.ext_len, args.mem_len) +if args.clamp_len > 0: + model.clamp_len = args.clamp_len +if args.same_length: + model.same_length = True + +############################################################################### +# Evaluation code +############################################################################### + +def evaluate(eval_iter): + model.eval() + total_len, total_loss = 0, 0. + start_time = time.time() + with torch.no_grad(): + mems = tuple() + for idx, (data, target, seq_len) in enumerate(eval_iter): + ts = time.time() + ret = model(data,target,*mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.item() + total_len += seq_len + #print('eval_batch id: {} use time: {:.2f} ms '.format(idx, (time.time()-ts)*1000)) + total_time = time.time() - start_time + logging('Time : {:.2f}s, FPS: {:.2f} characters/s'.format( + total_time, total_len*args.batch_size*args.eval_tgt_len/total_time)) + return total_loss / total_len + + +# Run on test.py data. 
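+# args.split selects which split(s) to evaluate: 'all' runs both valid and test,
+# 'valid' or 'test' runs only the corresponding split.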
+if args.split == 'all': + test_loss = evaluate(te_iter) + valid_loss = evaluate(va_iter) +elif args.split == 'valid': + valid_loss = evaluate(va_iter) + test_loss = None +elif args.split == 'test': + test_loss = evaluate(te_iter) + valid_loss = None + +def format_log(loss, split): + if args.dataset in ['enwik8', 'text8']: + log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( + split, loss, loss / math.log(2)) + else: + log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( + split, loss, math.exp(loss)) + return log_str + +log_str = '' +if valid_loss is not None: + log_str += format_log(valid_loss, 'valid') +if test_loss is not None: + log_str += format_log(test_loss, 'test.py') + +logging('=' * 100) +logging(log_str) +logging('=' * 100) diff --git a/PyTorch/contrib/nlp/TransformerXL/getdata.sh b/PyTorch/contrib/nlp/TransformerXL/getdata.sh new file mode 100644 index 0000000000000000000000000000000000000000..780475780e4a737a7660aacbfcd5d65d7886f45a --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/getdata.sh @@ -0,0 +1,90 @@ +echo "=== Acquiring datasets ===" +echo "---" + +mkdir -p data +cd data + +if [[ ! -d 'wikitext-2' ]]; then + echo "- Downloading WikiText-2 (WT2)" + wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip + unzip -q wikitext-2-v1.zip + cd wikitext-2 + mv wiki.train.tokens train.txt + mv wiki.valid.tokens valid.txt + mv wiki.test.tokens test.txt + cd .. +fi + +echo "- Downloading WikiText-103 (WT2)" +if [[ ! -d 'wikitext-103' ]]; then + wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip + unzip -q wikitext-103-v1.zip + cd wikitext-103 + mv wiki.train.tokens train.txt + mv wiki.valid.tokens valid.txt + mv wiki.test.tokens test.txt + cd .. +fi + +echo "- Downloading enwik8 (Character)" +if [[ ! -d 'enwik8' ]]; then + mkdir -p enwik8 + cd enwik8 + wget --continue http://mattmahoney.net/dc/enwik8.zip + wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py + python3 prep_enwik8.py + cd .. +fi + +echo "- Downloading text8 (Character)" +if [[ ! -d 'text8' ]]; then + mkdir -p text8 + cd text8 + wget --continue http://mattmahoney.net/dc/text8.zip + python ../../prep_text8.py + cd .. +fi + +echo "- Downloading Penn Treebank (PTB)" +if [[ ! -d 'penn' ]]; then + wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz + tar -xzf simple-examples.tgz + + mkdir -p penn + cd penn + mv ../simple-examples/data/ptb.train.txt train.txt + mv ../simple-examples/data/ptb.test.txt test.txt + mv ../simple-examples/data/ptb.valid.txt valid.txt + cd .. + + echo "- Downloading Penn Treebank (Character)" + mkdir -p pennchar + cd pennchar + mv ../simple-examples/data/ptb.char.train.txt train.txt + mv ../simple-examples/data/ptb.char.test.txt test.txt + mv ../simple-examples/data/ptb.char.valid.txt valid.txt + cd .. + + rm -rf simple-examples/ +fi + +echo "- Downloading 1B words" + +if [[ ! 
-d 'one-billion-words' ]]; then + mkdir -p one-billion-words + cd one-billion-words + + wget --no-proxy http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz + tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz + + path="1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/" + cat ${path}/news.en.heldout-00000-of-00050 > valid.txt + cat ${path}/news.en.heldout-00000-of-00050 > test.txt + + wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt + + cd .. +fi + +echo "---" +echo "Happy language modeling :)" diff --git a/PyTorch/contrib/nlp/TransformerXL/mem_transformer.py b/PyTorch/contrib/nlp/TransformerXL/mem_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..92a55122f9fec99dc3a32cd6c38fcc9ca008625a --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/mem_transformer.py @@ -0,0 +1,851 @@ +import sys +import math +import functools + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append('utils') +from utils.proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax +from utils.log_uniform_sampler import LogUniformSampler, sample_logits + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + # output = self.layer_norm((inp + core_out).squeeze()) + # output = output.unsqueeze(1) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h 
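+        # c is the full attention context (cached memory followed by the current
+        # segment); queries are computed from h only, keys/values from c.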
+ + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) + + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:]) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, 
self.n_head * self.d_head, bias=False) + + def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + attn_mask_bool = attn_mask.bool() + if attn_mask is not None and attn_mask_bool.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None, :, :, None], -float('inf')).bool().type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:, :, :, None], -float('inf')).bool().type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = 
w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, 
+ sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + print("n_token:", n_token) + print("d_embed:", d_embed) + # self.emb_layers.append( + # nn.Embedding(n_token, 512, sparse=sample_softmax>0) + # ) + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed = emb_flat.view(*inp.size(), self.d_proj) + + embed.mul_(self.emb_scale) + + return embed + +class MemTransformerLM(nn.Module): + def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner, + dropout, dropatt, tie_weight=True, d_embed=None, + div_val=1, tie_projs=[False], pre_lnorm=False, + tgt_len=None, ext_len=None, mem_len=None, + cutoffs=[], adapt_inp=False, + same_length=False, attn_type=0, clamp_len=-1, + sample_softmax=-1): + super(MemTransformerLM, self).__init__() + self.n_token = n_token + + d_embed = d_model if d_embed is None else d_embed + self.d_embed = d_embed + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, + div_val=div_val) + self.drop = nn.Dropout(dropout) + + self.n_layer = n_layer + + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + self.max_klen = tgt_len + ext_len + mem_len + + self.attn_type = attn_type + + self.layers = nn.ModuleList() + if attn_type == 0: # the default attention + for i in range(n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type == 1: # learnable embeddings + for i in range(n_layer): + self.layers.append( + RelLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type in [2, 3]: # absolute embeddings + for i in range(n_layer): + self.layers.append( + DecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + self.sample_softmax = sample_softmax + # use sampled softmax + 
if sample_softmax > 0: + self.out_layer = nn.Linear(d_model, n_token) + if tie_weight: + self.out_layer.weight = self.word_emb.weight + self.tie_weight = tie_weight + self.sampler = LogUniformSampler(n_token, sample_softmax) + + # use adaptive softmax (including standard softmax) + else: + # dump_tensor(n_token, 'n_token.pt') + # dump_tensor(d_embed, 'd_embed.pt') + # dump_tensor(d_model, 'd_model.pt') + # dump_tensor(cutoffs, 'cutoffs.pt') + # dump_tensor(div_val, 'div_val.pt') + + self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, + cutoffs, div_val=div_val) + + if tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight + + if tie_projs: + for i, tie_proj in enumerate(tie_projs): + if tie_proj and div_val == 1 and d_model != d_embed: + self.crit.out_projs[i] = self.word_emb.emb_projs[0] + elif tie_proj and div_val != 1: + self.crit.out_projs[i] = self.word_emb.emb_projs[i] + + self.same_length = same_length + self.clamp_len = clamp_len + + self._create_params() + + def backward_compatible(self): + self.sample_softmax = -1 + + def _create_params(self): + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer+1): + empty = torch.empty(0, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. 
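+        # For example, with the enwik8 defaults in this repo (tgt_len=512,
+        # mem_len=512, ext_len=0): end_idx = mlen + qlen and beg_idx = end_idx - 512,
+        # so the most recent 512 hidden states are carried over as memory.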
+ with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + word_emb = self.word_emb(dec_inp.long()) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, self.r_w_bias, + self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + hids.append(core_out) + for i, layer in enumerate(self.layers): + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, data, target, *mems): + # nn.DataParallel does not allow size(0) tensors to be broadcasted. + # So, have to initialize size(0) mems inside the model forward. + # Moreover, have to return new_mems to allow nn.DataParallel to piece + # them together. 
+ if not mems: mems = self.init_mems() + + tgt_len = target.size(0) + hidden, new_mems = self._forward(data, mems=mems) + + pred_hid = hidden[-tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.tie_weight + logit = sample_logits(self.word_emb, + self.out_layer.bias, target, pred_hid, self.sampler) + loss = -F.log_softmax(logit, -1)[:, :, 0] + else: + loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1)) + loss = loss.view(tgt_len, -1) + loss = loss.npu() + + if new_mems is None: + return [loss] + else: + return [loss] + new_mems + +def set_device(obj, device='cpu'): + if isinstance(obj, (tuple, list)): + dump = [] + for item in obj: + dump.append(set_device(item, device)) + return dump + elif isinstance(obj, dict): + dump = {} + for k, v in obj.items(): + dump[k] = set_device(v, device) + return dump + elif isinstance(obj, torch.Tensor): + return obj.to(device) + else: + return obj + + +def dump_tensor(output, name): + dump = set_device(output, 'cpu') + torch.save(dump, name) + print('%s dump success!' % (name)) + + +def load_tensor(name, device): + output = torch.load(name) + dump = set_device(output, device) + print('%s load success!' % (name)) + return dump + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='unit test') + + parser.add_argument('--n_layer', type=int, default=4, help='') + parser.add_argument('--n_rel_layer', type=int, default=4, help='') + parser.add_argument('--n_head', type=int, default=2, help='') + parser.add_argument('--d_head', type=int, default=2, help='') + parser.add_argument('--d_model', type=int, default=200, help='') + parser.add_argument('--d_embed', type=int, default=200, help='') + parser.add_argument('--d_inner', type=int, default=200, help='') + parser.add_argument('--dropout', type=float, default=0.0, help='') + parser.add_argument('--cuda', action='store_true', help='') + parser.add_argument('--seed', type=int, default=1111, help='') + parser.add_argument('--multi_gpu', action='store_true', help='') + + args = parser.parse_args() + + #device = torch.device("cuda" if args.cuda else "cpu") + device = torch.device("npu:0") + + B = 4 + tgt_len, mem_len, ext_len = 36, 36, 0 + data_len = tgt_len * 20 + args.n_token = 10000 + + import data_utils + + data = torch.LongTensor(data_len*B).random_(0, args.n_token).to(device) + diter = data_utils.LMOrderedIterator(data, B, tgt_len, device=device, ext_len=ext_len) + + cutoffs = [args.n_token // 2] + tie_projs = [False] + [True] * len(cutoffs) + + for div_val in [1, 2]: + for d_embed in [200, 100]: + model = MemTransformerLM(args.n_token, args.n_layer, args.n_head, + args.d_model, args.d_head, args.d_inner, args.dropout, + dropatt=args.dropout, tie_weight=True, + d_embed=d_embed, div_val=div_val, + tie_projs=tie_projs, pre_lnorm=True, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + cutoffs=cutoffs, attn_type=0).to(device) + + print(sum(p.numel() for p in model.parameters())) + + mems = tuple() + for idx, (inp, tgt, seqlen) in enumerate(diter): + print('batch {}'.format(idx)) + out = model(inp, tgt, *mems) + mems = out[1:] diff --git a/PyTorch/contrib/nlp/TransformerXL/modelzoo_level.txt b/PyTorch/contrib/nlp/TransformerXL/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..860ecbbbd50ca6512728b2dd7c59e005cd57ac91 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus: +PerfStatus: +PrecisionStatus: \ No newline at end of file diff --git 
a/PyTorch/contrib/nlp/TransformerXL/requirements.txt b/PyTorch/contrib/nlp/TransformerXL/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f4eaeb22a91cbb52bc225de73f755272ad3fe53 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/requirements.txt @@ -0,0 +1,5 @@ +torchvision +tqdm +numpy +itertools +argparse \ No newline at end of file diff --git a/PyTorch/contrib/nlp/TransformerXL/test/env_npu.sh b/PyTorch/contrib/nlp/TransformerXL/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..280fca96da61c2d983f9a690fd0c044bbcba9167 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/test/env_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export 
PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Host侧Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +#设置Device侧日志等级为error +${install_path}/driver/tools/msnpureport -g error +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable +export BMMV2_ENABLE=1 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/gcc7.3.0/lib64:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/PyTorch/contrib/nlp/TransformerXL/test/train_eval_1p.sh b/PyTorch/contrib/nlp/TransformerXL/test/train_eval_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..da30622701ac128c0868f9635991f2a356b0d65b --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/test/train_eval_1p.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/home/huangwei/data/enwik8" +# checkpoint文件路径,以实际路径为准 +pth_path="/home/huangwei/transformer-xl" +# 训练epoch +train_epochs=50 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
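+ # After this block, ${cur_path} points at the repo root and ${test_path_dir} at the
+ # test/ directory; the relative call to eval_npu.py and the log paths below rely on this.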
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3 -u eval_npu.py --split valid > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log|grep -v test|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#最后一个迭代loss值,不需要修改 +#ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/Eval_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/TransformerXL/test/train_full_8p.sh b/PyTorch/contrib/nlp/TransformerXL/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d8f1a54815eafd2800c4bc5c079e1187d721cfa7 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/test/train_full_8p.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/home/huangwei/data/enwik8" + +# 训练epoch +train_epochs=40 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == 
x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --max_step=400000 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $20}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "|" '{print $NF}'|awk -F " " '{print $1}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep epoch ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/TransformerXL/test/train_performance_1p.sh b/PyTorch/contrib/nlp/TransformerXL/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..ecd1b34c358c832767c24fefcf1c8285b08f8147 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/test/train_performance_1p.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="/home/huangwei/data/enwik8" + +# 训练epoch +train_epochs=1 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3.7 -u ./train_1p_npu.py \ + --data=${data_path} \ + --seed=1111 \ + --workers=${workers} \ + --gpu=${ASCEND_DEVICE_ID} \ + --eval-interval=4000 \ + --log-interval=1 \ + --max_step=100 \ + --epochs=${train_epochs} \ + --static-loss-scale=128 \ + --batch_size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +#FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#FPS=${FPS#* } + +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` + +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000}'` + +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log + + diff --git a/PyTorch/contrib/nlp/TransformerXL/test/train_performance_8p.sh b/PyTorch/contrib/nlp/TransformerXL/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0116531fc446c8b6a5739c4cb02a9857823276a --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/test/train_performance_8p.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="/home/huangwei/data/enwik8" + +# 训练epoch +train_epochs=50 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --log-interval=1 \ + --eval-interval=4000 \ + --max_step=100 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log|awk '{a+=$1} END {if (NR !=0) printf("%.3f", a/NR)}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk 'END {print}'|awk -F "|" '{print $NF}'|awk 
-F " " '{print $NF}'` + +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/TransformerXL/train_1p_npu.py b/PyTorch/contrib/nlp/TransformerXL/train_1p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..8f213a8237389c9d4942c7bc2ce1d172923850cb --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/train_1p_npu.py @@ -0,0 +1,542 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import time +import math +import os +import itertools + +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import apex + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--epochs', type=int, default=50, + help='train epochs') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, 
default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--workers', type=int, default=64, + help='workers num') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. 
If supplied, this argument' + ' supersedes --static-loss-scale.') +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + +device = torch.device('npu:0') + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +eval_batch_size = 10 +tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test.py', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def 
update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) + model.apply(update_dropout) + model.apply(update_dropatt) +else: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + +if args.multi_gpu: + model = model.to(device) + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(device) + else: + para_model = nn.DataParallel(model, dim=1).to(device) +else: + para_model = model.to(device) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + +################################################################################################### +opt_level = "O2" +model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) +################################################################################################### + + +#### scheduler +if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) +elif args.scheduler == 'inv_sqrt': + + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + +elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) +elif args.scheduler == 'constant': + pass + + +if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + +logging('=' * 100) +for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +############################################################################### +# Training code +############################################################################### + +def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + + return total_loss / total_len + + +def train(): + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + #################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + #################################################################### + with torch.no_grad(): + train_loss += loss.float().bool().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + #################################################### + with torch.no_grad(): + train_loss += loss.float().item() + ################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # 
step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len/elapsed) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + ts = time.time() + val_loss = evaluate(va_iter) + print('evaluation use time {} s'.format(time.time()-ts)) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - eval_start_time), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + break + +# Loop over epochs. +train_step = 0 +train_loss = 0 +best_val_loss = None + +log_start_time = time.time() +eval_start_time = time.time() + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + break +except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + +## Load the best saved model. +#with open('model.pt', 'rb') as f: +# model.load_state_dict(torch.load(f, map_location=device)) +#para_model = model.to(device) + +## Run on test data. 
+#test_loss = evaluate(te_iter) +#logging('=' * 100) +#if args.dataset in ['enwik8', 'text8']: +# logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( +# test_loss, test_loss / math.log(2))) +#else: +# logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( +# test_loss, math.exp(test_loss))) +#logging('=' * 100) diff --git a/PyTorch/contrib/nlp/TransformerXL/train_8p_npu.py b/PyTorch/contrib/nlp/TransformerXL/train_8p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..7ef2b16e75627cfe1b390498e2a71d75272b76f4 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/train_8p_npu.py @@ -0,0 +1,648 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import math +import os, sys +import itertools +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import torch.distributed as dist +import apex +import warnings + + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 
'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, # 源码中 clip 的 default=0.25 + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=400000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +# parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 
0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +#edit this for 8p +parser.add_argument('--dist-backend', type=str, default='hccl') +parser.add_argument('--world-size', type=int, default=-1) +parser.add_argument('--rank', type=int, default=-1) +parser.add_argument('--local_rank', type=int, default=0) +parser.add_argument('--addr', type=str, default='127.0.0.1') +parser.add_argument('--device_num', type=int, default=-1) +parser.add_argument('--workers', type=int, default=32) +parser.add_argument('--device-list', default='', type=str) +parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000') +parser.add_argument('--device', type=str, default='npu') +parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +warnings.filterwarnings('ignore') +#############end################# + +def main(): + args = parser.parse_args() + args.tied = not args.not_tied + torch.manual_seed(args.seed) + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + ############################## + # edit this for 8p + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29888' + os.environ['LOCAL_DEVICE_ID'] = str(0) + print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + if args.device_list != '': + ngpus_per_node = len(args.device_list.split(',')) + elif args.device_num != -1: + ngpus_per_node = args.device_num + elif args.device == 'npu': + ngpus_per_node = int(os.environ["RANK_SIZE"]) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + args.world_size = ngpus_per_node * args.world_size + if args.device == 'npu': + main_worker(args.local_rank, ngpus_per_node,args) + else: + main_worker(args.gpu, ngpus_per_node, args) + ############################## + + +def main_worker(gpu, ngpus_per_node, args): + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + if args.d_embed < 0: + args.d_embed = args.d_model + + assert args.ext_len >= 0, 'extended context length must be non-negative' + assert args.batch_size % args.batch_chunk == 0 + + args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) + args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + + if args.device_list != '': + args.gpu = int(args.device_list.split(',')[gpu]) + else: + args.gpu = gpu + + print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + os.environ['LOCAL_DEVICE_ID'] = str(args.gpu) + print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + + if args.gpu is not None: + print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + print("[npu id:", args.gpu, "]", args) + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + + + ############################################################################### + # Load data + ############################################################################### + corpus = get_lm_corpus(args.data, args.dataset) + ntokens = len(corpus.vocab) + args.n_token = ntokens + + eval_batch_size = 10 + tr_iter = 
corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=loc, ext_len=args.ext_len) + va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) + te_iter = corpus.get_iterator('test.py', eval_batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) + + # adaptive softmax / embedding + cutoffs, tie_projs = [], [False] + if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + + ############################################################################### + # Build the model + ############################################################################### + def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + + def init_bias(bias): + nn.init.constant_(bias, 0.0) + + def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + + def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + + def update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + + if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) + model = model.to(loc) + ckpt = torch.load(f, map_location=loc) + model.load_state_dict(ckpt) + model.apply(update_dropout) + model.apply(update_dropatt) + else: + model = MemTransformerLM(ntokens, args.n_layer, 
args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing + + + + args.n_all_param = sum([p.nelement() for p in model.parameters()]) + args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + + + #### optimizer + if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) + elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + #optimizer = optim.Adam(model.parameters(), lr=args.lr) + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) + elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + model = model.to(loc) + ################################################################################################### + opt_level = "O2" + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) + ################################################################################################### + + if args.multi_gpu: + + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(loc) + else: + para_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + else: + para_model = model.to(loc) + + #### scheduler + if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) + elif args.scheduler == 'inv_sqrt': + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + elif args.scheduler == 'constant': + pass + + + if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f, map_location=loc) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + + logging('=' * 100) + for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) + logging('=' * 100) + logging('#params = {}'.format(args.n_all_param)) + logging('#non emb params = {}'.format(args.n_nonemb_param)) + + ############################################################################### + # Training code + ############################################################################### + + def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + return total_loss / total_len + + + def train(): + # Turn on training mode which enables dropout. 
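# Illustrative sketch, not part of the patch: the 'inv_sqrt' branch above feeds LambdaLR
# a factor that grows linearly up to warmup_step and then decays as 1/sqrt(step). The
# standalone check below uses a toy parameter and made-up values (base lr 0.00025,
# warmup_step 1000, helper name inv_sqrt_lambda) purely to show the schedule's shape.
import torch
import torch.optim as optim

def inv_sqrt_lambda(warmup_step):
    def lr_lambda(step):
        if step == 0 and warmup_step == 0:
            return 1.
        return 1. / (step ** 0.5) if step > warmup_step else step / (warmup_step ** 1.5)
    return lr_lambda

toy_param = [torch.nn.Parameter(torch.zeros(1))]
toy_opt = optim.SGD(toy_param, lr=0.00025)
toy_sched = optim.lr_scheduler.LambdaLR(toy_opt, lr_lambda=inv_sqrt_lambda(1000))
for _ in range(2000):
    toy_opt.step()
    toy_sched.step()
# after 2000 steps the factor is 1/sqrt(2000), i.e. lr ~ 0.00025 / 44.7
print(toy_opt.param_groups[0]['lr'])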
+ global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + #################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + #################################################################### + with torch.no_grad(): + train_loss += loss.float().bool().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + #################################################### + with torch.no_grad(): + train_loss += loss.float().item() + ################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len*8/elapsed) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + print('train_step is :', train_step) + print('ars.eval_interval is :', args.eval_interval) + print(train_step % args.eval_interval) + print('*'*50) + ts = time.time() + val_loss = evaluate(va_iter) + print('evaluation use time {} s'.format(time.time()-ts)) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - ts), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. 
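# Illustrative sketch, not part of the patch: the log line above reports the mean
# per-token NLL in nats; for enwik8/text8 it is converted to bits per character, for
# word-level corpora to perplexity, and the fps figure scales tokens/s by a hard-coded
# factor of 8 (an 8-device run). All numbers below are made up for demonstration.
import math

cur_loss = 0.76                                        # hypothetical mean NLL (nats/token)
print('bpc {:9.5f}'.format(cur_loss / math.log(2)))    # ~1.096 bits per character
print('ppl {:9.3f}'.format(math.exp(cur_loss)))        # ~2.138
log_interval, batch_size, tgt_len, elapsed = 100, 16, 512, 147.0   # hypothetical
# ~44582 tokens/s across 8 devices (values picked to land near the README's 8p figure)
print('fps {:.2f}'.format(log_interval * batch_size * tgt_len * 8 / elapsed))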
+ if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + sys.exit() + + # At any point you can hit Ctrl + C to break out of training early. + try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + sys.exit() + except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + + # # Load the best saved model. + # with open('model.pt', 'rb') as f: + # model.load_state_dict(torch.load(f, map_location=loc)) + # para_model = model.to(loc) + + # # Run on test data. + # test_loss = evaluate(te_iter) + # logging('=' * 100) + # if args.dataset in ['enwik8', 'text8']: + # logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( + # test_loss, test_loss / math.log(2))) + # else: + # logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( + # test_loss, math.exp(test_loss))) + # logging('=' * 100) + + +if __name__ == '__main__': + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + train_step = 0 + train_loss = 0 + best_val_loss = None + log_start_time = time.time() + eval_start_time = time.time() + main() diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/adaptive_softmax.py b/PyTorch/contrib/nlp/TransformerXL/utils/adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..3c54a69204525d62466eb58245af2a3165798bed --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/adaptive_softmax.py @@ -0,0 +1,102 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
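# Illustrative sketch, not part of the patch: the best-model logic above writes plain
# state_dicts to 'model.pt' / 'optimizer.pt', and --restart reads them back with
# torch.load(..., map_location=loc). A minimal round trip with a toy module, using a
# temporary directory so no real work_dir files are touched:
import os
import tempfile
import torch
import torch.nn as nn
import torch.optim as optim

toy_model = nn.Linear(4, 2)
toy_opt = optim.Adam(toy_model.parameters(), lr=0.00025)

with tempfile.TemporaryDirectory() as tmp:
    torch.save(toy_model.state_dict(), os.path.join(tmp, 'model.pt'))
    torch.save(toy_opt.state_dict(), os.path.join(tmp, 'optimizer.pt'))

    restored = nn.Linear(4, 2)
    restored.load_state_dict(torch.load(os.path.join(tmp, 'model.pt'), map_location='cpu'))
    toy_opt.load_state_dict(torch.load(os.path.join(tmp, 'optimizer.pt'), map_location='cpu'))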
+import torch +import torch.nn as nn +import torch.nn.functional as F + +class AdaptiveLogSoftmax(nn.Module): + def __init__(self, in_features, n_classes, cutoffs, keep_order=False): + super(AdaptiveLogSoftmax, self).__init__() + + cutoffs = list(cutoffs) + + if (cutoffs != sorted(cutoffs)) \ + or (min(cutoffs) <= 0) \ + or (max(cutoffs) >= (n_classes - 1)) \ + or (len(set(cutoffs)) != len(cutoffs)) \ + or any([int(c) != c for c in cutoffs]): + + raise ValueError("cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1") + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.keep_order = keep_order + + + def forward(self, hidden, target, weight, bias, keep_order=False): + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + head_weight = torch.cat( + [weight[:self.shortlist_size], self.cluster_weight], dim=0) + head_bias = torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + print(f'target_i[:,None]: {target_i[:, None]}') + print(f'target_i[:,None].shape: {target_i[:, None].shape}') + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/data_parallel.py b/PyTorch/contrib/nlp/TransformerXL/utils/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..1b48aaaa644e8310cce7cb4d04d14b9d832d39ff --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/data_parallel.py @@ -0,0 +1,109 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from torch.nn.parallel import DataParallel +import torch +from torch.nn.parallel._functions import Scatter +from torch.nn.parallel.parallel_apply import parallel_apply + +def scatter(inputs, target_gpus, chunk_sizes, dim=0): + r""" + Slices tensors into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not tensors. + """ + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + try: + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + except: + print('obj', obj.size()) + print('dim', dim) + print('chunk_sizes', chunk_sizes) + quit() + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + try: + return scatter_map(inputs) + finally: + scatter_map = None + +def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs + +class BalancedDataParallel(DataParallel): + def __init__(self, gpu0_bsz, *args, **kwargs): + self.gpu0_bsz = gpu0_bsz + super().__init__(*args, **kwargs) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + if self.gpu0_bsz == 0: + device_ids = self.device_ids[1:] + else: + device_ids = self.device_ids + inputs, kwargs = self.scatter(inputs, kwargs, device_ids) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids) + if self.gpu0_bsz == 0: + replicas = replicas[1:] + outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) + + ######################################3 + # outputs=outputs.to('cpu') + # self.output_device='cpu' + print(f'outputs: {outputs}') + print(f'type(outputs): {type(outputs)}') + print(f'len(outputs): {len(outputs)}') + print(f'self.output_device: {self.output_device}') + + + return self.gather(outputs, self.output_device) + + def parallel_apply(self, replicas, device_ids, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, device_ids) + + def scatter(self, inputs, kwargs, device_ids): + bsz = inputs[0].size(self.dim) + num_dev = len(self.device_ids) + gpu0_bsz = self.gpu0_bsz + bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) + if gpu0_bsz < bsz_unit: + chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) + delta = bsz - sum(chunk_sizes) + for i in range(delta): + chunk_sizes[i + 1] += 1 + if gpu0_bsz == 0: + chunk_sizes = chunk_sizes[1:] + else: + return 
super().scatter(inputs, kwargs, device_ids) + return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) + diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/exp_utils.py b/PyTorch/contrib/nlp/TransformerXL/utils/exp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f290b8e70eed7448095e9af4b97341d0e89644eb --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/exp_utils.py @@ -0,0 +1,40 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import os +import shutil +import torch + + +def logging(s, log_path, print_=True, log_=True): + if print_: + print(s) + if log_: + with open(log_path, 'a+') as f_log: + f_log.write(s + '\n') + + +def get_logger(log_path, **kwargs): + return functools.partial(logging, log_path=log_path, **kwargs) + + +def create_exp_dir(dir_path, scripts_to_save=None, debug=False): + print('Experiment dir : {}'.format(dir_path)) + return get_logger(log_path='log.txt') + + +def save_checkpoint(model, optimizer, path, epoch): + torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) + torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/log_uniform_sampler.py b/PyTorch/contrib/nlp/TransformerXL/utils/log_uniform_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..4ebe1297479dd63b01cfcc2d553760fff26947b0 --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/log_uniform_sampler.py @@ -0,0 +1,111 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
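# Illustrative sketch, not part of the patch: BalancedDataParallel.scatter above gives
# device 0 a smaller slice (gpu0_bsz) and spreads any remainder over the other devices;
# it only takes this path when gpu0_bsz < bsz_unit, otherwise it falls back to the
# stock scatter. Pure Python with made-up sizes, mirroring that chunk_sizes arithmetic:
def balanced_chunk_sizes(bsz, num_dev, gpu0_bsz):
    bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1)
    chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1)
    for i in range(bsz - sum(chunk_sizes)):   # hand the remainder to devices 1..num_dev-1
        chunk_sizes[i + 1] += 1
    return chunk_sizes

print(balanced_chunk_sizes(bsz=23, num_dev=4, gpu0_bsz=4))   # [4, 7, 6, 6]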
+import torch +from torch import nn + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + # print('P', self.dist.numpy().tolist()[-30:]) + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits + + +if __name__ == '__main__': + S, B = 3, 4 + n_vocab = 10000 + n_sample = 5 + H = 32 + + labels = torch.LongTensor(S, B).random_(0, n_vocab) + sampler = LogUniformSampler(n_vocab, unique=True) + + embedding = nn.Embedding(n_vocab, H) + bias = torch.zeros(n_vocab) + inputs = torch.Tensor(S, B, H).normal_() + + logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) + print('logits', logits.detach().numpy().tolist()) + print('logits shape', logits.size()) + print('out_labels', out_labels.detach().numpy().tolist()) + print('out_labels shape', out_labels.size()) + diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/proj_adaptive_softmax.py b/PyTorch/contrib/nlp/TransformerXL/utils/proj_adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..886757190ce2b308dc3e3d6e66762f623cb9878a --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/proj_adaptive_softmax.py @@ -0,0 +1,160 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +import torch.nn.functional as F + +#################################################################################### +# edit +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # 主cuda +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) # 辅cuda +##################################################################################### + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = F.linear(proj_hid, weight, bias=bias) + + return logit + + def forward(self, hidden, target, keep_order=False): + ''' + hidden :: [len*bsz x d_proj] + target :: [len*bsz] + ''' + + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + + torch.save(logit,"logit.pt") + + nll = -F.log_softmax(logit, dim=-1).gather(1, target.unsqueeze(1).long()).squeeze(1) + + else: + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = 
weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/TransformerXL/utils/vocabulary.py b/PyTorch/contrib/nlp/TransformerXL/utils/vocabulary.py new file mode 100644 index 0000000000000000000000000000000000000000..a13e4183c243befab6f7167719931782f67c843c --- /dev/null +++ b/PyTorch/contrib/nlp/TransformerXL/utils/vocabulary.py @@ -0,0 +1,178 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
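# Illustrative sketch, not part of the patch: the forward pass above assigns each
# target token to the shortlist or to a tail cluster by its cutoff interval, then
# re-indexes it relative to the cluster start before the per-cluster gather. Tiny
# made-up vocabulary and cutoffs below (wt103 uses [20000, 40000, 200000]):
import torch

n_token = 20
cutoffs = [4, 10]                                 # shortlist [0,4), tails [4,10) and [10,20)
cutoff_values = [0] + cutoffs + [n_token]
target = torch.tensor([1, 5, 17, 3, 12])

for i in range(len(cutoff_values) - 1):
    l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
    mask_i = (target >= l_idx) & (target < r_idx)
    indices_i = mask_i.nonzero().squeeze(-1)
    print('cluster', i, 'rows', indices_i.tolist(),
          'relative targets', (target[indices_i] - l_idx).tolist())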
+import os +from collections import Counter, OrderedDict +import torch + + +class Vocab(object): + def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True, + delimiter=None, vocab_file=None): + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + + + def tokenize(self, line, add_eos=False, add_double_eos=False): + line = line.strip() + if self.lower_case: + line = line.lower() + + if self.delimiter == '': + symbols = line + else: + symbols = line.split(self.delimiter) + + if add_double_eos: # lm1b + return [''] + symbols + [''] + elif add_eos: + return symbols + [''] + else: + return symbols + + + + def count_file(self, path, verbose=False, add_eos=False): + if verbose: print('counting file {} ...'.format(path)) + assert os.path.exists(path) + + sents = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos) + self.counter.update(symbols) + sents.append(symbols) + + return sents + + + def count_sents(self, sents, verbose=False): + """ + sents : a list of sentences, each a list of tokenized symbols + """ + if verbose: print('counting {} sents ...'.format(len(sents))) + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + self.counter.update(symbols) + + def _build_from_file(self, vocab_file): + self.idx2sym = [] + self.sym2idx = OrderedDict() + + with open(vocab_file, 'r', encoding='utf-8') as f: + for line in f: + symb = line.strip().split()[0] + self.add_symbol(symb) + self.unk_idx = self.sym2idx[''] + + def build_vocab(self): + if self.vocab_file: + print('building vocab from {}'.format(self.vocab_file)) + self._build_from_file(self.vocab_file) + print('final vocab size {}'.format(len(self))) + else: + print('building vocab with min_freq={}, max_size={}'.format( + self.min_freq, self.max_size)) + self.idx2sym = [] + self.sym2idx = OrderedDict() + + for sym in self.special: + self.add_special(sym) + + for sym, cnt in self.counter.most_common(self.max_size): + if cnt < self.min_freq: break + self.add_symbol(sym) + + print('final vocab size {} from {} unique tokens'.format( + len(self), len(self.counter))) + + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, + add_double_eos=False): + if verbose: print('encoding file {} ...'.format(path)) + assert os.path.exists(path) + encoded = [] + with open(path, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, + add_double_eos=add_double_eos) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def encode_sents(self, sents, ordered=False, verbose=False): + if verbose: print('encoding {} sents ...'.format(len(sents))) + encoded = [] + for idx, symbols in enumerate(sents): + if verbose and idx > 0 and idx % 500000 == 0: + print(' line {}'.format(idx)) + encoded.append(self.convert_to_tensor(symbols)) + + if ordered: + encoded = torch.cat(encoded) + + return encoded + + def add_special(self, sym): + if sym not in self.sym2idx: + self.idx2sym.append(sym) + self.sym2idx[sym] = len(self.idx2sym) - 1 + setattr(self, '{}_idx'.format(sym.strip('<>')), 
self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            assert '<eos>' not in sym
+            assert hasattr(self, 'unk_idx')
+            return self.sym2idx.get(sym, self.unk_idx)
+
+    def get_symbols(self, indices):
+        return [self.get_sym(idx) for idx in indices]
+
+    def get_indices(self, symbols):
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.get_indices(symbols))
+
+    def convert_to_sent(self, indices, exclude=None):
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
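# Illustrative usage sketch, not part of the patch, assuming the repo root is on
# PYTHONPATH so `utils.vocabulary` resolves. Symbols are counted first, then frozen
# into idx2sym/sym2idx, and encoding round-trips through LongTensors. The sentences
# are made up; out-of-vocabulary lookups additionally need a vocab_file or a special
# UNK token so that `unk_idx` exists.
from utils.vocabulary import Vocab

vocab = Vocab(lower_case=True, delimiter=None)   # whitespace tokenisation, lower-cased
sents = [vocab.tokenize('Hello world'), vocab.tokenize('hello again')]
vocab.count_sents(sents)
vocab.build_vocab()

ids = vocab.convert_to_tensor(vocab.tokenize('hello world again'))
print(ids)                                       # LongTensor of symbol indices
print(vocab.convert_to_sent(ids.tolist()))       # 'hello world again'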