diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 399569939354354306b37b693f7319aa96cbb8c5..2671386f66271ab19d780c21ef4421a0d7660464 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -29,7 +29,7 @@ from torch import Tensor
 from typing import Optional, Dict
 torch.set_printoptions(threshold=500000000, linewidth=1024)
 from utils import options
-
+import torch.nn.functional as F
 from modules import (
     MultiheadAttention, SinusoidalPositionalEmbedding
 )
@@ -184,8 +184,7 @@ class TransformerEncoder(nn.Module):
         x = self.embed_scale * self.embed_tokens(src_tokens)
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -266,8 +265,7 @@ class TransformerDecoder(IncrementalDecoder):
         x = self.embed_scale * self.embed_tokens(prev_output_tokens)
         if positions is not None:
             x += positions
-        if self.training:
-            x,_,_ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -319,18 +317,15 @@ class TransformerEncoderLayer(nn.Module):
                               incremental_state=None,
                               need_weights=False,
                               static_kv=False)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p =self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln2(x)
         return x
@@ -383,8 +378,7 @@ class TransformerDecoderLayer(nn.Module):
             static_kv=False
         )
 
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
 
@@ -402,19 +396,16 @@ class TransformerDecoderLayer(nn.Module):
             mask_future_timesteps=False,
             need_weights=(not self.training and self.need_attn),
         )
 
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.encoder_attn_layer_norm(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
index 8942bbc29c0e612f48e93f670e02d46183e8fc28..2b6134790ee09dba4d1b6bbb3df2f4852cdc830e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
@@ -4,6 +4,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; no modification needed
 export RANK_SIZE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index 06828617bff413501f15887f596c11df5f1d7307..dc8f9b79a7bd68f19a9dfce631982cfbb5c440e3 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -11,6 +11,7 @@ data_path=""
 # Network name, same as the directory name; modify after model review
 Network="Transformer_ID0105_for_PyTorch"
+export BMMV2_ENABLE=1
 
 # training epochs
 train_epochs=30
 # training batch_size; modify after model review
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
index 118737861fde420ee874c0f4a75e15b3b440331f..48e7fb3af7decbc497fd4f94d3464f1fc9046a98 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
@@ -4,6 +4,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; no modification needed
 export RANK_SIZE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
index b10fe5ec3bbed835a4ff0695f80c9f66f4db038a..f4b26836cd96412edf282d783f2236ab1cdbceb4 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
@@ -7,6 +7,7 @@ export RANK_SIZE=8
 export MASTER_ADDR=localhost
 export MASTER_PORT=29688
 export HCCL_WHITELIST_DISABLE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
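
Editor's note: the core change in transformer.py swaps the NPU-specific torch.npu_dropoutV2 (which the model had to guard with an explicit "if self.training:" check) for the device-agnostic F.dropout, which takes a training flag and is a no-op in eval mode. Below is a minimal sketch of that equivalence; it is not part of the patch, and the tensor shape and dropout probability are arbitrary illustration values.

    import torch
    import torch.nn.functional as F

    # Sketch (not from the patch): F.dropout handles the train/eval switch
    # itself, so no explicit "if self.training:" guard is needed.
    x = torch.randn(4, 8)
    p = 0.1

    train_out = F.dropout(x, p=p, training=True)   # zeroes elements with prob p, scales the rest by 1/(1-p)
    eval_out = F.dropout(x, p=p, training=False)   # identity: returns the input unchanged

    assert torch.equal(eval_out, x)

Folding the mode check into the functional call removes several near-identical "if self.training:" branches across the encoder and decoder layers while keeping eval-mode behavior unchanged.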