diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 399569939354354306b37b693f7319aa96cbb8c5..2671386f66271ab19d780c21ef4421a0d7660464 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -29,7 +29,7 @@ from torch import Tensor
 from typing import Optional, Dict
 torch.set_printoptions(threshold=500000000, linewidth=1024)
 from utils import options
-
+import torch.nn.functional as F
 from modules import (
     MultiheadAttention, SinusoidalPositionalEmbedding
 )
@@ -184,8 +184,7 @@ class TransformerEncoder(nn.Module):
         x = self.embed_scale * self.embed_tokens(src_tokens)
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -266,8 +265,7 @@ class TransformerDecoder(IncrementalDecoder):
         x = self.embed_scale * self.embed_tokens(prev_output_tokens)
         if positions is not None:
             x += positions
-        if self.training:
-            x,_,_ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -319,18 +317,15 @@ class TransformerEncoderLayer(nn.Module):
                               incremental_state=None,
                               need_weights=False,
                               static_kv=False)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p =self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln2(x)
         return x
@@ -383,8 +378,7 @@ class TransformerDecoderLayer(nn.Module):
             static_kv=False
         )
 
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
 
@@ -402,19 +396,16 @@ class TransformerDecoderLayer(nn.Module):
             mask_future_timesteps=False,
             need_weights=(not self.training and self.need_attn),
         )
 
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.encoder_attn_layer_norm(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
index 8942bbc29c0e612f48e93f670e02d46183e8fc28..2b6134790ee09dba4d1b6bbb3df2f4852cdc830e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
@@ -4,6 +4,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; no modification needed
 export RANK_SIZE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index 06828617bff413501f15887f596c11df5f1d7307..dc8f9b79a7bd68f19a9dfce631982cfbb5c440e3 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -11,6 +11,7 @@ data_path=""
 # Network name, same as the directory name; modify after model review
 Network="Transformer_ID0105_for_PyTorch"
+export BMMV2_ENABLE=1
 
 # training epochs
 train_epochs=30
 # training batch_size; modify after model review
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
index 118737861fde420ee874c0f4a75e15b3b440331f..48e7fb3af7decbc497fd4f94d3464f1fc9046a98 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
@@ -4,6 +4,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; no modification needed
 export RANK_SIZE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
index b10fe5ec3bbed835a4ff0695f80c9f66f4db038a..f4b26836cd96412edf282d783f2236ab1cdbceb4 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
@@ -7,6 +7,7 @@ export RANK_SIZE=8
 export MASTER_ADDR=localhost
 export MASTER_PORT=29688
 export HCCL_WHITELIST_DISABLE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, no modification needed
 data_path=""
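
Editor's note: the core change in transformer.py swaps the NPU-specific torch.npu_dropoutV2 (which the model had to guard with an explicit "if self.training:" check) for the device-agnostic F.dropout, which takes a training flag and is a no-op in eval mode. Below is a minimal sketch of that equivalence; it is not part of the patch, and the tensor shape and dropout probability are arbitrary illustration values.

    import torch
    import torch.nn.functional as F

    # Sketch (not from the patch): F.dropout handles the train/eval switch
    # itself, so no explicit "if self.training:" guard is needed.
    x = torch.randn(4, 8)
    p = 0.1

    train_out = F.dropout(x, p=p, training=True)   # zeroes elements with prob p, scales the rest by 1/(1-p)
    eval_out = F.dropout(x, p=p, training=False)   # identity: returns the input unchanged

    assert torch.equal(eval_out, x)

Folding the mode check into the functional call removes several near-identical "if self.training:" branches across the encoder and decoder layers while keeping eval-mode behavior unchanged.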