From b97b4d0be625124178fd2395867e010c28019bf5 Mon Sep 17 00:00:00 2001
From: MeiFei
Date: Fri, 14 Jun 2024 11:18:27 +0800
Subject: [PATCH] Replace DropoutWithByteMask with original usage in torch_npu
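
A minimal usage sketch of the substitution this patch applies everywhere (not
part of the diff below). It assumes the torch_npu.contrib.module.npu_modules
path that the changed files rely on and an available Ascend NPU device; the
probability 0.1 and the tensor shape are only for illustration.

    import torch
    import torch_npu

    # The models previously instantiated the byte-mask dropout via the nn namespace:
    #     self.dropout = nn.DropoutWithByteMask(p)
    # After this patch they reference the module through torch_npu directly:
    dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(0.1)

    x = torch.randn(4, 16).npu()  # example tensor; assumes an Ascend NPU device
    y = dropout(x)                # used where nn.Dropout would normally be used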
---
 .../Bert-Squad_ID0470_for_PyTorch/modeling.py      | 16 ++++++++--------
 .../transformers/models/bert/modeling_bert.py      | 14 +++++++-------
 .../transformers/models/bert/modeling_bert.py      | 14 +++++++-------
 .../models/bert/modeling_bert_benchmark.py         | 14 +++++++-------
 .../transformers/models/bert/modeling_bert.py      | 14 +++++++-------
 .../models/roberta/modeling_roberta.py             | 14 +++++++-------
 6 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/modeling.py b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/modeling.py
index e71e18b349..596b733c48 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/modeling.py
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/modeling.py
@@ -351,7 +351,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, input_ids, token_type_ids):
         seq_length = input_ids.size(1)
@@ -383,7 +383,7 @@ class BertSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
 
     def transpose_for_qkv(self, x):
@@ -446,7 +446,7 @@ class BertSelfOutput(nn.Module):
         super(BertSelfOutput, self).__init__()
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -482,7 +482,7 @@ class BertOutput(nn.Module):
         super(BertOutput, self).__init__()
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1108,7 +1108,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         super(BertForSequenceClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
         self.apply(self.init_bert_weights)
 
@@ -1166,7 +1166,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         super(BertForMultipleChoice, self).__init__(config)
         self.num_choices = num_choices
         self.bert = BertModel(config)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
         self.apply(self.init_bert_weights)
 
@@ -1236,7 +1236,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         super(BertForTokenClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, num_labels)
         self.apply(self.init_bert_weights)
 
@@ -1303,7 +1303,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         super(BertForQuestionAnswering, self).__init__(config)
         self.bert = BertModel(config)
         # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
         self.apply(self.init_bert_weights)
 
diff --git a/PyTorch/built-in/nlp/Bert-text-classification_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/nlp/Bert-text-classification_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index 3e4d9ba36b..030cdd8fd7 100644
--- a/PyTorch/built-in/nlp/Bert-text-classification_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/nlp/Bert-text-classification_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -199,7 +199,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
@@ -268,7 +268,7 @@ class BertSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -408,7 +408,7 @@ class BertSelfOutput(nn.Module):
         # self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -487,7 +487,7 @@ class BertOutput(nn.Module):
         # self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1575,7 +1575,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = NpuLinear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
@@ -1675,7 +1675,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = NpuLinear(config.hidden_size, 1)
 
         # Initialize weights and apply final processing
@@ -1774,7 +1774,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = NpuLinear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index 2f853b0ba7..b077c15124 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -199,7 +199,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
@@ -268,7 +268,7 @@ class BertSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -417,7 +417,7 @@ class BertSelfOutput(nn.Module):
         # self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -496,7 +496,7 @@ class BertOutput(nn.Module):
         # self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1584,7 +1584,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
@@ -1684,7 +1684,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, 1)
 
         # Initialize weights and apply final processing
@@ -1783,7 +1783,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
diff --git a/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert_benchmark.py b/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert_benchmark.py
index fbb3d809cb..174b5e500b 100644
--- a/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert_benchmark.py
+++ b/PyTorch/built-in/nlp/Bert_Chinese_ID3433_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert_benchmark.py
@@ -181,7 +181,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
@@ -246,7 +246,7 @@ class BertSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -297,7 +297,7 @@ class BertSelfOutput(nn.Module):
         super().__init__()
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -381,7 +381,7 @@ class BertOutput(nn.Module):
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         if is_last_layer:
             self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1478,7 +1478,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
@@ -1578,7 +1578,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, 1)
 
         # Initialize weights and apply final processing
@@ -1676,7 +1676,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
diff --git a/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py b/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
index 7e78f97ef3..3680f08658 100644
--- a/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
+++ b/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/bert/modeling_bert.py
@@ -199,7 +199,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
@@ -268,7 +268,7 @@ class BertSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -417,7 +417,7 @@ class BertSelfOutput(nn.Module):
         # self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -496,7 +496,7 @@ class BertOutput(nn.Module):
         # self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1582,7 +1582,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
@@ -1682,7 +1682,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, 1)
 
         # Initialize weights and apply final processing
@@ -1781,7 +1781,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
diff --git a/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/roberta/modeling_roberta.py b/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/roberta/modeling_roberta.py
index 1741476cf3..9a80a24047 100644
--- a/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/roberta/modeling_roberta.py
+++ b/PyTorch/built-in/others/CLIP_for_PyTorch/transformers/src/transformers/models/roberta/modeling_roberta.py
@@ -86,7 +86,7 @@ class RobertaEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         # position_ids (1, len position emb) is contiguous in memory and exported when serialized
         self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
         self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
@@ -179,7 +179,7 @@ class RobertaSelfAttention(nn.Module):
         self.key = NpuLinear(config.hidden_size, self.all_head_size)
         self.value = NpuLinear(config.hidden_size, self.all_head_size)
 
-        self.dropout = nn.DropoutWithByteMask(config.attention_probs_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.attention_probs_dropout_prob)
         self.position_embedding_type = position_embedding_type or getattr(
             config, "position_embedding_type", "absolute"
         )
@@ -293,7 +293,7 @@ class RobertaSelfOutput(nn.Module):
         super().__init__()
         self.dense = NpuLinear(config.hidden_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -374,7 +374,7 @@ class RobertaOutput(nn.Module):
         super().__init__()
         self.dense = NpuLinear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
         hidden_states = self.dense(hidden_states)
@@ -1272,7 +1272,7 @@ class RobertaForMultipleChoice(RobertaPreTrainedModel):
         super().__init__(config)
 
         self.roberta = RobertaModel(config)
-        self.dropout = nn.DropoutWithByteMask(config.hidden_dropout_prob)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, 1)
 
         # Initialize weights and apply final processing
@@ -1370,7 +1370,7 @@ class RobertaForTokenClassification(RobertaPreTrainedModel):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
         )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
 
         # Initialize weights and apply final processing
@@ -1445,7 +1445,7 @@ class RobertaClassificationHead(nn.Module):
         classifier_dropout = (
             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
-        self.dropout = nn.DropoutWithByteMask(classifier_dropout)
+        self.dropout = torch_npu.contrib.module.npu_modules.DropoutWithByteMask(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
 
     def forward(self, features, **kwargs):
--
Gitee