From 782c2f0190ba49739e0011d3097f35868552430c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 27 Mar 2025 20:33:53 +0800 Subject: [PATCH 01/18] =?UTF-8?q?mask=E7=94=9F=E6=88=90=E5=BD=92=E4=B8=80?= =?UTF-8?q?=E9=87=8D=E6=9E=84=E5=92=8Calibi=E9=87=8D=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../basic_feature/generate_mask/__init__.py | 0 .../basic_feature/generate_mask/adaptor.py | 25 +++ .../generate_mask/generate_mask.py | 76 ++++++++ .../core/transformer/dot_product_attention.py | 8 +- .../positional_encoding/alibi/__init__.py | 0 .../positional_encoding/alibi/adaptor.py | 182 ++++++++++++++++++ .../positional_encoding/alibi/alibi.py | 83 ++++++++ .../positional_encoding/alibi/alibi_utils.py | 58 ++++++ mindspeed/features_manager/__init__.py | 6 + .../features_manager/transformer/__init__.py | 0 .../transformer/alibi_feature.py | 76 ++++++++ .../transformer/generate_mask_feature.py | 16 ++ 12 files changed, 526 insertions(+), 4 deletions(-) create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/__init__.py create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/__init__.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/adaptor.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/alibi.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py create mode 100644 mindspeed/features_manager/transformer/__init__.py create mode 100644 mindspeed/features_manager/transformer/alibi_feature.py create mode 100644 mindspeed/features_manager/transformer/generate_mask_feature.py diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py b/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py new file mode 100644 index 00000000..a287aa4d --- /dev/null +++ b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py @@ -0,0 +1,25 @@ +from functools import wraps + +from megatron.training import get_args +from megatron.core.transformer.enums import AttnMaskType + +from .generate_mask import get_attention_mask + + +def parallel_transformer_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, hidden_states, attention_mask, **kwargs): + args = get_args() + if attention_mask is None: + attention_mask = get_attention_mask() + return fn(self, hidden_states, attention_mask, **kwargs) + return wrapper + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + attention_mask = get_attention_mask() + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py new file mode 100644 index 00000000..e43cfc1d --- /dev/null +++ 
b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py @@ -0,0 +1,76 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(compress, device): + global _GLOBAL_ATTN_MASK + args = get_args() + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + args = get_args() + should_generate_mask = False + device = 'npu' + + if args.attention_mask_type == 'causal': + args.sparse_mode = 2 + should_generate_mask = True + compress = True + + # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask + if args.ampipe_degree > 1 and args.context_parallel_size <= 1: + args.sparse_mode = 0 + should_generate_mask = True + compress = False + + # EoD 模式 Ring Attention的实现 + # general 为基线方案,causal 为加速方案 + # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask + if args.reset_attention_mask: + if args.attention_mask_type == 'general': + args.sparse_mode = 2 + if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': + should_generate_mask = True + compress = True + else: + args.sparse_mode = 1 + should_generate_mask = False + else: + should_generate_mask = True + compress = True + + + if args.attention_mask_on_cpu: + device = 'cpu' + + if should_generate_mask: + generate_attention_mask(compress, device) + + return _GLOBAL_ATTN_MASK diff --git a/mindspeed/core/transformer/dot_product_attention.py b/mindspeed/core/transformer/dot_product_attention.py index ec881582..dff192ca 100644 --- a/mindspeed/core/transformer/dot_product_attention.py +++ b/mindspeed/core/transformer/dot_product_attention.py @@ -32,7 +32,7 @@ from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid get_ring_group_for_intra_window, get_ring_group_for_intra_window_send_recv_overlap) from mindspeed.core.tensor_parallel_y_union_cp import TensorParallelYUnionCP -from mindspeed.model.transformer import get_attention_mask +# from mindspeed.model.transformer import get_attention_mask from mindspeed.utils import get_actual_seq_len from mindspeed.core.context_parallel.adaptive_context_parallel import adaptive_attn_context_parallel from mindspeed.core.context_parallel.utils import get_scheduling_info @@ -158,9 +158,9 @@ def dot_product_attention_init_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - attention_mask = get_attention_mask() + # if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + # if not getattr(self.config, 'is_llava', False): + # attention_mask = 
get_attention_mask() if get_args().use_flash_attn: return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) diff --git a/mindspeed/core/transformer/positional_encoding/alibi/__init__.py b/mindspeed/core/transformer/positional_encoding/alibi/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py new file mode 100644 index 00000000..76fe1ff6 --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -0,0 +1,182 @@ +from functools import wraps + +import torch +from megatron.training import get_args, mpu, tensor_parallel + +from .alibi import Alibi, AlibiForFusionAttnSingleton +from .alibi_utils import _build_alibi_tensor, _get_inverted_mask + + +def core_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *arg, **kwargs): + fn(self, *arg, **kwargs) + + args = get_args() + self.hidden_size_per_partition = self.hidden_size_per_partition // arg[1].context_parallel_size + self.square_alibi_mask = args.square_alibi_mask + self.fill_neg_inf = args.fill_neg_inf + self.beta = 1.0 + self.config = arg[1] + if self.apply_query_key_layer_scaling: + self.beta = 1.0 / self.layer_number + if args.position_embedding_type == 'alibi': + self.alibi = Alibi() + alibi = _build_alibi_tensor(args.seq_length, + self.config.num_attention_heads, + args.square_alibi_mask, + args.fill_neg_inf + ).to(torch.cuda.current_device()) + if self.config.params_dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif self.config.params_dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + self.alibi.alibi = alibi + else: + self.alibi = None + + return wrapper + + +def core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.reshape(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + if self.alibi is None: + matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, "mpu") + + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2), + beta=0.0, alpha=(1.0 / self.norm_factor)) + else: + if self.alibi.matmul_result is None or self.alibi.output_size != output_size: + args = get_args() + + self.alibi.output_size = output_size + alibi = _build_alibi_tensor(args.seq_length, + self.config.num_attention_heads, + args.square_alibi_mask, + args.fill_neg_inf + ).to(torch.cuda.current_device()) + if self.config.params_dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif self.config.params_dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + self.alibi.alibi = alibi + + if self.fill_neg_inf: + _alibi = self.alibi.alibi[:, :output_size[3], :output_size[3]] + attention_mask = attention_mask.repeat(output_size[0], 1, 1, 1)[:output_size[0], :, :, :] + self.alibi.matmul_result = _get_inverted_mask(attention_mask, _alibi).view(-1, output_size[2], + output_size[2]).contiguous() + else: + self.alibi.matmul_result = self.alibi.alibi[:, :, :output_size[3]].repeat(output_size[0], 1, 1) + + q_trans = query_layer.transpose(0, 1).contiguous() + k_trans = key_layer.transpose(0, 1).transpose(1, 2).contiguous() + matmul_result = self.beta * self.alibi.matmul_result + torch.bmm(q_trans, k_trans) * (1.0 / self.norm_factor) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.square_alibi_mask: + attention_scores = torch.max( + attention_scores, torch.tensor(torch.finfo(attention_scores.dtype).min) + ) + attention_probs = torch.nn.functional.softmax(attention_scores, -1) + else: + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +def flash_self_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *arg, **kwargs): + fn(self, *arg, **kwargs) + args = get_args() + + self.pse = None + self.pse_type = args.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(args.seq_length, + args.num_attention_heads, + args.params_dtype, + args.alibi_diagonal_opposite, + 1024) + self.pse = alibi + + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(args.num_attention_heads) + + return wrapper diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py b/mindspeed/core/transformer/positional_encoding/alibi/alibi.py new file mode 100644 index 00000000..957d334f --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/alibi.py @@ -0,0 +1,83 @@ +import threading + +import torch +from megatron.core import parallel_state + +from .alibi_utils import get_slopes + + +class Alibi: + _instance = None + alibi = None + matmul_result = None + output_size = None + lock = threading.Lock() + + def __new__(cls, *args, **kwargs): + if cls._instance: + return cls._instance + else: + with cls.lock: + cls._instance = super().__new__(cls) + return cls._instance + + +class AlibiForFusionAttnSingleton: + _alibi_tensor_args = None + _alibi_tensor = None + + _alibi_slopes_headnum = None + _alibi_slopes = None + + @classmethod + def get_alibi_tensor_for_fusion_attn(cls, max_seq_len, num_attention_heads, dtype, neg_diagonal_opposite=False, + last_k=1024): + if cls._alibi_tensor is None or cls._alibi_tensor_args != ( + max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k): + if last_k > max_seq_len: + last_k = max_seq_len + + tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + current_head_num = num_attention_heads // tp_world_size + slopes = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(num_attention_heads) + + position_point = torch.arange(max_seq_len) - max_seq_len + 1 + diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) + + position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) + position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, last_k, + max_seq_len) + + alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() + + if not neg_diagonal_opposite: + alibi = -torch.abs(alibi) + + alibi = alibi.unsqueeze(0) + alibi = 
torch.Tensor(alibi).npu() + if dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + + cls._alibi_tensor = alibi + cls._alibi_tensor_args = (max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k) + + return cls._alibi_tensor + + @classmethod + def get_alibi_slopes_for_fusion_attn(cls, n): + if cls._alibi_slopes is None or cls._alibi_slopes_headnum != n: + slopes = get_slopes(n) + + tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_index = parallel_state.get_tensor_model_parallel_rank() + + current_head_num = n // tp_world_size + slopes = torch.Tensor(slopes[tp_index * current_head_num: tp_index * current_head_num + current_head_num]).npu() + + cls._alibi_slopes = slopes + cls._alibi_slopes_headnum = n + return cls._alibi_slopes + + return cls._alibi_slopes \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py b/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py new file mode 100644 index 00000000..7c28313b --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py @@ -0,0 +1,58 @@ +import math + +import torch +from megatron.core import parallel_state + + +def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ + :n - closest_power_of_2] + + +def _get_inverted_mask(attention_mask, alibi): + inverted_mask = attention_mask.to(alibi.dtype) + inverted_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), float("-inf") + ) + return inverted_mask.to(alibi.device) + alibi.unsqueeze(0) + + +def _build_alibi_tensor(max_seq_len, num_attention_heads, square_alibi_mask, fill_neg_inf): + def _fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + def _buffered_future_mask(maxpos, alibi, attn_heads): + _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = _future_mask.unsqueeze(0) + alibi + return _future_mask[:attn_heads, :maxpos, :maxpos] + + slopes = torch.Tensor(get_slopes(num_attention_heads)) + if square_alibi_mask: + position_point = torch.arange(max_seq_len) - max_seq_len + 1 + position_point = position_point.unsqueeze(0).unsqueeze(0).expand(num_attention_heads, max_seq_len, -1) + diag = torch.diag(position_point[0]) + position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point + else: + alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + + # Select the part of the tensor that corresponds to our tensor parallel index. 
+ tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_index = parallel_state.get_tensor_model_parallel_rank() + alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] + + if fill_neg_inf: + return _buffered_future_mask(max_seq_len, alibi, num_attention_heads) + + return alibi diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 5902aa6c..cff07ab4 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -1,6 +1,8 @@ from .tensor_parallel.unaligned_linear_feature import UnalignedLinearFeature from .functional.profiler_default_feature import ProfilerDefaultFeature from .llava.llava_multimodal import LlavaModel +from .transformer.alibi_feature import AlibiFeature +from .transformer.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -19,4 +21,8 @@ FEATURES = ( UnalignedLinearFeature(), # llava-multimodal LlavaModel(), + # Transformer features + AlibiFeature(), + # Transformer features + GenerateMaskFeature() ) diff --git a/mindspeed/features_manager/transformer/__init__.py b/mindspeed/features_manager/transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py new file mode 100644 index 00000000..e43cfc1d --- /dev/null +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -0,0 +1,76 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(compress, device): + global _GLOBAL_ATTN_MASK + args = get_args() + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + args = get_args() + should_generate_mask = False + device = 'npu' + + if args.attention_mask_type == 'causal': + args.sparse_mode = 2 + should_generate_mask = True + compress = True + + # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask + if args.ampipe_degree > 1 and args.context_parallel_size <= 1: + args.sparse_mode = 0 + should_generate_mask = True + compress = False + + # EoD 模式 Ring Attention的实现 + # general 为基线方案,causal 为加速方案 + # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask + if args.reset_attention_mask: + if args.attention_mask_type == 'general': + args.sparse_mode = 2 + if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': + should_generate_mask = True + compress = True + else: + args.sparse_mode = 1 + should_generate_mask = False + else: + should_generate_mask = True + compress = True + + + if args.attention_mask_on_cpu: + device = 'cpu' + + if should_generate_mask: + generate_attention_mask(compress, device) + + return 
_GLOBAL_ATTN_MASK diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py new file mode 100644 index 00000000..8d4283c7 --- /dev/null +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -0,0 +1,16 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class GenerateMaskFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('generate-mask') + + def register_patches(self, patch_manager, args): + from mindspeed.model.transformer import parallel_transformer_forward_wrapper + from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', + parallel_transformer_forward_wrapper) + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper) \ No newline at end of file -- Gitee From 5f7b8971d33667f591098d5d8b1e236a83035fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 28 Mar 2025 11:13:31 +0800 Subject: [PATCH 02/18] =?UTF-8?q?=E7=9B=91=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../basic_feature/generate_mask/adaptor.py | 9 +++-- .../generate_mask/generate_mask.py | 24 ++++++------ .../positional_encoding/alibi/adaptor.py | 37 +++++++------------ 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py index a287aa4d..43a4318b 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py @@ -18,8 +18,11 @@ def parallel_transformer_forward_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - attention_mask = get_attention_mask() + if ( + attention_mask is None + and self.attn_mask_type == AttnMaskType.causal: + and not getattr(self.config, 'is_llava', False) + ): + attention_mask = get_attention_mask() return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py index e43cfc1d..bf72d125 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py @@ -1,9 +1,10 @@ -import warnings +from logging import getLogger import torch from megatron.training import get_args _GLOBAL_ATTN_MASK = None +LOG = getLogger(__name__) def set_attention_mask(attn_mask): @@ -15,19 +16,20 @@ def generate_attention_mask(compress, device): global _GLOBAL_ATTN_MASK args = get_args() if not args.use_flash_attn: - warnings.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, 
args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + LOG.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device), diagonal=args.next_tockens + 1)) return - if compress: - seq_len = 2048 - else: - seq_len = args.seq_length + seq_len = 2048 if compress else args.seq_length - _GLOBAL_ATTN_MASK = torch.triu( - torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) + _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), + device=device, + dtype=torch.bool), diagonal=1) def get_attention_mask(): diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py index 76fe1ff6..c60350aa 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -12,25 +12,21 @@ def core_attention_init_wrapper(fn): def wrapper(self, *arg, **kwargs): fn(self, *arg, **kwargs) - args = get_args() - self.hidden_size_per_partition = self.hidden_size_per_partition // arg[1].context_parallel_size - self.square_alibi_mask = args.square_alibi_mask - self.fill_neg_inf = args.fill_neg_inf + # self.config = arg[1] + self.hidden_size_per_partition = self.hidden_size_per_partition // self.config.context_parallel_size + self.square_alibi_mask = self.config.square_alibi_mask + self.fill_neg_inf = self.config.fill_neg_inf self.beta = 1.0 - self.config = arg[1] + if self.apply_query_key_layer_scaling: self.beta = 1.0 / self.layer_number - if args.position_embedding_type == 'alibi': + if self.config.position_embedding_type == 'alibi': self.alibi = Alibi() - alibi = _build_alibi_tensor(args.seq_length, + alibi = _build_alibi_tensor(self.config.seq_length, self.config.num_attention_heads, - args.square_alibi_mask, - args.fill_neg_inf - ).to(torch.cuda.current_device()) - if self.config.params_dtype == torch.float16: - alibi = alibi.to(torch.float16) - elif self.config.params_dtype == torch.bfloat16: - alibi = alibi.to(torch.bfloat16) + self.config.square_alibi_mask, + self.config.fill_neg_inf + ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) self.alibi.alibi = alibi else: self.alibi = None @@ -68,18 +64,13 @@ def core_attention_forward(self, query_layer, key_layer, value_layer, attention_ beta=0.0, alpha=(1.0 / self.norm_factor)) else: if self.alibi.matmul_result is None or self.alibi.output_size != output_size: - args = get_args() self.alibi.output_size = output_size - alibi = _build_alibi_tensor(args.seq_length, + alibi = _build_alibi_tensor(self.config.seq_length, self.config.num_attention_heads, - args.square_alibi_mask, - args.fill_neg_inf - ).to(torch.cuda.current_device()) - if self.config.params_dtype == torch.float16: - alibi = alibi.to(torch.float16) - elif self.config.params_dtype == torch.bfloat16: - alibi = alibi.to(torch.bfloat16) + self.config.square_alibi_mask, + self.config.fill_neg_inf + ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) self.alibi.alibi = alibi if self.fill_neg_inf: -- Gitee From 
81747d9c9e29ad26d05197d918267e7384e462fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 28 Mar 2025 15:40:30 +0800 Subject: [PATCH 03/18] bug fix --- .../core/transformer/dot_product_attention.py | 8 +- .../transformer/alibi_feature.py | 123 +++++++----------- .../transformer/generate_mask_feature.py | 2 +- 3 files changed, 52 insertions(+), 81 deletions(-) diff --git a/mindspeed/core/transformer/dot_product_attention.py b/mindspeed/core/transformer/dot_product_attention.py index dff192ca..ec881582 100644 --- a/mindspeed/core/transformer/dot_product_attention.py +++ b/mindspeed/core/transformer/dot_product_attention.py @@ -32,7 +32,7 @@ from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid get_ring_group_for_intra_window, get_ring_group_for_intra_window_send_recv_overlap) from mindspeed.core.tensor_parallel_y_union_cp import TensorParallelYUnionCP -# from mindspeed.model.transformer import get_attention_mask +from mindspeed.model.transformer import get_attention_mask from mindspeed.utils import get_actual_seq_len from mindspeed.core.context_parallel.adaptive_context_parallel import adaptive_attn_context_parallel from mindspeed.core.context_parallel.utils import get_scheduling_info @@ -158,9 +158,9 @@ def dot_product_attention_init_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - # if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - # if not getattr(self.config, 'is_llava', False): - # attention_mask = get_attention_mask() + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + attention_mask = get_attention_mask() if get_args().use_flash_attn: return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py index e43cfc1d..21948a68 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -1,76 +1,47 @@ -import warnings - -import torch -from megatron.training import get_args - -_GLOBAL_ATTN_MASK = None - - -def set_attention_mask(attn_mask): - global _GLOBAL_ATTN_MASK - _GLOBAL_ATTN_MASK = attn_mask - - -def generate_attention_mask(compress, device): - global _GLOBAL_ATTN_MASK - args = get_args() - if not args.use_flash_attn: - warnings.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) - return - - if compress: - seq_len = 2048 - else: - seq_len = args.seq_length - - _GLOBAL_ATTN_MASK = torch.triu( - torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) - - -def get_attention_mask(): - global _GLOBAL_ATTN_MASK - if _GLOBAL_ATTN_MASK is not None: - return _GLOBAL_ATTN_MASK - - args = get_args() - should_generate_mask = False - device = 'npu' - - if args.attention_mask_type == 'causal': - args.sparse_mode = 2 - 
should_generate_mask = True - compress = True - - # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask - if args.ampipe_degree > 1 and args.context_parallel_size <= 1: - args.sparse_mode = 0 - should_generate_mask = True - compress = False - - # EoD 模式 Ring Attention的实现 - # general 为基线方案,causal 为加速方案 - # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask - if args.reset_attention_mask: - if args.attention_mask_type == 'general': - args.sparse_mode = 2 - if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': - should_generate_mask = True - compress = True - else: - args.sparse_mode = 1 - should_generate_mask = False - else: - should_generate_mask = True - compress = True - - - if args.attention_mask_on_cpu: - device = 'cpu' - - if should_generate_mask: - generate_attention_mask(compress, device) - - return _GLOBAL_ATTN_MASK +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class AlibiFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('alibi') + + def register_args(self, parser: ArgumentParser): + self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') + + group = parser.add_argument_group(title='alibi') + group.add_argument('--square-alibi-mask', + action='store_true', + default=False, + help='attention mask of alibi is squared') + group.add_argument('--fill-neg-inf', + action='store_true', + default=False, + help='fill alibi with negative inf') + + group.add_argument('--alibi-fusion-attn-type', + type=int, + help='alibi pse type, support for 0,2,3') + group.add_argument('--alibi-diagonal-opposite', + action='store_true', + default=False, + help='make alibi diagonal opposite') + + def validate_args(self, args): + if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: + raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') + # alibi type [2, 3] is only support FA2 + if args.alibi_fusion_attn_type in [2, 3]: + args.use_fusion_attn_v2 = True + if args.use_fusion_attn_v2: + args.use_flash_attn = True + print("[WARNING] \"use_fusion_attn_v2\" is not recommended. 
This feature is not officially released.") + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.positional_encoding.alibi.adaptor import flash_self_attention_init_wrapper #l0 + from mindspeed.core.transformer.positional_encoding.alibi.adaptor import core_attention_init_wrapper, core_attention_forward #l2 + patch_manager.register_patch('megatron.legacy.model.transformer.FlashSelfAttention.__init__', + flash_self_attention_init_wrapper) + patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.__init__', core_attention_init_wrapper) + patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.forward', core_attention_forward) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py index 8d4283c7..2858d924 100644 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -8,7 +8,7 @@ class GenerateMaskFeature(MindSpeedFeature): super().__init__('generate-mask') def register_patches(self, patch_manager, args): - from mindspeed.model.transformer import parallel_transformer_forward_wrapper + from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) -- Gitee From b7a0b9897e6bb284797f713a79c6adae10859da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 10:31:12 +0800 Subject: [PATCH 04/18] =?UTF-8?q?feature=E6=B7=BB=E5=8A=A0optimization=5Fl?= =?UTF-8?q?evel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindspeed/features_manager/transformer/alibi_feature.py | 2 +- mindspeed/features_manager/transformer/generate_mask_feature.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py index 21948a68..37e174b7 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -5,7 +5,7 @@ from mindspeed.features_manager.feature import MindSpeedFeature class AlibiFeature(MindSpeedFeature): def __init__(self): - super().__init__('alibi') + super().__init__('pse-alibi', optimization_level=2) def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py index 2858d924..a839f8f9 100644 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -5,7 +5,7 @@ from mindspeed.features_manager.feature import MindSpeedFeature class GenerateMaskFeature(MindSpeedFeature): def __init__(self): - super().__init__('generate-mask') + super().__init__('generate-mask', optimization_level=0) def register_patches(self, patch_manager, args): from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper -- Gitee From 
471e9777ed84c2390c80dacdf680db42f7cc29ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 11:28:32 +0800 Subject: [PATCH 05/18] =?UTF-8?q?=E5=86=B2=E7=AA=81=E8=A7=A3=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindspeed/features_manager/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 60988fa0..806b5b46 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -11,6 +11,9 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel +from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.generate_mask_feature import GenerateMaskFeature + FEATURES_LIST = [ # Functional features ProfilerDefaultFeature(), @@ -39,4 +42,8 @@ FEATURES_LIST_V2 = ( # llava-multimodal LlavaModel(), + + # Transformer features + AlibiFeature(), + GenerateMaskFeature() ) -- Gitee From 15e57deb16a99e33ad62eb25363e3277e1bf2dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 14:56:20 +0800 Subject: [PATCH 06/18] =?UTF-8?q?mask=E7=94=9F=E6=88=90=E5=BD=92=E4=B8=80?= =?UTF-8?q?=E7=A7=BB=E5=8A=A8=E5=88=B0megatron=5Fbasic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../transformer_basic.py} | 24 ++++++++++++++++ .../basic_feature/generate_mask/__init__.py | 0 .../basic_feature/generate_mask/adaptor.py | 28 ------------------- mindspeed/features_manager/__init__.py | 4 +-- .../megatron_basic/megatron_basic.py | 5 +++- .../transformer/generate_mask_feature.py | 16 ----------- 6 files changed, 29 insertions(+), 48 deletions(-) rename mindspeed/core/{transformer/basic_feature/generate_mask/generate_mask.py => megatron_basic/transformer_basic.py} (74%) delete mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/__init__.py delete mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py delete mode 100644 mindspeed/features_manager/transformer/generate_mask_feature.py diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/megatron_basic/transformer_basic.py similarity index 74% rename from mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py rename to mindspeed/core/megatron_basic/transformer_basic.py index bf72d125..65ae654c 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py +++ b/mindspeed/core/megatron_basic/transformer_basic.py @@ -1,7 +1,9 @@ +from functools import wraps from logging import getLogger import torch from megatron.training import get_args +from megatron.core.transformer.enums import AttnMaskType _GLOBAL_ATTN_MASK = None LOG = getLogger(__name__) @@ -76,3 +78,25 @@ def get_attention_mask(): generate_attention_mask(compress, device) return _GLOBAL_ATTN_MASK + + +def parallel_transformer_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, hidden_states, attention_mask, **kwargs): + args = get_args() + if attention_mask is None: + attention_mask = get_attention_mask() + return fn(self, hidden_states, attention_mask, **kwargs) + return wrapper + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, attention_mask, attn_mask_type, 
attention_bias, packed_seq_params): + if ( + attention_mask is None + and self.attn_mask_type == AttnMaskType.causal: + and not getattr(self.config, 'is_llava', False) + ): + attention_mask = get_attention_mask() + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py b/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py deleted file mode 100644 index 43a4318b..00000000 --- a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py +++ /dev/null @@ -1,28 +0,0 @@ -from functools import wraps - -from megatron.training import get_args -from megatron.core.transformer.enums import AttnMaskType - -from .generate_mask import get_attention_mask - - -def parallel_transformer_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, hidden_states, attention_mask, **kwargs): - args = get_args() - if attention_mask is None: - attention_mask = get_attention_mask() - return fn(self, hidden_states, attention_mask, **kwargs) - return wrapper - -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if ( - attention_mask is None - and self.attn_mask_type == AttnMaskType.causal: - and not getattr(self.config, 'is_llava', False) - ): - attention_mask = get_attention_mask() - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) - return wrapper \ No newline at end of file diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 806b5b46..83869db5 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -12,7 +12,6 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature -from mindspeed.features_manager.transformer.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -44,6 +43,5 @@ FEATURES_LIST_V2 = ( LlavaModel(), # Transformer features - AlibiFeature(), - GenerateMaskFeature() + AlibiFeature() ) diff --git a/mindspeed/features_manager/megatron_basic/megatron_basic.py b/mindspeed/features_manager/megatron_basic/megatron_basic.py index ef8d1874..b959e201 100644 --- a/mindspeed/features_manager/megatron_basic/megatron_basic.py +++ b/mindspeed/features_manager/megatron_basic/megatron_basic.py @@ -41,4 +41,7 @@ class MegatronBasicFeature(MindSpeedFeature): pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', start_grad_sync) pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', finish_grad_sync) - + # attention mask generate normalization + from mindspeed.core.megatron_basic.transformer_basic import parallel_transformer_forward_wrapper, dot_product_attention_forward_wrapper + pm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) + 
pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py deleted file mode 100644 index a839f8f9..00000000 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ /dev/null @@ -1,16 +0,0 @@ -from argparse import ArgumentParser - -from mindspeed.features_manager.feature import MindSpeedFeature - -class GenerateMaskFeature(MindSpeedFeature): - - def __init__(self): - super().__init__('generate-mask', optimization_level=0) - - def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper - from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', - parallel_transformer_forward_wrapper) - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', - dot_product_attention_forward_wrapper) \ No newline at end of file -- Gitee From 6bb43b1db87fc47c0451e843e4e7ecdfa84a2fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Tue, 1 Apr 2025 17:24:30 +0800 Subject: [PATCH 07/18] =?UTF-8?q?get=5Fargs=E6=9B=BF=E6=8D=A2=E4=B8=BAself?= =?UTF-8?q?.config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/megatron_basic/transformer_basic.py | 15 ++++++--------- .../positional_encoding/alibi/adaptor.py | 13 ++++++------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/mindspeed/core/megatron_basic/transformer_basic.py b/mindspeed/core/megatron_basic/transformer_basic.py index 65ae654c..49e35ee5 100644 --- a/mindspeed/core/megatron_basic/transformer_basic.py +++ b/mindspeed/core/megatron_basic/transformer_basic.py @@ -14,9 +14,8 @@ def set_attention_mask(attn_mask): _GLOBAL_ATTN_MASK = attn_mask -def generate_attention_mask(compress, device): +def generate_attention_mask(args, compress, device): global _GLOBAL_ATTN_MASK - args = get_args() if not args.use_flash_attn: LOG.warn("Flash Attention is highly recommended") _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], @@ -34,12 +33,11 @@ def generate_attention_mask(compress, device): dtype=torch.bool), diagonal=1) -def get_attention_mask(): +def get_attention_mask(args): global _GLOBAL_ATTN_MASK if _GLOBAL_ATTN_MASK is not None: return _GLOBAL_ATTN_MASK - args = get_args() should_generate_mask = False device = 'npu' @@ -75,7 +73,7 @@ def get_attention_mask(): device = 'cpu' if should_generate_mask: - generate_attention_mask(compress, device) + generate_attention_mask(args, compress, device) return _GLOBAL_ATTN_MASK @@ -83,9 +81,8 @@ def get_attention_mask(): def parallel_transformer_forward_wrapper(fn): @wraps(fn) def wrapper(self, hidden_states, attention_mask, **kwargs): - args = get_args() if attention_mask is None: - attention_mask = get_attention_mask() + attention_mask = get_attention_mask(self.config) return fn(self, hidden_states, attention_mask, **kwargs) return wrapper @@ -94,9 +91,9 @@ def dot_product_attention_forward_wrapper(fn): def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): if ( 
attention_mask is None - and self.attn_mask_type == AttnMaskType.causal: + and self.attn_mask_type == AttnMaskType.causal and not getattr(self.config, 'is_llava', False) ): - attention_mask = get_attention_mask() + attention_mask = get_attention_mask(self.config) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py index c60350aa..5ae66ae0 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -152,22 +152,21 @@ def flash_self_attention_init_wrapper(fn): @wraps(fn) def wrapper(self, *arg, **kwargs): fn(self, *arg, **kwargs) - args = get_args() self.pse = None - self.pse_type = args.alibi_fusion_attn_type + self.pse_type = self.config.alibi_fusion_attn_type if self.pse_type is None: self.pse_type = 1 # not use pse elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(args.seq_length, - args.num_attention_heads, - args.params_dtype, - args.alibi_diagonal_opposite, + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, 1024) self.pse = alibi elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(args.num_attention_heads) + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) return wrapper -- Gitee From 6843eae980e9991bf827355e038f9edd89644fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 11:24:42 +0800 Subject: [PATCH 08/18] =?UTF-8?q?refactor=EF=BC=9Afa=20generate=20mask=20a?= =?UTF-8?q?nd=20ailibi=20pse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/megatron_basic/transformer_basic.py | 99 ---------- .../alibi/__init__.py | 0 .../flash_attention/alibi/adaptor.py | 99 ++++++++++ .../alibi/alibi.py | 16 +- .../alibi/alibi_utils.py | 0 .../flash_attention/generate_mask/__init__.py | 0 .../flash_attention/generate_mask/adaptor.py | 20 ++ .../generate_mask/generate_mask.py | 43 +++++ .../positional_encoding/alibi/adaptor.py | 172 ------------------ mindspeed/features_manager/__init__.py | 10 +- .../megatron_basic/megatron_basic.py | 5 +- .../transformer/flash_attention/__init__.py | 0 .../{ => flash_attention}/alibi_feature.py | 35 +++- .../fusion_attention_v2_feature.py | 34 ++++ .../flash_attention/generate_mask_feature.py | 20 ++ 15 files changed, 263 insertions(+), 290 deletions(-) delete mode 100644 mindspeed/core/megatron_basic/transformer_basic.py rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/__init__.py (100%) create mode 100644 mindspeed/core/transformer/flash_attention/alibi/adaptor.py rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/alibi.py (81%) rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/alibi_utils.py (100%) create mode 100644 mindspeed/core/transformer/flash_attention/generate_mask/__init__.py create mode 100644 mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py create mode 100644 
mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py delete mode 100644 mindspeed/core/transformer/positional_encoding/alibi/adaptor.py create mode 100644 mindspeed/features_manager/transformer/flash_attention/__init__.py rename mindspeed/features_manager/transformer/{ => flash_attention}/alibi_feature.py (58%) create mode 100644 mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py create mode 100644 mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py diff --git a/mindspeed/core/megatron_basic/transformer_basic.py b/mindspeed/core/megatron_basic/transformer_basic.py deleted file mode 100644 index 49e35ee5..00000000 --- a/mindspeed/core/megatron_basic/transformer_basic.py +++ /dev/null @@ -1,99 +0,0 @@ -from functools import wraps -from logging import getLogger - -import torch -from megatron.training import get_args -from megatron.core.transformer.enums import AttnMaskType - -_GLOBAL_ATTN_MASK = None -LOG = getLogger(__name__) - - -def set_attention_mask(attn_mask): - global _GLOBAL_ATTN_MASK - _GLOBAL_ATTN_MASK = attn_mask - - -def generate_attention_mask(args, compress, device): - global _GLOBAL_ATTN_MASK - if not args.use_flash_attn: - LOG.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, - device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, - device=device), diagonal=args.next_tockens + 1)) - return - - seq_len = 2048 if compress else args.seq_length - - _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), - device=device, - dtype=torch.bool), diagonal=1) - - -def get_attention_mask(args): - global _GLOBAL_ATTN_MASK - if _GLOBAL_ATTN_MASK is not None: - return _GLOBAL_ATTN_MASK - - should_generate_mask = False - device = 'npu' - - if args.attention_mask_type == 'causal': - args.sparse_mode = 2 - should_generate_mask = True - compress = True - - # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask - if args.ampipe_degree > 1 and args.context_parallel_size <= 1: - args.sparse_mode = 0 - should_generate_mask = True - compress = False - - # EoD 模式 Ring Attention的实现 - # general 为基线方案,causal 为加速方案 - # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask - if args.reset_attention_mask: - if args.attention_mask_type == 'general': - args.sparse_mode = 2 - if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': - should_generate_mask = True - compress = True - else: - args.sparse_mode = 1 - should_generate_mask = False - else: - should_generate_mask = True - compress = True - - - if args.attention_mask_on_cpu: - device = 'cpu' - - if should_generate_mask: - generate_attention_mask(args, compress, device) - - return _GLOBAL_ATTN_MASK - - -def parallel_transformer_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, hidden_states, attention_mask, **kwargs): - if attention_mask is None: - attention_mask = get_attention_mask(self.config) - return fn(self, hidden_states, attention_mask, **kwargs) - return wrapper - -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if ( - attention_mask is None - and self.attn_mask_type == AttnMaskType.causal - and not getattr(self.config, 'is_llava', False) - ): - attention_mask = 
get_attention_mask(self.config) - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) - return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/__init__.py b/mindspeed/core/transformer/flash_attention/alibi/__init__.py similarity index 100% rename from mindspeed/core/transformer/positional_encoding/alibi/__init__.py rename to mindspeed/core/transformer/flash_attention/alibi/__init__.py diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py new file mode 100644 index 00000000..7abcbf04 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. + +import math +from functools import wraps + +from torch import Tensor +import torch_npu + +from megatron.core.transformer.enums import AttnMaskType + +from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention + +from .alibi import AlibiForFusionAttnSingleton + +try: + from einops import rearrange +except ImportError: + rearrange = None + + +def dot_product_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *args, **kwargs): + fn(self, *args, **kwargs) + + # add pse + self.pse = None + self.pse_type = self.config.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, + 1024) + self.pse = alibi + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) + return wrapper + + +def dot_product_attention_forward_impl( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params, +): + assert attention_bias is None, "Attention bias is not supported for DotProductAttention." 
+ + if packed_seq_params is None: + seq_length, bsz, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] + else: + seq_length, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2] + + sparse_mode = self.config.sparse_mode + if attn_mask_type == AttnMaskType.no_mask: + sparse_mode = 0 # default mask + + scale = 1.0 / math.sqrt( + self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale + + if packed_seq_params is not None: # TND + actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() + actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() + query, key, value = [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] + shape_order = 'TND' + else: # SBH + actual_seq_qlen = None + actual_seq_kvlen = None + query, key, value = [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] + shape_order = 'SBH' + + output = npu_fusion_attention(query, key, value, n_head, shape_order, + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=self.config.pre_tockens, + next_tokens=self.config.next_tockens, + keep_prob=1 - self.attention_dropout.p, + inner_precise=0, + sparse_mode=sparse_mode, + actual_seq_qlen=actual_seq_qlen, + actual_seq_kvlen=actual_seq_kvlen + )[0] + + if packed_seq_params is not None: + output = rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) + + return output \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py similarity index 81% rename from mindspeed/core/transformer/positional_encoding/alibi/alibi.py rename to mindspeed/core/transformer/flash_attention/alibi/alibi.py index 957d334f..2b5f19f2 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -30,10 +30,17 @@ class AlibiForFusionAttnSingleton: _alibi_slopes = None @classmethod - def get_alibi_tensor_for_fusion_attn(cls, max_seq_len, num_attention_heads, dtype, neg_diagonal_opposite=False, + def get_alibi_tensor_for_fusion_attn(cls, + max_seq_len, + num_attention_heads, + dtype, + neg_diagonal_opposite=False, last_k=1024): - if cls._alibi_tensor is None or cls._alibi_tensor_args != ( - max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k): + if cls._alibi_tensor is None or \ + cls._alibi_tensor_args != + ( + max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k + ): if last_k > max_seq_len: last_k = max_seq_len @@ -45,7 +52,8 @@ class AlibiForFusionAttnSingleton: diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) - position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, last_k, + position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, + last_k, max_seq_len) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py similarity index 100% rename from mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py rename to mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/__init__.py 
b/mindspeed/core/transformer/flash_attention/generate_mask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py new file mode 100644 index 00000000..e2fad832 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -0,0 +1,20 @@ +from functools import wraps + +from megatron.core.transformer.enums import AttnMaskType + +from .generate_mask import get_attention_mask + + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params): + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + self.config.sparse_mode = 2 + attention_mask = get_attention_mask(self.config) + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py new file mode 100644 index 00000000..9ee07218 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -0,0 +1,43 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(args, compress, device): + global _GLOBAL_ATTN_MASK + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) + \ + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(args): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + device = 'npu' + compress = True + + generate_attention_mask(args, compress, device) + + return _GLOBAL_ATTN_MASK \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py deleted file mode 100644 index 5ae66ae0..00000000 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ /dev/null @@ -1,172 +0,0 @@ -from functools import wraps - -import torch -from megatron.training import get_args, mpu, tensor_parallel - -from .alibi import Alibi, AlibiForFusionAttnSingleton -from .alibi_utils import _build_alibi_tensor, _get_inverted_mask - - -def core_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *arg, **kwargs): - fn(self, *arg, **kwargs) - - # self.config = arg[1] - self.hidden_size_per_partition = self.hidden_size_per_partition // self.config.context_parallel_size - self.square_alibi_mask = self.config.square_alibi_mask - self.fill_neg_inf = self.config.fill_neg_inf - self.beta = 1.0 - - if self.apply_query_key_layer_scaling: - self.beta = 1.0 / self.layer_number - if 
self.config.position_embedding_type == 'alibi': - self.alibi = Alibi() - alibi = _build_alibi_tensor(self.config.seq_length, - self.config.num_attention_heads, - self.config.square_alibi_mask, - self.config.fill_neg_inf - ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) - self.alibi.alibi = alibi - else: - self.alibi = None - - return wrapper - - -def core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - # =================================== - # Raw attention scores. [b, np, s, s] - # =================================== - - # [b, np, sq, sk] - output_size = (query_layer.size(1), - query_layer.size(2), - query_layer.size(0), - key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.reshape(output_size[2], - output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) - - if self.alibi is None: - matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( - (output_size[0] * output_size[1], output_size[2], output_size[3]), - query_layer.dtype, "mpu") - - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), - key_layer.transpose(0, 1).transpose(1, 2), - beta=0.0, alpha=(1.0 / self.norm_factor)) - else: - if self.alibi.matmul_result is None or self.alibi.output_size != output_size: - - self.alibi.output_size = output_size - alibi = _build_alibi_tensor(self.config.seq_length, - self.config.num_attention_heads, - self.config.square_alibi_mask, - self.config.fill_neg_inf - ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) - self.alibi.alibi = alibi - - if self.fill_neg_inf: - _alibi = self.alibi.alibi[:, :output_size[3], :output_size[3]] - attention_mask = attention_mask.repeat(output_size[0], 1, 1, 1)[:output_size[0], :, :, :] - self.alibi.matmul_result = _get_inverted_mask(attention_mask, _alibi).view(-1, output_size[2], - output_size[2]).contiguous() - else: - self.alibi.matmul_result = self.alibi.alibi[:, :, :output_size[3]].repeat(output_size[0], 1, 1) - - q_trans = query_layer.transpose(0, 1).contiguous() - k_trans = key_layer.transpose(0, 1).transpose(1, 2).contiguous() - matmul_result = self.beta * self.alibi.matmul_result + torch.bmm(q_trans, k_trans) * (1.0 / self.norm_factor) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.square_alibi_mask: - attention_scores = torch.max( - attention_scores, torch.tensor(torch.finfo(attention_scores.dtype).min) - ) - attention_probs = torch.nn.functional.softmax(attention_scores, -1) - else: - attention_probs = self.scale_mask_softmax(attention_scores, - attention_mask) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - if not self.sequence_parallel: - with tensor_parallel.get_cuda_rng_tracker().fork(): - attention_probs = self.attention_dropout(attention_probs) - else: - attention_probs = self.attention_dropout(attention_probs) - - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), - value_layer.size(2), - query_layer.size(0), - value_layer.size(3)) - - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) - - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], - output_size[2], -1) - - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -def flash_self_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *arg, **kwargs): - fn(self, *arg, **kwargs) - - self.pse = None - self.pse_type = self.config.alibi_fusion_attn_type - - if self.pse_type is None: - self.pse_type = 1 # not use pse - elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, - self.config.num_attention_heads, - self.config.params_dtype, - self.config.alibi_diagonal_opposite, - 1024) - self.pse = alibi - - elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) - - return wrapper diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 83869db5..ddf0fa69 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -11,7 +11,9 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel -from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.flash_attention.fusion_attention_v2_feature import FusionAttentionV2Feature +from mindspeed.features_manager.transformer.flash_attention.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.flash_attention.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -42,6 +44,8 @@ FEATURES_LIST_V2 = ( # llava-multimodal LlavaModel(), - # Transformer features - AlibiFeature() + # Transformer flash attention features + FusionAttentionV2Feature(), + AlibiFeature(), + GenerateMaskFeature() ) diff --git a/mindspeed/features_manager/megatron_basic/megatron_basic.py b/mindspeed/features_manager/megatron_basic/megatron_basic.py index b959e201..ef8d1874 100644 --- a/mindspeed/features_manager/megatron_basic/megatron_basic.py +++ b/mindspeed/features_manager/megatron_basic/megatron_basic.py @@ -41,7 +41,4 @@ class MegatronBasicFeature(MindSpeedFeature): pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', start_grad_sync) pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', finish_grad_sync) - # attention mask generate normalization - from mindspeed.core.megatron_basic.transformer_basic import parallel_transformer_forward_wrapper, dot_product_attention_forward_wrapper - 
pm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) - pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) \ No newline at end of file + diff --git a/mindspeed/features_manager/transformer/flash_attention/__init__.py b/mindspeed/features_manager/transformer/flash_attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py similarity index 58% rename from mindspeed/features_manager/transformer/alibi_feature.py rename to mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 37e174b7..1f1acc20 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,11 +1,32 @@ +from logging import getLogger from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature +from mindspeed.log_config import set_log_config + +LOG = getLogger(__name__) + class AlibiFeature(MindSpeedFeature): + """ + Attention positional embedding. + To enable this feature, use the following options. + + Usage: + "--position-embedding-type alibi" + "--alibi-fusion-attn-type [0, 2, 3]" + "[--alibi-diagonal-opposite]" + """ def __init__(self): - super().__init__('pse-alibi', optimization_level=2) + super().__init__('position-embedding-type', optimization_level=0) + + def is_need_apply(self, args): + pse = getattr(args, self.feature_name, None) + need_apply = False + if pse == 'alibi': + need_apply = True + return (self.optimization_level <= args.optimization_level and need_apply) or self.default_patches def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') @@ -31,7 +52,8 @@ class AlibiFeature(MindSpeedFeature): def validate_args(self, args): if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') - # alibi type [2, 3] is only support FA2 + + # alibi is only support FA2 if args.alibi_fusion_attn_type in [2, 3]: args.use_fusion_attn_v2 = True if args.use_fusion_attn_v2: @@ -39,9 +61,6 @@ class AlibiFeature(MindSpeedFeature): print("[WARNING] \"use_fusion_attn_v2\" is not recommended. 
This feature is not officially released.") def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.positional_encoding.alibi.adaptor import flash_self_attention_init_wrapper #l0 - from mindspeed.core.transformer.positional_encoding.alibi.adaptor import core_attention_init_wrapper, core_attention_forward #l2 - patch_manager.register_patch('megatron.legacy.model.transformer.FlashSelfAttention.__init__', - flash_self_attention_init_wrapper) - patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.__init__', core_attention_init_wrapper) - patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.forward', core_attention_forward) \ No newline at end of file + from mindspeed.core.transformer.flash_attention.alibi.adaptor import dot_product_attention_init_wrapper, dot_product_attention_forward_impl + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.__init__', dot_product_attention_init_wrapper) + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_impl) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py new file mode 100644 index 00000000..a47b5db2 --- /dev/null +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -0,0 +1,34 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + + +class FusionAttentionV2Feature(MindSpeedFeature): + ''' + Fusion attention v2 is an extension of fusion attention v1 + and currently only supports the alibi positional embedding. + Disabled by default. + ''' + + def __init__(self): + super().__init__('use-fusion-attn-v2', 0) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title='fusion attention v2') + group.add_argument('--use-fusion-attn-v2', + action='store_true', + default=False, + help='enalbe fusion attention v2') + group.add_argument('--pre-tockens', + type=int, + default=65536, + help='pre-tockens is used by Flash attention') + group.add_argument('--next-tockens', + type=int, + default=0, + help='next-tockens is used by Flash attention') + + def validate_args(self, args): + if args.use_fusion_attn_v2: + args.use_flash_attn = True + print("[WARNING] \"use_fusion_attn_v2\" is 
This feature is not officially released.") \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py new file mode 100644 index 00000000..3dfaa074 --- /dev/null +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -0,0 +1,20 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class GenerateMaskFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('no-create-attention-mask-in-dataloader', 0) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title='fusion attention') + group.add_argument('--sparse-mode', + type=int, + default=0, + choices=[0,1,2,3,4,5,6,7,8], + help='mask type for fusion attention') + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) -- Gitee From 9cc1eda5c7697bc58df394db75867be27e939978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 11:41:02 +0800 Subject: [PATCH 09/18] bugfix --- mindspeed/core/transformer/flash_attention/alibi/alibi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 2b5f19f2..5177ed41 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -37,7 +37,7 @@ class AlibiForFusionAttnSingleton: neg_diagonal_opposite=False, last_k=1024): if cls._alibi_tensor is None or \ - cls._alibi_tensor_args != + cls._alibi_tensor_args != \ ( max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k ): -- Gitee From f2a2f6f294fc6d1cdb4b76aef63263e21948c69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 17:19:45 +0800 Subject: [PATCH 10/18] =?UTF-8?q?clean=20code=20&=20alibi=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E5=AF=B9=E6=95=B4=E4=B8=AA=E7=B1=BBPatch=20&?= =?UTF-8?q?=20=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 128 +++++++----------- .../flash_attention/alibi/alibi.py | 78 +++++++---- .../flash_attention/alibi/alibi_utils.py | 36 +++-- .../alibi/dot_product_attention.py | 114 ++++++++++++++++ .../flash_attention/generate_mask/adaptor.py | 27 ++-- .../generate_mask/generate_mask.py | 33 ++++- .../flash_attention/alibi_feature.py | 85 ++++++++---- .../fusion_attention_v2_feature.py | 47 +++++-- .../flash_attention/generate_mask_feature.py | 31 +++-- 9 files changed, 396 insertions(+), 183 deletions(-) create mode 100644 mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index 7abcbf04..f30cc0ca 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -2,98 +2,62 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. -import math -from functools import wraps +from typing import Optional from torch import Tensor -import torch_npu - +from megatron.core.transformer.dot_product_attention import DotProductAttention as MegatronDotProductAttention +from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.enums import AttnMaskType +from megatron.core.packed_seq_params import PackedSeqParams -from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention - -from .alibi import AlibiForFusionAttnSingleton - -try: - from einops import rearrange -except ImportError: - rearrange = None +from mindspeed.core.transformer.flash_attention.alibi.dot_product_attention import DotProductAttentionImpl -def dot_product_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *args, **kwargs): - fn(self, *args, **kwargs) +class MindSpeedDotProductAttention(DotProductAttentionImpl, MegatronDotProductAttention): + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + softmax_scale: float = None, + cp_comm_type: str = None, + ): + MegatronDotProductAttention.__init__( + self, + config, + layer_number, + attn_mask_type, + attention_type, + attention_dropout, + softmax_scale, + cp_comm_type + ) + # add pse - self.pse = None - self.pse_type = self.config.alibi_fusion_attn_type - - if self.pse_type is None: - self.pse_type = 1 # not use pse - elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, - self.config.num_attention_heads, - self.config.params_dtype, - self.config.alibi_diagonal_opposite, - 1024) - self.pse = alibi - elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) - return wrapper - + DotProductAttentionImpl.__init__(self) -def dot_product_attention_forward_impl( + def forward( self, query: Tensor, key: Tensor, value: Tensor, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params, -): - assert attention_bias is None, "Attention bias is not supported for DotProductAttention." 
- - if packed_seq_params is None: - seq_length, bsz, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - else: - seq_length, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2] - - sparse_mode = self.config.sparse_mode - if attn_mask_type == AttnMaskType.no_mask: - sparse_mode = 0 # default mask - - scale = 1.0 / math.sqrt( - self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if packed_seq_params is not None: # TND - actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() - actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() - query, key, value = [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] - shape_order = 'TND' - else: # SBH - actual_seq_qlen = None - actual_seq_kvlen = None - query, key, value = [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] - shape_order = 'SBH' - - output = npu_fusion_attention(query, key, value, n_head, shape_order, - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - scale=scale, - pse_type=self.pse_type, - pre_tokens=self.config.pre_tockens, - next_tokens=self.config.next_tockens, - keep_prob=1 - self.attention_dropout.p, - inner_precise=0, - sparse_mode=sparse_mode, - actual_seq_qlen=actual_seq_qlen, - actual_seq_kvlen=actual_seq_kvlen - )[0] - - if packed_seq_params is not None: - output = rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) - - return output \ No newline at end of file + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + attention_bias: Tensor = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + output = DotProductAttentionImpl.forward( + self, + query, + key, + value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params, + ) + + return output diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 5177ed41..7614269f 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -30,31 +30,50 @@ class AlibiForFusionAttnSingleton: _alibi_slopes = None @classmethod - def get_alibi_tensor_for_fusion_attn(cls, - max_seq_len, - num_attention_heads, - dtype, - neg_diagonal_opposite=False, - last_k=1024): - if cls._alibi_tensor is None or \ - cls._alibi_tensor_args != \ - ( - max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k - ): + def get_alibi_tensor_for_fusion_attn( + cls, + max_seq_len, + num_attention_heads, + dtype, + neg_diagonal_opposite=False, + last_k=1024 + ): + if ( + cls._alibi_tensor is None or + cls._alibi_tensor_args != ( + max_seq_len, num_attention_heads, + neg_diagonal_opposite, last_k + ) + ): if last_k > max_seq_len: last_k = max_seq_len - tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_world_size = ( + parallel_state.get_tensor_model_parallel_world_size() + ) current_head_num = num_attention_heads // tp_world_size - slopes = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(num_attention_heads) + slopes = ( + AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn( + num_attention_heads + ) + ) position_point = torch.arange(max_seq_len) - max_seq_len + 1 - diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) - - position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) - position_point = position_point - diag.transpose(-1, -2)[:, 
-last_k:, :].expand(current_head_num, - last_k, - max_seq_len) + diag = torch.diag( + torch.diag(position_point) + ).unsqueeze(0).unsqueeze(0) + + position_point = ( + position_point.unsqueeze(0).unsqueeze(0).expand( + current_head_num, last_k, -1 + ) + ) + position_point = ( + position_point - + diag.transpose(-1, -2)[:, -last_k:, :].expand( + current_head_num, last_k, max_seq_len + ) + ) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() @@ -69,20 +88,33 @@ class AlibiForFusionAttnSingleton: alibi = alibi.to(torch.bfloat16) cls._alibi_tensor = alibi - cls._alibi_tensor_args = (max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k) + cls._alibi_tensor_args = ( + max_seq_len, num_attention_heads, + neg_diagonal_opposite, last_k + ) return cls._alibi_tensor @classmethod def get_alibi_slopes_for_fusion_attn(cls, n): - if cls._alibi_slopes is None or cls._alibi_slopes_headnum != n: + if ( + cls._alibi_slopes is None or + cls._alibi_slopes_headnum != n + ): slopes = get_slopes(n) - tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_world_size = ( + parallel_state.get_tensor_model_parallel_world_size() + ) tp_index = parallel_state.get_tensor_model_parallel_rank() current_head_num = n // tp_world_size - slopes = torch.Tensor(slopes[tp_index * current_head_num: tp_index * current_head_num + current_head_num]).npu() + slopes = torch.Tensor( + slopes[ + tp_index * current_head_num: + tp_index * current_head_num + current_head_num + ] + ).npu() cls._alibi_slopes = slopes cls._alibi_slopes_headnum = n diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 7c28313b..24a898c9 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -14,8 +14,10 @@ def get_slopes(n): return get_slopes_power_of_2(n) else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ - :n - closest_power_of_2] + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] + ) def _get_inverted_mask(attention_mask, alibi): @@ -26,26 +28,44 @@ def _get_inverted_mask(attention_mask, alibi): return inverted_mask.to(alibi.device) + alibi.unsqueeze(0) -def _build_alibi_tensor(max_seq_len, num_attention_heads, square_alibi_mask, fill_neg_inf): +def _build_alibi_tensor( + max_seq_len, + num_attention_heads, + square_alibi_mask, + fill_neg_inf +): def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _buffered_future_mask(maxpos, alibi, attn_heads): - _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = torch.triu( + _fill_with_neg_inf(torch.zeros([maxpos, maxpos])), + 1 + ) _future_mask = _future_mask.unsqueeze(0) + alibi return _future_mask[:attn_heads, :maxpos, :maxpos] slopes = torch.Tensor(get_slopes(num_attention_heads)) if square_alibi_mask: position_point = torch.arange(max_seq_len) - max_seq_len + 1 - position_point = position_point.unsqueeze(0).unsqueeze(0).expand(num_attention_heads, max_seq_len, -1) + position_point = ( + position_point.unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, max_seq_len, -1 + ) + ) diag = torch.diag(position_point[0]) - position_point = position_point - 
diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + position_point = ( + position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + ) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point else: - alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( - num_attention_heads, -1, -1) + alibi = ( + slopes.unsqueeze(1).unsqueeze(1) * + torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1 + ) + ) # Select the part of the tensor that corresponds to our tensor parallel index. tp_world_size = parallel_state.get_tensor_model_parallel_world_size() diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py new file mode 100644 index 00000000..67485ba0 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -0,0 +1,114 @@ +import math +from typing import Optional + +from torch import Tensor + +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.packed_seq_params import PackedSeqParams + +from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention + +from .alibi import AlibiForFusionAttnSingleton + +try: + from einops import rearrange +except ImportError: + rearrange = None + + +class DotProductAttentionImpl(): + + def __init__(self): + # add pse + self.pse = None + self.pse_type = self.config.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = ( + AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn( + self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, + 1024 + ) + ) + self.pse = alibi + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = ( + AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn( + self.config.num_attention_heads + ) + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + attention_bias: Tensor = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + assert attention_bias is None, \ + "Attention bias is not supported for DotProductAttention." 
+ + if packed_seq_params is None: + seq_length, bsz, n_head, head_dim = ( + query.shape[0], query.shape[1], query.shape[2], query.shape[3] + ) + else: + seq_length, n_head, head_dim = ( + query.shape[0], query.shape[1], query.shape[2] + ) + + sparse_mode = self.config.sparse_mode + if attn_mask_type == AttnMaskType.no_mask: + sparse_mode = 0 # default mask + + scale = ( + 1.0 / math.sqrt(self.hidden_size_per_attention_head) + if self.scale_mask_softmax.scale is None + else self.softmax_scale + ) + + if packed_seq_params is not None: # TND + actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() + actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() + query, key, value = ( + [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] + ) + shape_order = 'TND' + else: # SBH + actual_seq_qlen = None + actual_seq_kvlen = None + query, key, value = ( + [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] + ) + shape_order = 'SBH' + + output = npu_fusion_attention( + query, key, value, n_head, shape_order, + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=self.config.pre_tockens, + next_tokens=self.config.next_tockens, + keep_prob=1 - self.attention_dropout.p, + inner_precise=0, + sparse_mode=sparse_mode, + actual_seq_qlen=actual_seq_qlen, + actual_seq_kvlen=actual_seq_kvlen + )[0] + + if packed_seq_params is not None: + output = ( + rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) + ) + + return output \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index e2fad832..12cbc42a 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -7,14 +7,21 @@ from .generate_mask import get_attention_mask def dot_product_attention_forward_wrapper(fn): @wraps(fn) - def wrapper(self, query, key, value, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - self.config.sparse_mode = 2 - attention_mask = get_attention_mask(self.config) - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + def wrapper( + self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params + ): + if ( + attention_mask is None and + self.attn_mask_type == AttnMaskType.causal + ) and not getattr(self.config, 'is_llava', False): + self.config.sparse_mode = 2 + attention_mask = get_attention_mask(self.config) + return fn( + self, query, key, value, + attention_mask, attn_mask_type, attention_bias, packed_seq_params + ) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index 9ee07218..d93c16a3 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,7 +1,6 @@ import warnings import torch -from megatron.training import get_args _GLOBAL_ATTN_MASK = None @@ -15,10 +14,24 @@ def generate_attention_mask(args, compress, device): global _GLOBAL_ATTN_MASK if not args.use_flash_attn: warnings.warn("Flash 
Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) + \ - torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, device=device), diagonal=args.next_tockens + 1)) + _GLOBAL_ATTN_MASK = ( + torch.tril( + torch.ones( + [args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device + ), + diagonal=-(args.pre_tockens + 1) + ) + + torch.triu( + torch.ones( + [args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device + ), + diagonal=args.next_tockens + 1 + ) + ) return if compress: @@ -26,8 +39,14 @@ def generate_attention_mask(args, compress, device): else: seq_len = args.seq_length - _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones( + (seq_len, seq_len), + device=device, + dtype=torch.bool + ), + diagonal=1 + ) def get_attention_mask(args): diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 1f1acc20..6943a875 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -2,9 +2,6 @@ from logging import getLogger from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature -from mindspeed.log_config import set_log_config - -LOG = getLogger(__name__) class AlibiFeature(MindSpeedFeature): @@ -14,53 +11,85 @@ class AlibiFeature(MindSpeedFeature): Usage: "--position-embedding-type alibi" - "--alibi-fusion-attn-type [0, 2, 3]" + "--alibi-fusion-attn-type [2, 3]" "[--alibi-diagonal-opposite]" """ def __init__(self): - super().__init__('position-embedding-type', optimization_level=0) + super().__init__( + 'position-embedding-type', + optimization_level=2 + ) def is_need_apply(self, args): pse = getattr(args, self.feature_name, None) need_apply = False if pse == 'alibi': need_apply = True - return (self.optimization_level <= args.optimization_level and need_apply) or self.default_patches + return ( + self.optimization_level <= args.optimization_level and + need_apply + ) or self.default_patches def register_args(self, parser: ArgumentParser): - self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') + self.add_parser_argument_choices_value( + parser, + "--position-embedding-type", + 'alibi' + ) group = parser.add_argument_group(title='alibi') - group.add_argument('--square-alibi-mask', - action='store_true', - default=False, - help='attention mask of alibi is squared') - group.add_argument('--fill-neg-inf', - action='store_true', - default=False, - help='fill alibi with negative inf') + group.add_argument( + '--square-alibi-mask', + action='store_true', + default=False, + help='attention mask of alibi is squared' + ) + group.add_argument( + '--fill-neg-inf', + action='store_true', + default=False, + help='fill alibi with negative inf' + ) - group.add_argument('--alibi-fusion-attn-type', - type=int, - help='alibi pse type, support for 0,2,3') - group.add_argument('--alibi-diagonal-opposite', - action='store_true', - default=False, - help='make alibi diagonal opposite') + group.add_argument( + '--alibi-fusion-attn-type', + type=int, + help='alibi pse type, support 
for 0,2,3' + ) + group.add_argument( + '--alibi-diagonal-opposite', + action='store_true', + default=False, + help='make alibi diagonal opposite' + ) def validate_args(self, args): - if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: - raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') + if (args.alibi_fusion_attn_type is not None and + args.alibi_fusion_attn_type not in [0, 2, 3] + ): + raise AssertionError( + '--alibi-fusion-attn-type only \support for `0, 2, 3`' + ) + if args.alibi_fusion_attn_type == 0: + raise AssertionError( + 'fa_v2 only support compress model currently.\ + please use 2 or 3' + ) # alibi is only support FA2 if args.alibi_fusion_attn_type in [2, 3]: args.use_fusion_attn_v2 = True if args.use_fusion_attn_v2: args.use_flash_attn = True - print("[WARNING] \"use_fusion_attn_v2\" is not recommended. This feature is not officially released.") + print( + "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ + This feature is not officially released." + ) def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.flash_attention.alibi.adaptor import dot_product_attention_init_wrapper, dot_product_attention_forward_impl - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.__init__', dot_product_attention_init_wrapper) - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_impl) \ No newline at end of file + from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention', + MindSpeedDotProductAttention + ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index a47b5db2..df71abc8 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -11,24 +11,43 @@ class FusionAttentionV2Feature(MindSpeedFeature): ''' def __init__(self): - super().__init__('use-fusion-attn-v2', 0) + super().__init__( + 'use-fusion-attn-v2', + optimization_level=2 + ) def register_args(self, parser: ArgumentParser): group = parser.add_argument_group(title='fusion attention v2') - group.add_argument('--use-fusion-attn-v2', - action='store_true', - default=False, - help='enalbe fusion attention v2') - group.add_argument('--pre-tockens', - type=int, - default=65536, - help='pre-tockens is used by Flash attention') - group.add_argument('--next-tockens', - type=int, - default=0, - help='next-tockens is used by Flash attention') + group.add_argument( + '--use-fusion-attn-v2', + action='store_true', + default=False, + help='enalbe fusion attention v2' + ) + group.add_argument( + '--pre-tockens', + type=int, + default=65536, + help='pre-tockens is used by Flash attention' + ) + group.add_argument( + '--next-tockens', + type=int, + default=0, + help='next-tockens is used by Flash attention' + ) + group.add_argument( + '--sparse-mode', + type=int, + default=0, + choices=[0,1,2,3,4,5,6,7,8], + help='mask type for fusion attention' + ) def validate_args(self, args): if args.use_fusion_attn_v2: args.use_flash_attn = True - print("[WARNING] \"use_fusion_attn_v2\" is 
not recommended. This feature is not officially released.") \ No newline at end of file + print( + "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ + This feature is not officially released." + ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 3dfaa074..5815e49c 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,20 +1,29 @@ -from argparse import ArgumentParser - from mindspeed.features_manager.feature import MindSpeedFeature class GenerateMaskFeature(MindSpeedFeature): def __init__(self): - super().__init__('no-create-attention-mask-in-dataloader', 0) + super().__init__( + 'no-create-attention-mask-in-dataloader', + optimization_level=2 + ) + + def is_need_apply(self, args): + """Check the feature is need to apply.""" + need_apply = False + + # can't find feature name, need to enable + if getattr(args, self.feature_name, None): + need_apply = True - def register_args(self, parser: ArgumentParser): - group = parser.add_argument_group(title='fusion attention') - group.add_argument('--sparse-mode', - type=int, - default=0, - choices=[0,1,2,3,4,5,6,7,8], - help='mask type for fusion attention') + return ( + self.optimization_level <= args.optimization_level and + need_apply + ) or self.default_patches def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper + ) -- Gitee From 0c7b9e01f5096d74ffab3a331d6ecf92a4ffee95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 09:36:47 +0800 Subject: [PATCH 11/18] bugfix --- .../transformer/flash_attention/alibi_feature.py | 10 +++++----- .../flash_attention/fusion_attention_v2_feature.py | 4 ---- .../flash_attention/generate_mask_feature.py | 4 ++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 6943a875..595ae476 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -69,12 +69,12 @@ class AlibiFeature(MindSpeedFeature): args.alibi_fusion_attn_type not in [0, 2, 3] ): raise AssertionError( - '--alibi-fusion-attn-type only \support for `0, 2, 3`' + '--alibi-fusion-attn-type only support for `0, 2, 3`' ) if args.alibi_fusion_attn_type == 0: raise AssertionError( - 'fa_v2 only support compress model currently.\ - please use 2 or 3' + 'fa v2 only support compress model currently.' \ + 'please use 2 or 3' ) # alibi is only support FA2 @@ -83,8 +83,8 @@ class AlibiFeature(MindSpeedFeature): if args.use_fusion_attn_v2: args.use_flash_attn = True print( - "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ - This feature is not officially released." + '[WARNING] \"use_fusion_attn_v2\" is not recommended.' 
\ + 'This feature is not officially released.' ) def register_patches(self, patch_manager, args): diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index df71abc8..5158970d 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -47,7 +47,3 @@ class FusionAttentionV2Feature(MindSpeedFeature): def validate_args(self, args): if args.use_fusion_attn_v2: args.use_flash_attn = True - print( - "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ - This feature is not officially released." - ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 5815e49c..e45b2662 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -10,11 +10,11 @@ class GenerateMaskFeature(MindSpeedFeature): def is_need_apply(self, args): """Check the feature is need to apply.""" - need_apply = False + need_apply = True # can't find feature name, need to enable if getattr(args, self.feature_name, None): - need_apply = True + need_apply = False return ( self.optimization_level <= args.optimization_level and -- Gitee From 17e1ee5162f675a1199fe6c72ac1840098937a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 09:53:48 +0800 Subject: [PATCH 12/18] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=20&=20=E6=B7=BB=E5=8A=A0license?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 3 +- .../flash_attention/alibi/alibi.py | 20 ++++++------ .../flash_attention/alibi/alibi_utils.py | 14 ++++++++ .../alibi/dot_product_attention.py | 9 ++++-- .../flash_attention/generate_mask/adaptor.py | 6 ++-- .../generate_mask/generate_mask.py | 3 ++ .../flash_attention/alibi_feature.py | 32 ++++++++++--------- .../fusion_attention_v2_feature.py | 3 ++ .../flash_attention/generate_mask_feature.py | 6 +++- 9 files changed, 65 insertions(+), 31 deletions(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index f30cc0ca..da62f19f 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -1,6 +1,5 @@ -# coding=utf-8 -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Optional diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 7614269f..1a6891a8 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -1,9 +1,12 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import threading import torch -from megatron.core import parallel_state -from .alibi_utils import get_slopes +from megatron.core import parallel_state +from mindspeed.core.transformer.flash_attention.alibi.alibi_utils import get_slopes class Alibi: @@ -11,15 +14,14 @@ class Alibi: alibi = None matmul_result = None output_size = None - lock = threading.Lock() + _lock = threading.Lock() def __new__(cls, *args, **kwargs): - if cls._instance: - return cls._instance - else: - with cls.lock: - cls._instance = super().__new__(cls) - return cls._instance + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance class AlibiForFusionAttnSingleton: diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 24a898c9..800369aa 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import math import torch @@ -5,6 +8,17 @@ from megatron.core import parallel_state def get_slopes(n): + """ + Generate ALiBi slopes for n attention heads. + The slopes are computed based on the number of heads and follow a power-of-2 pattern. + + Args: + n (int): Number of attention heads. + + Returns: + List[float]: A list of slopes for each attention head. + """ + def get_slopes_power_of_2(n): start = (2 ** (-2 ** -(math.log2(n) - 3))) ratio = start diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py index 67485ba0..b4699876 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import math from typing import Optional @@ -7,8 +10,7 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.packed_seq_params import PackedSeqParams from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention - -from .alibi import AlibiForFusionAttnSingleton +from mindspeed.core.transformer.flash_attention.alibi.alibi import AlibiForFusionAttnSingleton try: from einops import rearrange @@ -17,6 +19,9 @@ except ImportError: class DotProductAttentionImpl(): + """ + Implementation of dot product attention with ALiBi support. + """ def __init__(self): # add pse diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index 12cbc42a..24ca64d4 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -1,8 +1,10 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from functools import wraps from megatron.core.transformer.enums import AttnMaskType - -from .generate_mask import get_attention_mask +from mindspeed.core.transformer.flash_attention.generate_mask.generate_mask import get_attention_mask def dot_product_attention_forward_wrapper(fn): diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index d93c16a3..02295e47 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import warnings import torch diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 595ae476..f443c437 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from logging import getLogger from argparse import ArgumentParser @@ -65,21 +68,20 @@ class AlibiFeature(MindSpeedFeature): ) def validate_args(self, args): - if (args.alibi_fusion_attn_type is not None and - args.alibi_fusion_attn_type not in [0, 2, 3] - ): - raise AssertionError( - '--alibi-fusion-attn-type only support for `0, 2, 3`' - ) - if args.alibi_fusion_attn_type == 0: - raise AssertionError( - 'fa v2 only support compress model currently.' \ - 'please use 2 or 3' - ) - - # alibi is only support FA2 - if args.alibi_fusion_attn_type in [2, 3]: - args.use_fusion_attn_v2 = True + if args.alibi_fusion_attn_type is not None: + if args.alibi_fusion_attn_type not in [0, 2, 3]: + raise AssertionError( + '--alibi-fusion-attn-type only support for `0, 2, 3`' + ) + if args.alibi_fusion_attn_type == 0: + raise AssertionError( + 'fa v2 only support compress model currently. ' + 'please use 2 or 3' + ) + # alibi is only support FA2 + if args.alibi_fusion_attn_type in [2, 3]: + args.use_fusion_attn_v2 = True + if args.use_fusion_attn_v2: args.use_flash_attn = True print( diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index 5158970d..2496f90c 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index e45b2662..23c52684 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,5 +1,9 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from mindspeed.features_manager.feature import MindSpeedFeature + class GenerateMaskFeature(MindSpeedFeature): def __init__(self): @@ -8,7 +12,7 @@ class GenerateMaskFeature(MindSpeedFeature): optimization_level=2 ) - def is_need_apply(self, args): + def is_need_apply(self, args: Any) -> bool: """Check the feature is need to apply.""" need_apply = True -- Gitee From 94b7fa555d43af620654fdc3493c225900baafd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 10:18:53 +0800 Subject: [PATCH 13/18] bugfix --- mindspeed/features_manager/__init__.py | 2 +- .../transformer/flash_attention/generate_mask_feature.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index d3549e20..6cdb8a87 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -62,7 +62,7 @@ FEATURES_LIST_V2 = ( # Transformer flash attention features FusionAttentionV2Feature(), AlibiFeature(), - GenerateMaskFeature() + GenerateMaskFeature(), # MoeExperts use gemm MoEGmmFeature(), diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 23c52684..4ff3050a 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,6 +1,8 @@ # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
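
# Illustrative sketch of the is_need_apply gate that GenerateMaskFeature implements
# (type-annotated above): patches are registered when the requested optimization level
# is high enough and the controlling flag is not set on args. The class below is a
# stand-in; MindSpeedFeature's real base class is not shown in this patch.
from argparse import Namespace


class FeatureSketch:
    def __init__(self, feature_name, optimization_level, default_patches=False):
        self.feature_name = feature_name
        self.optimization_level = optimization_level
        self.default_patches = default_patches

    def is_need_apply(self, args) -> bool:
        # Mirrors the getattr check in the hunk above: if the flag attribute exists
        # and is truthy, the feature steps aside.
        need_apply = not getattr(args, self.feature_name, None)
        return (self.optimization_level <= args.optimization_level and need_apply) \
            or self.default_patches


feat = FeatureSketch('no_create_attention_mask_in_dataloader', optimization_level=2)
print(feat.is_need_apply(Namespace(optimization_level=2)))  # True: flag absent, level reached
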
+from typing import Any + from mindspeed.features_manager.feature import MindSpeedFeature -- Gitee From 69956541d29173dbc8c616cfaf4c8bc59b9ad052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 11:34:53 +0800 Subject: [PATCH 14/18] clean code --- .../flash_attention/fusion_attention_v2_feature.py | 2 +- .../flash_attention/generate_mask_feature.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index 2496f90c..c274b7b3 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -43,7 +43,7 @@ class FusionAttentionV2Feature(MindSpeedFeature): '--sparse-mode', type=int, default=0, - choices=[0,1,2,3,4,5,6,7,8], + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8], help='mask type for fusion attention' ) diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 4ff3050a..332d41f9 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -20,7 +20,7 @@ class GenerateMaskFeature(MindSpeedFeature): # can't find feature name, need to enable if getattr(args, self.feature_name, None): - need_apply = False + need_apply = False return ( self.optimization_level <= args.optimization_level and @@ -28,8 +28,8 @@ class GenerateMaskFeature(MindSpeedFeature): ) or self.default_patches def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', - dot_product_attention_forward_wrapper - ) + from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper + ) -- Gitee From 514078b5fa757f2a8a1d4269711da5c5e6747e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 12 Apr 2025 17:12:34 +0800 Subject: [PATCH 15/18] bugfix --- mindspeed/core/transformer/flash_attention/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mindspeed/core/transformer/flash_attention/__init__.py diff --git a/mindspeed/core/transformer/flash_attention/__init__.py b/mindspeed/core/transformer/flash_attention/__init__.py new file mode 100644 index 00000000..e69de29b -- Gitee From eefec4039dcee921c0514f22e894452f7d965fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 12 Apr 2025 17:18:36 +0800 Subject: [PATCH 16/18] add alibi ut test example --- requirements.txt | 1 + .../features/flash_attention/test_alibi.py | 74 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py diff --git a/requirements.txt b/requirements.txt index 1044aea4..b7e8f1c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ einops scipy sentencepiece 
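
# Illustrative sketch of the argparse pattern behind the '--sparse-mode' option cleaned
# up above: 'choices' makes the parser reject any value outside the allowed set, so the
# feature code never re-validates it. The parser here is a stand-in for Megatron's.
from argparse import ArgumentParser

parser = ArgumentParser()
group = parser.add_argument_group(title='fusion attention v2')
group.add_argument('--sparse-mode', type=int, default=0,
                   choices=[0, 1, 2, 3, 4, 5, 6, 7, 8],
                   help='mask type for fusion attention')

print(parser.parse_args(['--sparse-mode', '2']).sparse_mode)  # 2
# parser.parse_args(['--sparse-mode', '9']) would exit with "invalid choice: 9"
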
pytest +pytest-mock tokenizers<=0.20.3 transformers>=4.43.2 gpytorch diff --git a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py new file mode 100644 index 00000000..86bf889c --- /dev/null +++ b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py @@ -0,0 +1,74 @@ +import pytest +import torch +import torch_npu + +from mindspeed import megatron_adaptor_v2 + +from megatron.training.global_vars import set_args +from megatron.training.arguments import parse_args +from megatron.core.transformer.transformer_config import TransformerConfig + +from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention + + +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +def run_fusion_attn_with_pse_alibi(bs, seq_len, dtype): + from megatron.core.transformer.enums import AttnMaskType + + args = parse_args(None, True) + set_args(args) + + config = TransformerConfig( + num_layers=2, + hidden_size=32, + num_attention_heads=4, + attention_dropout=0.0, + params_dtype = dtype + ) + + # extra arguments mindspeed needed + config.use_flash_attn = True + config.use_fusion_attn_v2 = True + config.alibi_fusion_attn_type = 2 + config.sparse_mode = 2 + config.seq_length = seq_len + config.alibi_diagonal_opposite = False + + attn = MindSpeedDotProductAttention( + config=config, + layer_number=1, + attn_mask_type=AttnMaskType.causal, + attention_type='self' + ) + + # attn.pse should exist and not be None + assert attn.pse is not None + + b, n, s, d = bs, 4, seq_len, 8 + + q = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + k = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + v = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + + # global attn mask will be generated at DotProductAttention forward wrapper + out = attn(q, k, v, None, None, None, None) + assert isinstance(out, torch.Tensor) + + +class TestAlibi(): + + @pytest.mark.skipif(DEVICE_NAME != 'Ascend910B', reason='device type is not supported, skip this UT!') + def test_alibi(self, mocker): + mock_world_size = mocker.patch( + "megatron.core.parallel_state.get_tensor_model_parallel_world_size", + return_value=1 + ) + mock_rank = mocker.patch( + "megatron.core.parallel_state.get_tensor_model_parallel_rank", + return_value=0 + ) + run_fusion_attn_with_pse_alibi(2, 256, torch.bfloat16) + mock_world_size.assert_called() + mock_rank.assert_called_once() -- Gitee From cd2edfdf89cd8ffe6cb7ac96f6a8cdb129fffd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Mon, 14 Apr 2025 09:44:26 +0800 Subject: [PATCH 17/18] clean code --- .../unit_tests/features/flash_attention/test_alibi.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py index 86bf889c..dd002ad8 100644 --- a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py +++ b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py @@ -2,15 +2,13 @@ import pytest import torch import torch_npu -from mindspeed import megatron_adaptor_v2 - from megatron.training.global_vars import set_args from megatron.training.arguments import parse_args from megatron.core.transformer.transformer_config import TransformerConfig +from mindspeed import megatron_adaptor_v2 from mindspeed.core.transformer.flash_attention.alibi.adaptor 
import MindSpeedDotProductAttention - DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -25,7 +23,7 @@ def run_fusion_attn_with_pse_alibi(bs, seq_len, dtype): hidden_size=32, num_attention_heads=4, attention_dropout=0.0, - params_dtype = dtype + params_dtype=dtype ) # extra arguments mindspeed needed -- Gitee From 1eea1395843ad173228aa98b741f6cf4cbf67914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 17 Apr 2025 14:28:50 +0800 Subject: [PATCH 18/18] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 24 +------------------ .../flash_attention/alibi/alibi.py | 2 +- .../flash_attention/alibi/alibi_utils.py | 6 ++--- .../alibi/dot_product_attention.py | 2 +- .../flash_attention/generate_mask/adaptor.py | 10 ++++---- .../generate_mask/generate_mask.py | 2 +- .../flash_attention/alibi_feature.py | 12 +++++----- .../flash_attention/generate_mask_feature.py | 8 +++---- 8 files changed, 22 insertions(+), 44 deletions(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index da62f19f..24be3b40 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Optional @@ -38,25 +38,3 @@ class MindSpeedDotProductAttention(DotProductAttentionImpl, MegatronDotProductAt # add pse DotProductAttentionImpl.__init__(self) - def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - attention_mask: Tensor, - attn_mask_type: AttnMaskType = None, - attention_bias: Tensor = None, - packed_seq_params: Optional[PackedSeqParams] = None, - ): - output = DotProductAttentionImpl.forward( - self, - query, - key, - value, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params, - ) - - return output diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 1a6891a8..df138999 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import threading diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 800369aa..31d879a2 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
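
# Illustrative sketch of the pytest-mock pattern used by the new TestAlibi test:
# mocker.patch swaps a callable for a MagicMock with a fixed return value and records
# how it was called. os.cpu_count is patched here only to keep the sketch self-contained;
# the real test patches the megatron.core.parallel_state helpers instead.
import os


def test_patched_cpu_count(mocker):  # the 'mocker' fixture comes from pytest-mock
    fake = mocker.patch("os.cpu_count", return_value=1)
    assert os.cpu_count() == 1   # the patched value is returned
    fake.assert_called_once()    # and the call was recorded
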
import math @@ -29,7 +29,7 @@ def get_slopes(n): else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) return ( - get_slopes_power_of_2(closest_power_of_2) + + get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] ) @@ -75,7 +75,7 @@ def _build_alibi_tensor( alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point else: alibi = ( - slopes.unsqueeze(1).unsqueeze(1) * + slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( num_attention_heads, -1, -1 ) diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py index b4699876..4d930c27 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import math diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index 24ca64d4..f27dae85 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from functools import wraps @@ -10,10 +10,10 @@ from mindspeed.core.transformer.flash_attention.generate_mask.generate_mask impo def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper( - self, query, key, value, - attention_mask, - attn_mask_type, - attention_bias, + self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, packed_seq_params ): if ( diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index 02295e47..b4bad6d0 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import warnings diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index f443c437..a190780f 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
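
# Illustrative sketch of what the get_slopes / _build_alibi_tensor code touched above
# computes: per-head slopes on the ALiBi power-of-2 schedule, multiplied by token
# positions to give a (num_heads, 1, seq_len) additive bias. Toy sizes, torch only;
# the body of get_slopes follows the standard ALiBi formulation.
import math

import torch


def get_slopes(n):
    # For n a power of two: a geometric sequence starting at 2 ** (-8 / n).
    def power_of_2(n):
        start = 2 ** (-2 ** -(math.log2(n) - 3))
        return [start * (start ** i) for i in range(n)]

    if math.log2(n).is_integer():
        return power_of_2(n)
    # Otherwise interleave the two nearest power-of-two schedules.
    closest = 2 ** math.floor(math.log2(n))
    return power_of_2(closest) + get_slopes(2 * closest)[0::2][:n - closest]


num_heads, seq_len = 4, 8
slopes = torch.tensor(get_slopes(num_heads))                 # (num_heads,)
positions = torch.arange(seq_len).unsqueeze(0).unsqueeze(0)  # (1, 1, seq_len)
alibi = slopes.unsqueeze(1).unsqueeze(1) * positions         # (num_heads, 1, seq_len)
print(alibi.shape)  # torch.Size([4, 1, 8])
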
from logging import getLogger @@ -20,7 +20,7 @@ class AlibiFeature(MindSpeedFeature): def __init__(self): super().__init__( - 'position-embedding-type', + 'position-embedding-type', optimization_level=2 ) @@ -30,14 +30,14 @@ class AlibiFeature(MindSpeedFeature): if pse == 'alibi': need_apply = True return ( - self.optimization_level <= args.optimization_level and + self.optimization_level <= args.optimization_level and need_apply ) or self.default_patches def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value( - parser, - "--position-embedding-type", + parser, + "--position-embedding-type", 'alibi' ) @@ -92,6 +92,6 @@ class AlibiFeature(MindSpeedFeature): def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention', + 'megatron.core.transformer.dot_product_attention.DotProductAttention', MindSpeedDotProductAttention ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 332d41f9..16705975 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Any @@ -10,7 +10,7 @@ class GenerateMaskFeature(MindSpeedFeature): def __init__(self): super().__init__( - 'no-create-attention-mask-in-dataloader', + 'no-create-attention-mask-in-dataloader', optimization_level=2 ) @@ -23,13 +23,13 @@ class GenerateMaskFeature(MindSpeedFeature): need_apply = False return ( - self.optimization_level <= args.optimization_level and + self.optimization_level <= args.optimization_level and need_apply ) or self.default_patches def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper ) -- Gitee
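
Patch 18 deletes the explicit forward override from MindSpeedDotProductAttention; this is safe because Python's method resolution order already prefers DotProductAttentionImpl.forward over the Megatron base class. A minimal sketch of that lookup rule, with stand-in class names (the real bases are not reproduced here):

class MegatronBase:
    def forward(self):
        return "megatron forward"


class Impl:
    def forward(self):
        return "mindspeed impl forward"


class Adaptor(Impl, MegatronBase):
    # No forward here: the MRO (Adaptor -> Impl -> MegatronBase) resolves it.
    pass


print([c.__name__ for c in Adaptor.__mro__])  # ['Adaptor', 'Impl', 'MegatronBase', 'object']
print(Adaptor().forward())                    # 'mindspeed impl forward'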