From 782c2f0190ba49739e0011d3097f35868552430c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 27 Mar 2025 20:33:53 +0800 Subject: [PATCH 01/18] =?UTF-8?q?mask=E7=94=9F=E6=88=90=E5=BD=92=E4=B8=80?= =?UTF-8?q?=E9=87=8D=E6=9E=84=E5=92=8Calibi=E9=87=8D=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../basic_feature/generate_mask/__init__.py | 0 .../basic_feature/generate_mask/adaptor.py | 25 +++ .../generate_mask/generate_mask.py | 76 ++++++++ .../core/transformer/dot_product_attention.py | 8 +- .../positional_encoding/alibi/__init__.py | 0 .../positional_encoding/alibi/adaptor.py | 182 ++++++++++++++++++ .../positional_encoding/alibi/alibi.py | 83 ++++++++ .../positional_encoding/alibi/alibi_utils.py | 58 ++++++ mindspeed/features_manager/__init__.py | 6 + .../features_manager/transformer/__init__.py | 0 .../transformer/alibi_feature.py | 76 ++++++++ .../transformer/generate_mask_feature.py | 16 ++ 12 files changed, 526 insertions(+), 4 deletions(-) create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/__init__.py create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py create mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/__init__.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/adaptor.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/alibi.py create mode 100644 mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py create mode 100644 mindspeed/features_manager/transformer/__init__.py create mode 100644 mindspeed/features_manager/transformer/alibi_feature.py create mode 100644 mindspeed/features_manager/transformer/generate_mask_feature.py diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py b/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py new file mode 100644 index 00000000..a287aa4d --- /dev/null +++ b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py @@ -0,0 +1,25 @@ +from functools import wraps + +from megatron.training import get_args +from megatron.core.transformer.enums import AttnMaskType + +from .generate_mask import get_attention_mask + + +def parallel_transformer_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, hidden_states, attention_mask, **kwargs): + args = get_args() + if attention_mask is None: + attention_mask = get_attention_mask() + return fn(self, hidden_states, attention_mask, **kwargs) + return wrapper + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + attention_mask = get_attention_mask() + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py new file mode 100644 index 00000000..e43cfc1d --- /dev/null +++ 
b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py @@ -0,0 +1,76 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(compress, device): + global _GLOBAL_ATTN_MASK + args = get_args() + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + args = get_args() + should_generate_mask = False + device = 'npu' + + if args.attention_mask_type == 'causal': + args.sparse_mode = 2 + should_generate_mask = True + compress = True + + # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask + if args.ampipe_degree > 1 and args.context_parallel_size <= 1: + args.sparse_mode = 0 + should_generate_mask = True + compress = False + + # EoD 模式 Ring Attention的实现 + # general 为基线方案,causal 为加速方案 + # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask + if args.reset_attention_mask: + if args.attention_mask_type == 'general': + args.sparse_mode = 2 + if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': + should_generate_mask = True + compress = True + else: + args.sparse_mode = 1 + should_generate_mask = False + else: + should_generate_mask = True + compress = True + + + if args.attention_mask_on_cpu: + device = 'cpu' + + if should_generate_mask: + generate_attention_mask(compress, device) + + return _GLOBAL_ATTN_MASK diff --git a/mindspeed/core/transformer/dot_product_attention.py b/mindspeed/core/transformer/dot_product_attention.py index ec881582..dff192ca 100644 --- a/mindspeed/core/transformer/dot_product_attention.py +++ b/mindspeed/core/transformer/dot_product_attention.py @@ -32,7 +32,7 @@ from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid get_ring_group_for_intra_window, get_ring_group_for_intra_window_send_recv_overlap) from mindspeed.core.tensor_parallel_y_union_cp import TensorParallelYUnionCP -from mindspeed.model.transformer import get_attention_mask +# from mindspeed.model.transformer import get_attention_mask from mindspeed.utils import get_actual_seq_len from mindspeed.core.context_parallel.adaptive_context_parallel import adaptive_attn_context_parallel from mindspeed.core.context_parallel.utils import get_scheduling_info @@ -158,9 +158,9 @@ def dot_product_attention_init_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - attention_mask = get_attention_mask() + # if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + # if not getattr(self.config, 'is_llava', False): + # attention_mask = 
get_attention_mask() if get_args().use_flash_attn: return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) diff --git a/mindspeed/core/transformer/positional_encoding/alibi/__init__.py b/mindspeed/core/transformer/positional_encoding/alibi/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py new file mode 100644 index 00000000..76fe1ff6 --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -0,0 +1,182 @@ +from functools import wraps + +import torch +from megatron.training import get_args, mpu, tensor_parallel + +from .alibi import Alibi, AlibiForFusionAttnSingleton +from .alibi_utils import _build_alibi_tensor, _get_inverted_mask + + +def core_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *arg, **kwargs): + fn(self, *arg, **kwargs) + + args = get_args() + self.hidden_size_per_partition = self.hidden_size_per_partition // arg[1].context_parallel_size + self.square_alibi_mask = args.square_alibi_mask + self.fill_neg_inf = args.fill_neg_inf + self.beta = 1.0 + self.config = arg[1] + if self.apply_query_key_layer_scaling: + self.beta = 1.0 / self.layer_number + if args.position_embedding_type == 'alibi': + self.alibi = Alibi() + alibi = _build_alibi_tensor(args.seq_length, + self.config.num_attention_heads, + args.square_alibi_mask, + args.fill_neg_inf + ).to(torch.cuda.current_device()) + if self.config.params_dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif self.config.params_dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + self.alibi.alibi = alibi + else: + self.alibi = None + + return wrapper + + +def core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.reshape(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + if self.alibi is None: + matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, "mpu") + + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2), + beta=0.0, alpha=(1.0 / self.norm_factor)) + else: + if self.alibi.matmul_result is None or self.alibi.output_size != output_size: + args = get_args() + + self.alibi.output_size = output_size + alibi = _build_alibi_tensor(args.seq_length, + self.config.num_attention_heads, + args.square_alibi_mask, + args.fill_neg_inf + ).to(torch.cuda.current_device()) + if self.config.params_dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif self.config.params_dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + self.alibi.alibi = alibi + + if self.fill_neg_inf: + _alibi = self.alibi.alibi[:, :output_size[3], :output_size[3]] + attention_mask = attention_mask.repeat(output_size[0], 1, 1, 1)[:output_size[0], :, :, :] + self.alibi.matmul_result = _get_inverted_mask(attention_mask, _alibi).view(-1, output_size[2], + output_size[2]).contiguous() + else: + self.alibi.matmul_result = self.alibi.alibi[:, :, :output_size[3]].repeat(output_size[0], 1, 1) + + q_trans = query_layer.transpose(0, 1).contiguous() + k_trans = key_layer.transpose(0, 1).transpose(1, 2).contiguous() + matmul_result = self.beta * self.alibi.matmul_result + torch.bmm(q_trans, k_trans) * (1.0 / self.norm_factor) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + if self.square_alibi_mask: + attention_scores = torch.max( + attention_scores, torch.tensor(torch.finfo(attention_scores.dtype).min) + ) + attention_probs = torch.nn.functional.softmax(attention_scores, -1) + else: + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + if not self.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +def flash_self_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *arg, **kwargs): + fn(self, *arg, **kwargs) + args = get_args() + + self.pse = None + self.pse_type = args.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(args.seq_length, + args.num_attention_heads, + args.params_dtype, + args.alibi_diagonal_opposite, + 1024) + self.pse = alibi + + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(args.num_attention_heads) + + return wrapper diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py b/mindspeed/core/transformer/positional_encoding/alibi/alibi.py new file mode 100644 index 00000000..957d334f --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/alibi.py @@ -0,0 +1,83 @@ +import threading + +import torch +from megatron.core import parallel_state + +from .alibi_utils import get_slopes + + +class Alibi: + _instance = None + alibi = None + matmul_result = None + output_size = None + lock = threading.Lock() + + def __new__(cls, *args, **kwargs): + if cls._instance: + return cls._instance + else: + with cls.lock: + cls._instance = super().__new__(cls) + return cls._instance + + +class AlibiForFusionAttnSingleton: + _alibi_tensor_args = None + _alibi_tensor = None + + _alibi_slopes_headnum = None + _alibi_slopes = None + + @classmethod + def get_alibi_tensor_for_fusion_attn(cls, max_seq_len, num_attention_heads, dtype, neg_diagonal_opposite=False, + last_k=1024): + if cls._alibi_tensor is None or cls._alibi_tensor_args != ( + max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k): + if last_k > max_seq_len: + last_k = max_seq_len + + tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + current_head_num = num_attention_heads // tp_world_size + slopes = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(num_attention_heads) + + position_point = torch.arange(max_seq_len) - max_seq_len + 1 + diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) + + position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) + position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, last_k, + max_seq_len) + + alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() + + if not neg_diagonal_opposite: + alibi = -torch.abs(alibi) + + alibi = alibi.unsqueeze(0) + alibi = 
torch.Tensor(alibi).npu() + if dtype == torch.float16: + alibi = alibi.to(torch.float16) + elif dtype == torch.bfloat16: + alibi = alibi.to(torch.bfloat16) + + cls._alibi_tensor = alibi + cls._alibi_tensor_args = (max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k) + + return cls._alibi_tensor + + @classmethod + def get_alibi_slopes_for_fusion_attn(cls, n): + if cls._alibi_slopes is None or cls._alibi_slopes_headnum != n: + slopes = get_slopes(n) + + tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_index = parallel_state.get_tensor_model_parallel_rank() + + current_head_num = n // tp_world_size + slopes = torch.Tensor(slopes[tp_index * current_head_num: tp_index * current_head_num + current_head_num]).npu() + + cls._alibi_slopes = slopes + cls._alibi_slopes_headnum = n + return cls._alibi_slopes + + return cls._alibi_slopes \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py b/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py new file mode 100644 index 00000000..7c28313b --- /dev/null +++ b/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py @@ -0,0 +1,58 @@ +import math + +import torch +from megatron.core import parallel_state + + +def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ + :n - closest_power_of_2] + + +def _get_inverted_mask(attention_mask, alibi): + inverted_mask = attention_mask.to(alibi.dtype) + inverted_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), float("-inf") + ) + return inverted_mask.to(alibi.device) + alibi.unsqueeze(0) + + +def _build_alibi_tensor(max_seq_len, num_attention_heads, square_alibi_mask, fill_neg_inf): + def _fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + def _buffered_future_mask(maxpos, alibi, attn_heads): + _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = _future_mask.unsqueeze(0) + alibi + return _future_mask[:attn_heads, :maxpos, :maxpos] + + slopes = torch.Tensor(get_slopes(num_attention_heads)) + if square_alibi_mask: + position_point = torch.arange(max_seq_len) - max_seq_len + 1 + position_point = position_point.unsqueeze(0).unsqueeze(0).expand(num_attention_heads, max_seq_len, -1) + diag = torch.diag(position_point[0]) + position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point + else: + alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + + # Select the part of the tensor that corresponds to our tensor parallel index. 
+ tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_index = parallel_state.get_tensor_model_parallel_rank() + alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] + + if fill_neg_inf: + return _buffered_future_mask(max_seq_len, alibi, num_attention_heads) + + return alibi diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 5902aa6c..cff07ab4 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -1,6 +1,8 @@ from .tensor_parallel.unaligned_linear_feature import UnalignedLinearFeature from .functional.profiler_default_feature import ProfilerDefaultFeature from .llava.llava_multimodal import LlavaModel +from .transformer.alibi_feature import AlibiFeature +from .transformer.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -19,4 +21,8 @@ FEATURES = ( UnalignedLinearFeature(), # llava-multimodal LlavaModel(), + # Transformer features + AlibiFeature(), + # Transformer features + GenerateMaskFeature() ) diff --git a/mindspeed/features_manager/transformer/__init__.py b/mindspeed/features_manager/transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py new file mode 100644 index 00000000..e43cfc1d --- /dev/null +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -0,0 +1,76 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(compress, device): + global _GLOBAL_ATTN_MASK + args = get_args() + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + args = get_args() + should_generate_mask = False + device = 'npu' + + if args.attention_mask_type == 'causal': + args.sparse_mode = 2 + should_generate_mask = True + compress = True + + # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask + if args.ampipe_degree > 1 and args.context_parallel_size <= 1: + args.sparse_mode = 0 + should_generate_mask = True + compress = False + + # EoD 模式 Ring Attention的实现 + # general 为基线方案,causal 为加速方案 + # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask + if args.reset_attention_mask: + if args.attention_mask_type == 'general': + args.sparse_mode = 2 + if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': + should_generate_mask = True + compress = True + else: + args.sparse_mode = 1 + should_generate_mask = False + else: + should_generate_mask = True + compress = True + + + if args.attention_mask_on_cpu: + device = 'cpu' + + if should_generate_mask: + generate_attention_mask(compress, device) + + return 
_GLOBAL_ATTN_MASK diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py new file mode 100644 index 00000000..8d4283c7 --- /dev/null +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -0,0 +1,16 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class GenerateMaskFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('generate-mask') + + def register_patches(self, patch_manager, args): + from mindspeed.model.transformer import parallel_transformer_forward_wrapper + from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', + parallel_transformer_forward_wrapper) + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper) \ No newline at end of file -- Gitee From 5f7b8971d33667f591098d5d8b1e236a83035fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 28 Mar 2025 11:13:31 +0800 Subject: [PATCH 02/18] =?UTF-8?q?=E7=9B=91=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../basic_feature/generate_mask/adaptor.py | 9 +++-- .../generate_mask/generate_mask.py | 24 ++++++------ .../positional_encoding/alibi/adaptor.py | 37 +++++++------------ 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py index a287aa4d..43a4318b 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py @@ -18,8 +18,11 @@ def parallel_transformer_forward_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - attention_mask = get_attention_mask() + if ( + attention_mask is None + and self.attn_mask_type == AttnMaskType.causal: + and not getattr(self.config, 'is_llava', False) + ): + attention_mask = get_attention_mask() return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py index e43cfc1d..bf72d125 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py @@ -1,9 +1,10 @@ -import warnings +from logging import getLogger import torch from megatron.training import get_args _GLOBAL_ATTN_MASK = None +LOG = getLogger(__name__) def set_attention_mask(attn_mask): @@ -15,19 +16,20 @@ def generate_attention_mask(compress, device): global _GLOBAL_ATTN_MASK args = get_args() if not args.use_flash_attn: - warnings.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, 
args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) + LOG.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device), diagonal=-(args.pre_tockens + 1)) \ + + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device), diagonal=args.next_tockens + 1)) return - if compress: - seq_len = 2048 - else: - seq_len = args.seq_length + seq_len = 2048 if compress else args.seq_length - _GLOBAL_ATTN_MASK = torch.triu( - torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) + _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), + device=device, + dtype=torch.bool), diagonal=1) def get_attention_mask(): diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py index 76fe1ff6..c60350aa 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -12,25 +12,21 @@ def core_attention_init_wrapper(fn): def wrapper(self, *arg, **kwargs): fn(self, *arg, **kwargs) - args = get_args() - self.hidden_size_per_partition = self.hidden_size_per_partition // arg[1].context_parallel_size - self.square_alibi_mask = args.square_alibi_mask - self.fill_neg_inf = args.fill_neg_inf + # self.config = arg[1] + self.hidden_size_per_partition = self.hidden_size_per_partition // self.config.context_parallel_size + self.square_alibi_mask = self.config.square_alibi_mask + self.fill_neg_inf = self.config.fill_neg_inf self.beta = 1.0 - self.config = arg[1] + if self.apply_query_key_layer_scaling: self.beta = 1.0 / self.layer_number - if args.position_embedding_type == 'alibi': + if self.config.position_embedding_type == 'alibi': self.alibi = Alibi() - alibi = _build_alibi_tensor(args.seq_length, + alibi = _build_alibi_tensor(self.config.seq_length, self.config.num_attention_heads, - args.square_alibi_mask, - args.fill_neg_inf - ).to(torch.cuda.current_device()) - if self.config.params_dtype == torch.float16: - alibi = alibi.to(torch.float16) - elif self.config.params_dtype == torch.bfloat16: - alibi = alibi.to(torch.bfloat16) + self.config.square_alibi_mask, + self.config.fill_neg_inf + ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) self.alibi.alibi = alibi else: self.alibi = None @@ -68,18 +64,13 @@ def core_attention_forward(self, query_layer, key_layer, value_layer, attention_ beta=0.0, alpha=(1.0 / self.norm_factor)) else: if self.alibi.matmul_result is None or self.alibi.output_size != output_size: - args = get_args() self.alibi.output_size = output_size - alibi = _build_alibi_tensor(args.seq_length, + alibi = _build_alibi_tensor(self.config.seq_length, self.config.num_attention_heads, - args.square_alibi_mask, - args.fill_neg_inf - ).to(torch.cuda.current_device()) - if self.config.params_dtype == torch.float16: - alibi = alibi.to(torch.float16) - elif self.config.params_dtype == torch.bfloat16: - alibi = alibi.to(torch.bfloat16) + self.config.square_alibi_mask, + self.config.fill_neg_inf + ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) self.alibi.alibi = alibi if self.fill_neg_inf: -- Gitee From 
81747d9c9e29ad26d05197d918267e7384e462fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 28 Mar 2025 15:40:30 +0800 Subject: [PATCH 03/18] bug fix --- .../core/transformer/dot_product_attention.py | 8 +- .../transformer/alibi_feature.py | 123 +++++++----------- .../transformer/generate_mask_feature.py | 2 +- 3 files changed, 52 insertions(+), 81 deletions(-) diff --git a/mindspeed/core/transformer/dot_product_attention.py b/mindspeed/core/transformer/dot_product_attention.py index dff192ca..ec881582 100644 --- a/mindspeed/core/transformer/dot_product_attention.py +++ b/mindspeed/core/transformer/dot_product_attention.py @@ -32,7 +32,7 @@ from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid get_ring_group_for_intra_window, get_ring_group_for_intra_window_send_recv_overlap) from mindspeed.core.tensor_parallel_y_union_cp import TensorParallelYUnionCP -# from mindspeed.model.transformer import get_attention_mask +from mindspeed.model.transformer import get_attention_mask from mindspeed.utils import get_actual_seq_len from mindspeed.core.context_parallel.adaptive_context_parallel import adaptive_attn_context_parallel from mindspeed.core.context_parallel.utils import get_scheduling_info @@ -158,9 +158,9 @@ def dot_product_attention_init_wrapper(fn): def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - # if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - # if not getattr(self.config, 'is_llava', False): - # attention_mask = get_attention_mask() + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + attention_mask = get_attention_mask() if get_args().use_flash_attn: return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py index e43cfc1d..21948a68 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -1,76 +1,47 @@ -import warnings - -import torch -from megatron.training import get_args - -_GLOBAL_ATTN_MASK = None - - -def set_attention_mask(attn_mask): - global _GLOBAL_ATTN_MASK - _GLOBAL_ATTN_MASK = attn_mask - - -def generate_attention_mask(compress, device): - global _GLOBAL_ATTN_MASK - args = get_args() - if not args.use_flash_attn: - warnings.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], dtype=bool, device=device), diagonal=args.next_tockens + 1)) - return - - if compress: - seq_len = 2048 - else: - seq_len = args.seq_length - - _GLOBAL_ATTN_MASK = torch.triu( - torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) - - -def get_attention_mask(): - global _GLOBAL_ATTN_MASK - if _GLOBAL_ATTN_MASK is not None: - return _GLOBAL_ATTN_MASK - - args = get_args() - should_generate_mask = False - device = 'npu' - - if args.attention_mask_type == 'causal': - args.sparse_mode = 2 - 
should_generate_mask = True - compress = True - - # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask - if args.ampipe_degree > 1 and args.context_parallel_size <= 1: - args.sparse_mode = 0 - should_generate_mask = True - compress = False - - # EoD 模式 Ring Attention的实现 - # general 为基线方案,causal 为加速方案 - # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask - if args.reset_attention_mask: - if args.attention_mask_type == 'general': - args.sparse_mode = 2 - if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': - should_generate_mask = True - compress = True - else: - args.sparse_mode = 1 - should_generate_mask = False - else: - should_generate_mask = True - compress = True - - - if args.attention_mask_on_cpu: - device = 'cpu' - - if should_generate_mask: - generate_attention_mask(compress, device) - - return _GLOBAL_ATTN_MASK +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class AlibiFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('alibi') + + def register_args(self, parser: ArgumentParser): + self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') + + group = parser.add_argument_group(title='alibi') + group.add_argument('--square-alibi-mask', + action='store_true', + default=False, + help='attention mask of alibi is squared') + group.add_argument('--fill-neg-inf', + action='store_true', + default=False, + help='fill alibi with negative inf') + + group.add_argument('--alibi-fusion-attn-type', + type=int, + help='alibi pse type, support for 0,2,3') + group.add_argument('--alibi-diagonal-opposite', + action='store_true', + default=False, + help='make alibi diagonal opposite') + + def validate_args(self, args): + if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: + raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') + # alibi type [2, 3] is only support FA2 + if args.alibi_fusion_attn_type in [2, 3]: + args.use_fusion_attn_v2 = True + if args.use_fusion_attn_v2: + args.use_flash_attn = True + print("[WARNING] \"use_fusion_attn_v2\" is not recommended. 
This feature is not officially released.") + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.positional_encoding.alibi.adaptor import flash_self_attention_init_wrapper #l0 + from mindspeed.core.transformer.positional_encoding.alibi.adaptor import core_attention_init_wrapper, core_attention_forward #l2 + patch_manager.register_patch('megatron.legacy.model.transformer.FlashSelfAttention.__init__', + flash_self_attention_init_wrapper) + patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.__init__', core_attention_init_wrapper) + patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.forward', core_attention_forward) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py index 8d4283c7..2858d924 100644 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -8,7 +8,7 @@ class GenerateMaskFeature(MindSpeedFeature): super().__init__('generate-mask') def register_patches(self, patch_manager, args): - from mindspeed.model.transformer import parallel_transformer_forward_wrapper + from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) -- Gitee From b7a0b9897e6bb284797f713a79c6adae10859da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 10:31:12 +0800 Subject: [PATCH 04/18] =?UTF-8?q?feature=E6=B7=BB=E5=8A=A0optimization=5Fl?= =?UTF-8?q?evel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindspeed/features_manager/transformer/alibi_feature.py | 2 +- mindspeed/features_manager/transformer/generate_mask_feature.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/alibi_feature.py index 21948a68..37e174b7 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/alibi_feature.py @@ -5,7 +5,7 @@ from mindspeed.features_manager.feature import MindSpeedFeature class AlibiFeature(MindSpeedFeature): def __init__(self): - super().__init__('alibi') + super().__init__('pse-alibi', optimization_level=2) def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py index 2858d924..a839f8f9 100644 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/generate_mask_feature.py @@ -5,7 +5,7 @@ from mindspeed.features_manager.feature import MindSpeedFeature class GenerateMaskFeature(MindSpeedFeature): def __init__(self): - super().__init__('generate-mask') + super().__init__('generate-mask', optimization_level=0) def register_patches(self, patch_manager, args): from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper -- Gitee From 
471e9777ed84c2390c80dacdf680db42f7cc29ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 11:28:32 +0800 Subject: [PATCH 05/18] =?UTF-8?q?=E5=86=B2=E7=AA=81=E8=A7=A3=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mindspeed/features_manager/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 60988fa0..806b5b46 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -11,6 +11,9 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel +from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.generate_mask_feature import GenerateMaskFeature + FEATURES_LIST = [ # Functional features ProfilerDefaultFeature(), @@ -39,4 +42,8 @@ FEATURES_LIST_V2 = ( # llava-multimodal LlavaModel(), + + # Transformer features + AlibiFeature(), + GenerateMaskFeature() ) -- Gitee From 15e57deb16a99e33ad62eb25363e3277e1bf2dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 29 Mar 2025 14:56:20 +0800 Subject: [PATCH 06/18] =?UTF-8?q?mask=E7=94=9F=E6=88=90=E5=BD=92=E4=B8=80?= =?UTF-8?q?=E7=A7=BB=E5=8A=A8=E5=88=B0megatron=5Fbasic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../transformer_basic.py} | 24 ++++++++++++++++ .../basic_feature/generate_mask/__init__.py | 0 .../basic_feature/generate_mask/adaptor.py | 28 ------------------- mindspeed/features_manager/__init__.py | 4 +-- .../megatron_basic/megatron_basic.py | 5 +++- .../transformer/generate_mask_feature.py | 16 ----------- 6 files changed, 29 insertions(+), 48 deletions(-) rename mindspeed/core/{transformer/basic_feature/generate_mask/generate_mask.py => megatron_basic/transformer_basic.py} (74%) delete mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/__init__.py delete mode 100644 mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py delete mode 100644 mindspeed/features_manager/transformer/generate_mask_feature.py diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py b/mindspeed/core/megatron_basic/transformer_basic.py similarity index 74% rename from mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py rename to mindspeed/core/megatron_basic/transformer_basic.py index bf72d125..65ae654c 100644 --- a/mindspeed/core/transformer/basic_feature/generate_mask/generate_mask.py +++ b/mindspeed/core/megatron_basic/transformer_basic.py @@ -1,7 +1,9 @@ +from functools import wraps from logging import getLogger import torch from megatron.training import get_args +from megatron.core.transformer.enums import AttnMaskType _GLOBAL_ATTN_MASK = None LOG = getLogger(__name__) @@ -76,3 +78,25 @@ def get_attention_mask(): generate_attention_mask(compress, device) return _GLOBAL_ATTN_MASK + + +def parallel_transformer_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, hidden_states, attention_mask, **kwargs): + args = get_args() + if attention_mask is None: + attention_mask = get_attention_mask() + return fn(self, hidden_states, attention_mask, **kwargs) + return wrapper + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, attention_mask, attn_mask_type, 
attention_bias, packed_seq_params): + if ( + attention_mask is None + and self.attn_mask_type == AttnMaskType.causal: + and not getattr(self.config, 'is_llava', False) + ): + attention_mask = get_attention_mask() + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py b/mindspeed/core/transformer/basic_feature/generate_mask/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py b/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py deleted file mode 100644 index 43a4318b..00000000 --- a/mindspeed/core/transformer/basic_feature/generate_mask/adaptor.py +++ /dev/null @@ -1,28 +0,0 @@ -from functools import wraps - -from megatron.training import get_args -from megatron.core.transformer.enums import AttnMaskType - -from .generate_mask import get_attention_mask - - -def parallel_transformer_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, hidden_states, attention_mask, **kwargs): - args = get_args() - if attention_mask is None: - attention_mask = get_attention_mask() - return fn(self, hidden_states, attention_mask, **kwargs) - return wrapper - -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if ( - attention_mask is None - and self.attn_mask_type == AttnMaskType.causal: - and not getattr(self.config, 'is_llava', False) - ): - attention_mask = get_attention_mask() - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) - return wrapper \ No newline at end of file diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 806b5b46..83869db5 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -12,7 +12,6 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature -from mindspeed.features_manager.transformer.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -44,6 +43,5 @@ FEATURES_LIST_V2 = ( LlavaModel(), # Transformer features - AlibiFeature(), - GenerateMaskFeature() + AlibiFeature() ) diff --git a/mindspeed/features_manager/megatron_basic/megatron_basic.py b/mindspeed/features_manager/megatron_basic/megatron_basic.py index ef8d1874..b959e201 100644 --- a/mindspeed/features_manager/megatron_basic/megatron_basic.py +++ b/mindspeed/features_manager/megatron_basic/megatron_basic.py @@ -41,4 +41,7 @@ class MegatronBasicFeature(MindSpeedFeature): pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', start_grad_sync) pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', finish_grad_sync) - + # attention mask generate normalization + from mindspeed.core.megatron_basic.transformer_basic import parallel_transformer_forward_wrapper, dot_product_attention_forward_wrapper + pm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) + 
pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/generate_mask_feature.py b/mindspeed/features_manager/transformer/generate_mask_feature.py deleted file mode 100644 index a839f8f9..00000000 --- a/mindspeed/features_manager/transformer/generate_mask_feature.py +++ /dev/null @@ -1,16 +0,0 @@ -from argparse import ArgumentParser - -from mindspeed.features_manager.feature import MindSpeedFeature - -class GenerateMaskFeature(MindSpeedFeature): - - def __init__(self): - super().__init__('generate-mask', optimization_level=0) - - def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import parallel_transformer_forward_wrapper - from mindspeed.core.transformer.basic_feature.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', - parallel_transformer_forward_wrapper) - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', - dot_product_attention_forward_wrapper) \ No newline at end of file -- Gitee From 6bb43b1db87fc47c0451e843e4e7ecdfa84a2fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Tue, 1 Apr 2025 17:24:30 +0800 Subject: [PATCH 07/18] =?UTF-8?q?get=5Fargs=E6=9B=BF=E6=8D=A2=E4=B8=BAself?= =?UTF-8?q?.config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/megatron_basic/transformer_basic.py | 15 ++++++--------- .../positional_encoding/alibi/adaptor.py | 13 ++++++------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/mindspeed/core/megatron_basic/transformer_basic.py b/mindspeed/core/megatron_basic/transformer_basic.py index 65ae654c..49e35ee5 100644 --- a/mindspeed/core/megatron_basic/transformer_basic.py +++ b/mindspeed/core/megatron_basic/transformer_basic.py @@ -14,9 +14,8 @@ def set_attention_mask(attn_mask): _GLOBAL_ATTN_MASK = attn_mask -def generate_attention_mask(compress, device): +def generate_attention_mask(args, compress, device): global _GLOBAL_ATTN_MASK - args = get_args() if not args.use_flash_attn: LOG.warn("Flash Attention is highly recommended") _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], @@ -34,12 +33,11 @@ def generate_attention_mask(compress, device): dtype=torch.bool), diagonal=1) -def get_attention_mask(): +def get_attention_mask(args): global _GLOBAL_ATTN_MASK if _GLOBAL_ATTN_MASK is not None: return _GLOBAL_ATTN_MASK - args = get_args() should_generate_mask = False device = 'npu' @@ -75,7 +73,7 @@ def get_attention_mask(): device = 'cpu' if should_generate_mask: - generate_attention_mask(compress, device) + generate_attention_mask(args, compress, device) return _GLOBAL_ATTN_MASK @@ -83,9 +81,8 @@ def get_attention_mask(): def parallel_transformer_forward_wrapper(fn): @wraps(fn) def wrapper(self, hidden_states, attention_mask, **kwargs): - args = get_args() if attention_mask is None: - attention_mask = get_attention_mask() + attention_mask = get_attention_mask(self.config) return fn(self, hidden_states, attention_mask, **kwargs) return wrapper @@ -94,9 +91,9 @@ def dot_product_attention_forward_wrapper(fn): def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): if ( 
attention_mask is None - and self.attn_mask_type == AttnMaskType.causal: + and self.attn_mask_type == AttnMaskType.causal and not getattr(self.config, 'is_llava', False) ): - attention_mask = get_attention_mask() + attention_mask = get_attention_mask(self.config) return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py index c60350aa..5ae66ae0 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py @@ -152,22 +152,21 @@ def flash_self_attention_init_wrapper(fn): @wraps(fn) def wrapper(self, *arg, **kwargs): fn(self, *arg, **kwargs) - args = get_args() self.pse = None - self.pse_type = args.alibi_fusion_attn_type + self.pse_type = self.config.alibi_fusion_attn_type if self.pse_type is None: self.pse_type = 1 # not use pse elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(args.seq_length, - args.num_attention_heads, - args.params_dtype, - args.alibi_diagonal_opposite, + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, 1024) self.pse = alibi elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(args.num_attention_heads) + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) return wrapper -- Gitee From 6843eae980e9991bf827355e038f9edd89644fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 11:24:42 +0800 Subject: [PATCH 08/18] =?UTF-8?q?refactor=EF=BC=9Afa=20generate=20mask=20a?= =?UTF-8?q?nd=20ailibi=20pse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/megatron_basic/transformer_basic.py | 99 ---------- .../alibi/__init__.py | 0 .../flash_attention/alibi/adaptor.py | 99 ++++++++++ .../alibi/alibi.py | 16 +- .../alibi/alibi_utils.py | 0 .../flash_attention/generate_mask/__init__.py | 0 .../flash_attention/generate_mask/adaptor.py | 20 ++ .../generate_mask/generate_mask.py | 43 +++++ .../positional_encoding/alibi/adaptor.py | 172 ------------------ mindspeed/features_manager/__init__.py | 10 +- .../megatron_basic/megatron_basic.py | 5 +- .../transformer/flash_attention/__init__.py | 0 .../{ => flash_attention}/alibi_feature.py | 35 +++- .../fusion_attention_v2_feature.py | 34 ++++ .../flash_attention/generate_mask_feature.py | 20 ++ 15 files changed, 263 insertions(+), 290 deletions(-) delete mode 100644 mindspeed/core/megatron_basic/transformer_basic.py rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/__init__.py (100%) create mode 100644 mindspeed/core/transformer/flash_attention/alibi/adaptor.py rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/alibi.py (81%) rename mindspeed/core/transformer/{positional_encoding => flash_attention}/alibi/alibi_utils.py (100%) create mode 100644 mindspeed/core/transformer/flash_attention/generate_mask/__init__.py create mode 100644 mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py create mode 100644 
mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py delete mode 100644 mindspeed/core/transformer/positional_encoding/alibi/adaptor.py create mode 100644 mindspeed/features_manager/transformer/flash_attention/__init__.py rename mindspeed/features_manager/transformer/{ => flash_attention}/alibi_feature.py (58%) create mode 100644 mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py create mode 100644 mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py diff --git a/mindspeed/core/megatron_basic/transformer_basic.py b/mindspeed/core/megatron_basic/transformer_basic.py deleted file mode 100644 index 49e35ee5..00000000 --- a/mindspeed/core/megatron_basic/transformer_basic.py +++ /dev/null @@ -1,99 +0,0 @@ -from functools import wraps -from logging import getLogger - -import torch -from megatron.training import get_args -from megatron.core.transformer.enums import AttnMaskType - -_GLOBAL_ATTN_MASK = None -LOG = getLogger(__name__) - - -def set_attention_mask(attn_mask): - global _GLOBAL_ATTN_MASK - _GLOBAL_ATTN_MASK = attn_mask - - -def generate_attention_mask(args, compress, device): - global _GLOBAL_ATTN_MASK - if not args.use_flash_attn: - LOG.warn("Flash Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, - device=device), diagonal=-(args.pre_tockens + 1)) \ - + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, - device=device), diagonal=args.next_tockens + 1)) - return - - seq_len = 2048 if compress else args.seq_length - - _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), - device=device, - dtype=torch.bool), diagonal=1) - - -def get_attention_mask(args): - global _GLOBAL_ATTN_MASK - if _GLOBAL_ATTN_MASK is not None: - return _GLOBAL_ATTN_MASK - - should_generate_mask = False - device = 'npu' - - if args.attention_mask_type == 'causal': - args.sparse_mode = 2 - should_generate_mask = True - compress = True - - # ampipe开启在同时不开cp时需要生成全量mask,开cp时生成causal mask - if args.ampipe_degree > 1 and args.context_parallel_size <= 1: - args.sparse_mode = 0 - should_generate_mask = True - compress = False - - # EoD 模式 Ring Attention的实现 - # general 为基线方案,causal 为加速方案 - # 如果 cp > 1 且使用了Ring Attention 并行(包括Hybrid并行)。则Mask为动态生成的,不需要额外的Mask - if args.reset_attention_mask: - if args.attention_mask_type == 'general': - args.sparse_mode = 2 - if args.context_parallel_size == 1 or args.context_parallel_algo == 'ulysses_cp_algo': - should_generate_mask = True - compress = True - else: - args.sparse_mode = 1 - should_generate_mask = False - else: - should_generate_mask = True - compress = True - - - if args.attention_mask_on_cpu: - device = 'cpu' - - if should_generate_mask: - generate_attention_mask(args, compress, device) - - return _GLOBAL_ATTN_MASK - - -def parallel_transformer_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, hidden_states, attention_mask, **kwargs): - if attention_mask is None: - attention_mask = get_attention_mask(self.config) - return fn(self, hidden_states, attention_mask, **kwargs) - return wrapper - -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params): - if ( - attention_mask is None - and self.attn_mask_type == AttnMaskType.causal - and not getattr(self.config, 'is_llava', False) - ): - attention_mask = 
get_attention_mask(self.config) - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) - return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/__init__.py b/mindspeed/core/transformer/flash_attention/alibi/__init__.py similarity index 100% rename from mindspeed/core/transformer/positional_encoding/alibi/__init__.py rename to mindspeed/core/transformer/flash_attention/alibi/__init__.py diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py new file mode 100644 index 00000000..7abcbf04 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. + +import math +from functools import wraps + +from torch import Tensor +import torch_npu + +from megatron.core.transformer.enums import AttnMaskType + +from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention + +from .alibi import AlibiForFusionAttnSingleton + +try: + from einops import rearrange +except ImportError: + rearrange = None + + +def dot_product_attention_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *args, **kwargs): + fn(self, *args, **kwargs) + + # add pse + self.pse = None + self.pse_type = self.config.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, + 1024) + self.pse = alibi + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) + return wrapper + + +def dot_product_attention_forward_impl( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params, +): + assert attention_bias is None, "Attention bias is not supported for DotProductAttention." 
+ + if packed_seq_params is None: + seq_length, bsz, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] + else: + seq_length, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2] + + sparse_mode = self.config.sparse_mode + if attn_mask_type == AttnMaskType.no_mask: + sparse_mode = 0 # default mask + + scale = 1.0 / math.sqrt( + self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale + + if packed_seq_params is not None: # TND + actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() + actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() + query, key, value = [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] + shape_order = 'TND' + else: # SBH + actual_seq_qlen = None + actual_seq_kvlen = None + query, key, value = [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] + shape_order = 'SBH' + + output = npu_fusion_attention(query, key, value, n_head, shape_order, + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=self.config.pre_tockens, + next_tokens=self.config.next_tockens, + keep_prob=1 - self.attention_dropout.p, + inner_precise=0, + sparse_mode=sparse_mode, + actual_seq_qlen=actual_seq_qlen, + actual_seq_kvlen=actual_seq_kvlen + )[0] + + if packed_seq_params is not None: + output = rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) + + return output \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py similarity index 81% rename from mindspeed/core/transformer/positional_encoding/alibi/alibi.py rename to mindspeed/core/transformer/flash_attention/alibi/alibi.py index 957d334f..2b5f19f2 100644 --- a/mindspeed/core/transformer/positional_encoding/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -30,10 +30,17 @@ class AlibiForFusionAttnSingleton: _alibi_slopes = None @classmethod - def get_alibi_tensor_for_fusion_attn(cls, max_seq_len, num_attention_heads, dtype, neg_diagonal_opposite=False, + def get_alibi_tensor_for_fusion_attn(cls, + max_seq_len, + num_attention_heads, + dtype, + neg_diagonal_opposite=False, last_k=1024): - if cls._alibi_tensor is None or cls._alibi_tensor_args != ( - max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k): + if cls._alibi_tensor is None or \ + cls._alibi_tensor_args != + ( + max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k + ): if last_k > max_seq_len: last_k = max_seq_len @@ -45,7 +52,8 @@ class AlibiForFusionAttnSingleton: diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) - position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, last_k, + position_point = position_point - diag.transpose(-1, -2)[:, -last_k:, :].expand(current_head_num, + last_k, max_seq_len) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() diff --git a/mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py similarity index 100% rename from mindspeed/core/transformer/positional_encoding/alibi/alibi_utils.py rename to mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/__init__.py 
b/mindspeed/core/transformer/flash_attention/generate_mask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py new file mode 100644 index 00000000..e2fad832 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -0,0 +1,20 @@ +from functools import wraps + +from megatron.core.transformer.enums import AttnMaskType + +from .generate_mask import get_attention_mask + + +def dot_product_attention_forward_wrapper(fn): + @wraps(fn) + def wrapper(self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params): + if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: + if not getattr(self.config, 'is_llava', False): + self.config.sparse_mode = 2 + attention_mask = get_attention_mask(self.config) + return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py new file mode 100644 index 00000000..9ee07218 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -0,0 +1,43 @@ +import warnings + +import torch +from megatron.training import get_args + +_GLOBAL_ATTN_MASK = None + + +def set_attention_mask(attn_mask): + global _GLOBAL_ATTN_MASK + _GLOBAL_ATTN_MASK = attn_mask + + +def generate_attention_mask(args, compress, device): + global _GLOBAL_ATTN_MASK + if not args.use_flash_attn: + warnings.warn("Flash Attention is highly recommended") + _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) + \ + torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, device=device), diagonal=args.next_tockens + 1)) + return + + if compress: + seq_len = 2048 + else: + seq_len = args.seq_length + + _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), + device=device, dtype=torch.bool), diagonal=1) + + +def get_attention_mask(args): + global _GLOBAL_ATTN_MASK + if _GLOBAL_ATTN_MASK is not None: + return _GLOBAL_ATTN_MASK + + device = 'npu' + compress = True + + generate_attention_mask(args, compress, device) + + return _GLOBAL_ATTN_MASK \ No newline at end of file diff --git a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py b/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py deleted file mode 100644 index 5ae66ae0..00000000 --- a/mindspeed/core/transformer/positional_encoding/alibi/adaptor.py +++ /dev/null @@ -1,172 +0,0 @@ -from functools import wraps - -import torch -from megatron.training import get_args, mpu, tensor_parallel - -from .alibi import Alibi, AlibiForFusionAttnSingleton -from .alibi_utils import _build_alibi_tensor, _get_inverted_mask - - -def core_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *arg, **kwargs): - fn(self, *arg, **kwargs) - - # self.config = arg[1] - self.hidden_size_per_partition = self.hidden_size_per_partition // self.config.context_parallel_size - self.square_alibi_mask = self.config.square_alibi_mask - self.fill_neg_inf = self.config.fill_neg_inf - self.beta = 1.0 - - if self.apply_query_key_layer_scaling: - self.beta = 1.0 / self.layer_number - if 
self.config.position_embedding_type == 'alibi': - self.alibi = Alibi() - alibi = _build_alibi_tensor(self.config.seq_length, - self.config.num_attention_heads, - self.config.square_alibi_mask, - self.config.fill_neg_inf - ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) - self.alibi.alibi = alibi - else: - self.alibi = None - - return wrapper - - -def core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - # =================================== - # Raw attention scores. [b, np, s, s] - # =================================== - - # [b, np, sq, sk] - output_size = (query_layer.size(1), - query_layer.size(2), - query_layer.size(0), - key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.reshape(output_size[2], - output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) - - if self.alibi is None: - matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( - (output_size[0] * output_size[1], output_size[2], output_size[3]), - query_layer.dtype, "mpu") - - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), - key_layer.transpose(0, 1).transpose(1, 2), - beta=0.0, alpha=(1.0 / self.norm_factor)) - else: - if self.alibi.matmul_result is None or self.alibi.output_size != output_size: - - self.alibi.output_size = output_size - alibi = _build_alibi_tensor(self.config.seq_length, - self.config.num_attention_heads, - self.config.square_alibi_mask, - self.config.fill_neg_inf - ).to(device=torch.cuda.current_device(), dtype=self.config.params_dtype) - self.alibi.alibi = alibi - - if self.fill_neg_inf: - _alibi = self.alibi.alibi[:, :output_size[3], :output_size[3]] - attention_mask = attention_mask.repeat(output_size[0], 1, 1, 1)[:output_size[0], :, :, :] - self.alibi.matmul_result = _get_inverted_mask(attention_mask, _alibi).view(-1, output_size[2], - output_size[2]).contiguous() - else: - self.alibi.matmul_result = self.alibi.alibi[:, :, :output_size[3]].repeat(output_size[0], 1, 1) - - q_trans = query_layer.transpose(0, 1).contiguous() - k_trans = key_layer.transpose(0, 1).transpose(1, 2).contiguous() - matmul_result = self.beta * self.alibi.matmul_result + torch.bmm(q_trans, k_trans) * (1.0 / self.norm_factor) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.square_alibi_mask: - attention_scores = torch.max( - attention_scores, torch.tensor(torch.finfo(attention_scores.dtype).min) - ) - attention_probs = torch.nn.functional.softmax(attention_scores, -1) - else: - attention_probs = self.scale_mask_softmax(attention_scores, - attention_mask) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - if not self.sequence_parallel: - with tensor_parallel.get_cuda_rng_tracker().fork(): - attention_probs = self.attention_dropout(attention_probs) - else: - attention_probs = self.attention_dropout(attention_probs) - - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), - value_layer.size(2), - query_layer.size(0), - value_layer.size(3)) - - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) - - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], - output_size[2], -1) - - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -def flash_self_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *arg, **kwargs): - fn(self, *arg, **kwargs) - - self.pse = None - self.pse_type = self.config.alibi_fusion_attn_type - - if self.pse_type is None: - self.pse_type = 1 # not use pse - elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, - self.config.num_attention_heads, - self.config.params_dtype, - self.config.alibi_diagonal_opposite, - 1024) - self.pse = alibi - - elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) - - return wrapper diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index 83869db5..ddf0fa69 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -11,7 +11,9 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.llava.llava_multimodal import LlavaModel -from mindspeed.features_manager.transformer.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.flash_attention.fusion_attention_v2_feature import FusionAttentionV2Feature +from mindspeed.features_manager.transformer.flash_attention.alibi_feature import AlibiFeature +from mindspeed.features_manager.transformer.flash_attention.generate_mask_feature import GenerateMaskFeature FEATURES_LIST = [ # Functional features @@ -42,6 +44,8 @@ FEATURES_LIST_V2 = ( # llava-multimodal LlavaModel(), - # Transformer features - AlibiFeature() + # Transformer flash attention features + FusionAttentionV2Feature(), + AlibiFeature(), + GenerateMaskFeature() ) diff --git a/mindspeed/features_manager/megatron_basic/megatron_basic.py b/mindspeed/features_manager/megatron_basic/megatron_basic.py index b959e201..ef8d1874 100644 --- a/mindspeed/features_manager/megatron_basic/megatron_basic.py +++ b/mindspeed/features_manager/megatron_basic/megatron_basic.py @@ -41,7 +41,4 @@ class MegatronBasicFeature(MindSpeedFeature): pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', start_grad_sync) pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', finish_grad_sync) - # attention mask generate normalization - from mindspeed.core.megatron_basic.transformer_basic import parallel_transformer_forward_wrapper, dot_product_attention_forward_wrapper - 
pm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper) - pm.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) \ No newline at end of file + diff --git a/mindspeed/features_manager/transformer/flash_attention/__init__.py b/mindspeed/features_manager/transformer/flash_attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindspeed/features_manager/transformer/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py similarity index 58% rename from mindspeed/features_manager/transformer/alibi_feature.py rename to mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 37e174b7..1f1acc20 100644 --- a/mindspeed/features_manager/transformer/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,11 +1,32 @@ +from logging import getLogger from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature +from mindspeed.log_config import set_log_config + +LOG = getLogger(__name__) + class AlibiFeature(MindSpeedFeature): + """ + Attention positional embedding. + To enable this feature, use the following options. + + Usage: + "--position-embedding-type alibi" + "--alibi-fusion-attn-type [0, 2, 3]" + "[--alibi-diagonal-opposite]" + """ def __init__(self): - super().__init__('pse-alibi', optimization_level=2) + super().__init__('position-embedding-type', optimization_level=0) + + def is_need_apply(self, args): + pse = getattr(args, self.feature_name, None) + need_apply = False + if pse == 'alibi': + need_apply = True + return (self.optimization_level <= args.optimization_level and need_apply) or self.default_patches def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') @@ -31,7 +52,8 @@ class AlibiFeature(MindSpeedFeature): def validate_args(self, args): if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') - # alibi type [2, 3] is only support FA2 + + # alibi is only support FA2 if args.alibi_fusion_attn_type in [2, 3]: args.use_fusion_attn_v2 = True if args.use_fusion_attn_v2: @@ -39,9 +61,6 @@ class AlibiFeature(MindSpeedFeature): print("[WARNING] \"use_fusion_attn_v2\" is not recommended. 
This feature is not officially released.") def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.positional_encoding.alibi.adaptor import flash_self_attention_init_wrapper #l0 - from mindspeed.core.transformer.positional_encoding.alibi.adaptor import core_attention_init_wrapper, core_attention_forward #l2 - patch_manager.register_patch('megatron.legacy.model.transformer.FlashSelfAttention.__init__', - flash_self_attention_init_wrapper) - patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.__init__', core_attention_init_wrapper) - patch_manager.register_patch('megatron.legacy.model.transformer.CoreAttention.forward', core_attention_forward) \ No newline at end of file + from mindspeed.core.transformer.flash_attention.alibi.adaptor import dot_product_attention_init_wrapper, dot_product_attention_forward_impl + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.__init__', dot_product_attention_init_wrapper) + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_impl) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py new file mode 100644 index 00000000..a47b5db2 --- /dev/null +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -0,0 +1,34 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + + +class FusionAttentionV2Feature(MindSpeedFeature): + ''' + Fusion attention v2 is an extension of fusion attention v1 + and currently only supports the alibi positional embedding. + Disabled by default. + ''' + + def __init__(self): + super().__init__('use-fusion-attn-v2', 0) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title='fusion attention v2') + group.add_argument('--use-fusion-attn-v2', + action='store_true', + default=False, + help='enalbe fusion attention v2') + group.add_argument('--pre-tockens', + type=int, + default=65536, + help='pre-tockens is used by Flash attention') + group.add_argument('--next-tockens', + type=int, + default=0, + help='next-tockens is used by Flash attention') + + def validate_args(self, args): + if args.use_fusion_attn_v2: + args.use_flash_attn = True + print("[WARNING] \"use_fusion_attn_v2\" is 
This feature is not officially released.") \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py new file mode 100644 index 00000000..3dfaa074 --- /dev/null +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -0,0 +1,20 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.feature import MindSpeedFeature + +class GenerateMaskFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('no-create-attention-mask-in-dataloader', 0) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title='fusion attention') + group.add_argument('--sparse-mode', + type=int, + default=0, + choices=[0,1,2,3,4,5,6,7,8], + help='mask type for fusion attention') + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) -- Gitee From 9cc1eda5c7697bc58df394db75867be27e939978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 11:41:02 +0800 Subject: [PATCH 09/18] bugfix --- mindspeed/core/transformer/flash_attention/alibi/alibi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 2b5f19f2..5177ed41 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -37,7 +37,7 @@ class AlibiForFusionAttnSingleton: neg_diagonal_opposite=False, last_k=1024): if cls._alibi_tensor is None or \ - cls._alibi_tensor_args != + cls._alibi_tensor_args != \ ( max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k ): -- Gitee From f2a2f6f294fc6d1cdb4b76aef63263e21948c69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 10 Apr 2025 17:19:45 +0800 Subject: [PATCH 10/18] =?UTF-8?q?clean=20code=20&=20alibi=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E5=AF=B9=E6=95=B4=E4=B8=AA=E7=B1=BBPatch=20&?= =?UTF-8?q?=20=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 128 +++++++----------- .../flash_attention/alibi/alibi.py | 78 +++++++---- .../flash_attention/alibi/alibi_utils.py | 36 +++-- .../alibi/dot_product_attention.py | 114 ++++++++++++++++ .../flash_attention/generate_mask/adaptor.py | 27 ++-- .../generate_mask/generate_mask.py | 33 ++++- .../flash_attention/alibi_feature.py | 85 ++++++++---- .../fusion_attention_v2_feature.py | 47 +++++-- .../flash_attention/generate_mask_feature.py | 31 +++-- 9 files changed, 396 insertions(+), 183 deletions(-) create mode 100644 mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index 7abcbf04..f30cc0ca 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -2,98 +2,62 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
All rights reserved. # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. -import math -from functools import wraps +from typing import Optional from torch import Tensor -import torch_npu - +from megatron.core.transformer.dot_product_attention import DotProductAttention as MegatronDotProductAttention +from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.enums import AttnMaskType +from megatron.core.packed_seq_params import PackedSeqParams -from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention - -from .alibi import AlibiForFusionAttnSingleton - -try: - from einops import rearrange -except ImportError: - rearrange = None +from mindspeed.core.transformer.flash_attention.alibi.dot_product_attention import DotProductAttentionImpl -def dot_product_attention_init_wrapper(fn): - @wraps(fn) - def wrapper(self, *args, **kwargs): - fn(self, *args, **kwargs) +class MindSpeedDotProductAttention(DotProductAttentionImpl, MegatronDotProductAttention): + def __init__( + self, + config: TransformerConfig, + layer_number: int, + attn_mask_type: AttnMaskType, + attention_type: str, + attention_dropout: float = None, + softmax_scale: float = None, + cp_comm_type: str = None, + ): + MegatronDotProductAttention.__init__( + self, + config, + layer_number, + attn_mask_type, + attention_type, + attention_dropout, + softmax_scale, + cp_comm_type + ) + # add pse - self.pse = None - self.pse_type = self.config.alibi_fusion_attn_type - - if self.pse_type is None: - self.pse_type = 1 # not use pse - elif self.pse_type == 0: - alibi = AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn(self.config.seq_length, - self.config.num_attention_heads, - self.config.params_dtype, - self.config.alibi_diagonal_opposite, - 1024) - self.pse = alibi - elif self.pse_type == 2 or self.pse_type == 3: - self.pse = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(self.config.num_attention_heads) - return wrapper - + DotProductAttentionImpl.__init__(self) -def dot_product_attention_forward_impl( + def forward( self, query: Tensor, key: Tensor, value: Tensor, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params, -): - assert attention_bias is None, "Attention bias is not supported for DotProductAttention." 
- - if packed_seq_params is None: - seq_length, bsz, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - else: - seq_length, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2] - - sparse_mode = self.config.sparse_mode - if attn_mask_type == AttnMaskType.no_mask: - sparse_mode = 0 # default mask - - scale = 1.0 / math.sqrt( - self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if packed_seq_params is not None: # TND - actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() - actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() - query, key, value = [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] - shape_order = 'TND' - else: # SBH - actual_seq_qlen = None - actual_seq_kvlen = None - query, key, value = [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] - shape_order = 'SBH' - - output = npu_fusion_attention(query, key, value, n_head, shape_order, - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - scale=scale, - pse_type=self.pse_type, - pre_tokens=self.config.pre_tockens, - next_tokens=self.config.next_tockens, - keep_prob=1 - self.attention_dropout.p, - inner_precise=0, - sparse_mode=sparse_mode, - actual_seq_qlen=actual_seq_qlen, - actual_seq_kvlen=actual_seq_kvlen - )[0] - - if packed_seq_params is not None: - output = rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) - - return output \ No newline at end of file + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + attention_bias: Tensor = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + output = DotProductAttentionImpl.forward( + self, + query, + key, + value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params, + ) + + return output diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 5177ed41..7614269f 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -30,31 +30,50 @@ class AlibiForFusionAttnSingleton: _alibi_slopes = None @classmethod - def get_alibi_tensor_for_fusion_attn(cls, - max_seq_len, - num_attention_heads, - dtype, - neg_diagonal_opposite=False, - last_k=1024): - if cls._alibi_tensor is None or \ - cls._alibi_tensor_args != \ - ( - max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k - ): + def get_alibi_tensor_for_fusion_attn( + cls, + max_seq_len, + num_attention_heads, + dtype, + neg_diagonal_opposite=False, + last_k=1024 + ): + if ( + cls._alibi_tensor is None or + cls._alibi_tensor_args != ( + max_seq_len, num_attention_heads, + neg_diagonal_opposite, last_k + ) + ): if last_k > max_seq_len: last_k = max_seq_len - tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_world_size = ( + parallel_state.get_tensor_model_parallel_world_size() + ) current_head_num = num_attention_heads // tp_world_size - slopes = AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn(num_attention_heads) + slopes = ( + AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn( + num_attention_heads + ) + ) position_point = torch.arange(max_seq_len) - max_seq_len + 1 - diag = torch.diag(torch.diag(position_point)).unsqueeze(0).unsqueeze(0) - - position_point = position_point.unsqueeze(0).unsqueeze(0).expand(current_head_num, last_k, -1) - position_point = position_point - diag.transpose(-1, -2)[:, 
-last_k:, :].expand(current_head_num, - last_k, - max_seq_len) + diag = torch.diag( + torch.diag(position_point) + ).unsqueeze(0).unsqueeze(0) + + position_point = ( + position_point.unsqueeze(0).unsqueeze(0).expand( + current_head_num, last_k, -1 + ) + ) + position_point = ( + position_point - + diag.transpose(-1, -2)[:, -last_k:, :].expand( + current_head_num, last_k, max_seq_len + ) + ) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point.npu() @@ -69,20 +88,33 @@ class AlibiForFusionAttnSingleton: alibi = alibi.to(torch.bfloat16) cls._alibi_tensor = alibi - cls._alibi_tensor_args = (max_seq_len, num_attention_heads, neg_diagonal_opposite, last_k) + cls._alibi_tensor_args = ( + max_seq_len, num_attention_heads, + neg_diagonal_opposite, last_k + ) return cls._alibi_tensor @classmethod def get_alibi_slopes_for_fusion_attn(cls, n): - if cls._alibi_slopes is None or cls._alibi_slopes_headnum != n: + if ( + cls._alibi_slopes is None or + cls._alibi_slopes_headnum != n + ): slopes = get_slopes(n) - tp_world_size = parallel_state.get_tensor_model_parallel_world_size() + tp_world_size = ( + parallel_state.get_tensor_model_parallel_world_size() + ) tp_index = parallel_state.get_tensor_model_parallel_rank() current_head_num = n // tp_world_size - slopes = torch.Tensor(slopes[tp_index * current_head_num: tp_index * current_head_num + current_head_num]).npu() + slopes = torch.Tensor( + slopes[ + tp_index * current_head_num: + tp_index * current_head_num + current_head_num + ] + ).npu() cls._alibi_slopes = slopes cls._alibi_slopes_headnum = n diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 7c28313b..24a898c9 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -14,8 +14,10 @@ def get_slopes(n): return get_slopes_power_of_2(n) else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ - :n - closest_power_of_2] + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] + ) def _get_inverted_mask(attention_mask, alibi): @@ -26,26 +28,44 @@ def _get_inverted_mask(attention_mask, alibi): return inverted_mask.to(alibi.device) + alibi.unsqueeze(0) -def _build_alibi_tensor(max_seq_len, num_attention_heads, square_alibi_mask, fill_neg_inf): +def _build_alibi_tensor( + max_seq_len, + num_attention_heads, + square_alibi_mask, + fill_neg_inf +): def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _buffered_future_mask(maxpos, alibi, attn_heads): - _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = torch.triu( + _fill_with_neg_inf(torch.zeros([maxpos, maxpos])), + 1 + ) _future_mask = _future_mask.unsqueeze(0) + alibi return _future_mask[:attn_heads, :maxpos, :maxpos] slopes = torch.Tensor(get_slopes(num_attention_heads)) if square_alibi_mask: position_point = torch.arange(max_seq_len) - max_seq_len + 1 - position_point = position_point.unsqueeze(0).unsqueeze(0).expand(num_attention_heads, max_seq_len, -1) + position_point = ( + position_point.unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, max_seq_len, -1 + ) + ) diag = torch.diag(position_point[0]) - position_point = position_point - 
diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + position_point = ( + position_point - diag.unsqueeze(0).unsqueeze(0).transpose(-1, -2) + ) alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point else: - alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( - num_attention_heads, -1, -1) + alibi = ( + slopes.unsqueeze(1).unsqueeze(1) * + torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1 + ) + ) # Select the part of the tensor that corresponds to our tensor parallel index. tp_world_size = parallel_state.get_tensor_model_parallel_world_size() diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py new file mode 100644 index 00000000..67485ba0 --- /dev/null +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -0,0 +1,114 @@ +import math +from typing import Optional + +from torch import Tensor + +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.packed_seq_params import PackedSeqParams + +from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention + +from .alibi import AlibiForFusionAttnSingleton + +try: + from einops import rearrange +except ImportError: + rearrange = None + + +class DotProductAttentionImpl(): + + def __init__(self): + # add pse + self.pse = None + self.pse_type = self.config.alibi_fusion_attn_type + + if self.pse_type is None: + self.pse_type = 1 # not use pse + elif self.pse_type == 0: + alibi = ( + AlibiForFusionAttnSingleton.get_alibi_tensor_for_fusion_attn( + self.config.seq_length, + self.config.num_attention_heads, + self.config.params_dtype, + self.config.alibi_diagonal_opposite, + 1024 + ) + ) + self.pse = alibi + elif self.pse_type == 2 or self.pse_type == 3: + self.pse = ( + AlibiForFusionAttnSingleton.get_alibi_slopes_for_fusion_attn( + self.config.num_attention_heads + ) + ) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Tensor, + attn_mask_type: AttnMaskType = None, + attention_bias: Tensor = None, + packed_seq_params: Optional[PackedSeqParams] = None, + ): + assert attention_bias is None, \ + "Attention bias is not supported for DotProductAttention." 
+ + if packed_seq_params is None: + seq_length, bsz, n_head, head_dim = ( + query.shape[0], query.shape[1], query.shape[2], query.shape[3] + ) + else: + seq_length, n_head, head_dim = ( + query.shape[0], query.shape[1], query.shape[2] + ) + + sparse_mode = self.config.sparse_mode + if attn_mask_type == AttnMaskType.no_mask: + sparse_mode = 0 # default mask + + scale = ( + 1.0 / math.sqrt(self.hidden_size_per_attention_head) + if self.scale_mask_softmax.scale is None + else self.softmax_scale + ) + + if packed_seq_params is not None: # TND + actual_seq_qlen = packed_seq_params.cu_seqlens_q.tolist() + actual_seq_kvlen = packed_seq_params.cu_seqlens_kv.tolist() + query, key, value = ( + [rearrange(x, 's b h d -> (b s) h d') for x in [query, key, value]] + ) + shape_order = 'TND' + else: # SBH + actual_seq_qlen = None + actual_seq_kvlen = None + query, key, value = ( + [rearrange(x, 's b h d -> s b (h d)') for x in [query, key, value]] + ) + shape_order = 'SBH' + + output = npu_fusion_attention( + query, key, value, n_head, shape_order, + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=self.config.pre_tockens, + next_tokens=self.config.next_tockens, + keep_prob=1 - self.attention_dropout.p, + inner_precise=0, + sparse_mode=sparse_mode, + actual_seq_qlen=actual_seq_qlen, + actual_seq_kvlen=actual_seq_kvlen + )[0] + + if packed_seq_params is not None: + output = ( + rearrange(output, '(b s) h d -> s b (h d)', s=seq_length, b=bsz) + ) + + return output \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index e2fad832..12cbc42a 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -7,14 +7,21 @@ from .generate_mask import get_attention_mask def dot_product_attention_forward_wrapper(fn): @wraps(fn) - def wrapper(self, query, key, value, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params): - if attention_mask is None and self.attn_mask_type == AttnMaskType.causal: - if not getattr(self.config, 'is_llava', False): - self.config.sparse_mode = 2 - attention_mask = get_attention_mask(self.config) - return fn(self, query, key, value, attention_mask, attn_mask_type, attention_bias, packed_seq_params) + def wrapper( + self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, + packed_seq_params + ): + if ( + attention_mask is None and + self.attn_mask_type == AttnMaskType.causal + ) and not getattr(self.config, 'is_llava', False): + self.config.sparse_mode = 2 + attention_mask = get_attention_mask(self.config) + return fn( + self, query, key, value, + attention_mask, attn_mask_type, attention_bias, packed_seq_params + ) return wrapper \ No newline at end of file diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index 9ee07218..d93c16a3 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,7 +1,6 @@ import warnings import torch -from megatron.training import get_args _GLOBAL_ATTN_MASK = None @@ -15,10 +14,24 @@ def generate_attention_mask(args, compress, device): global _GLOBAL_ATTN_MASK if not args.use_flash_attn: warnings.warn("Flash 
Attention is highly recommended") - _GLOBAL_ATTN_MASK = (torch.tril(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, device=device), diagonal=-(args.pre_tockens + 1)) + \ - torch.triu(torch.ones([args.micro_batch_size, 1, args.seq_length, args.seq_length], - dtype=bool, device=device), diagonal=args.next_tockens + 1)) + _GLOBAL_ATTN_MASK = ( + torch.tril( + torch.ones( + [args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device + ), + diagonal=-(args.pre_tockens + 1) + ) + + torch.triu( + torch.ones( + [args.micro_batch_size, 1, args.seq_length, args.seq_length], + dtype=bool, + device=device + ), + diagonal=args.next_tockens + 1 + ) + ) return if compress: @@ -26,8 +39,14 @@ def generate_attention_mask(args, compress, device): else: seq_len = args.seq_length - _GLOBAL_ATTN_MASK = torch.triu(torch.ones((seq_len, seq_len), - device=device, dtype=torch.bool), diagonal=1) + _GLOBAL_ATTN_MASK = torch.triu( + torch.ones( + (seq_len, seq_len), + device=device, + dtype=torch.bool + ), + diagonal=1 + ) def get_attention_mask(args): diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 1f1acc20..6943a875 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -2,9 +2,6 @@ from logging import getLogger from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature -from mindspeed.log_config import set_log_config - -LOG = getLogger(__name__) class AlibiFeature(MindSpeedFeature): @@ -14,53 +11,85 @@ class AlibiFeature(MindSpeedFeature): Usage: "--position-embedding-type alibi" - "--alibi-fusion-attn-type [0, 2, 3]" + "--alibi-fusion-attn-type [2, 3]" "[--alibi-diagonal-opposite]" """ def __init__(self): - super().__init__('position-embedding-type', optimization_level=0) + super().__init__( + 'position-embedding-type', + optimization_level=2 + ) def is_need_apply(self, args): pse = getattr(args, self.feature_name, None) need_apply = False if pse == 'alibi': need_apply = True - return (self.optimization_level <= args.optimization_level and need_apply) or self.default_patches + return ( + self.optimization_level <= args.optimization_level and + need_apply + ) or self.default_patches def register_args(self, parser: ArgumentParser): - self.add_parser_argument_choices_value(parser, "--position-embedding-type", 'alibi') + self.add_parser_argument_choices_value( + parser, + "--position-embedding-type", + 'alibi' + ) group = parser.add_argument_group(title='alibi') - group.add_argument('--square-alibi-mask', - action='store_true', - default=False, - help='attention mask of alibi is squared') - group.add_argument('--fill-neg-inf', - action='store_true', - default=False, - help='fill alibi with negative inf') + group.add_argument( + '--square-alibi-mask', + action='store_true', + default=False, + help='attention mask of alibi is squared' + ) + group.add_argument( + '--fill-neg-inf', + action='store_true', + default=False, + help='fill alibi with negative inf' + ) - group.add_argument('--alibi-fusion-attn-type', - type=int, - help='alibi pse type, support for 0,2,3') - group.add_argument('--alibi-diagonal-opposite', - action='store_true', - default=False, - help='make alibi diagonal opposite') + group.add_argument( + '--alibi-fusion-attn-type', + type=int, + help='alibi pse type, support 
for 0,2,3' + ) + group.add_argument( + '--alibi-diagonal-opposite', + action='store_true', + default=False, + help='make alibi diagonal opposite' + ) def validate_args(self, args): - if args.alibi_fusion_attn_type is not None and args.alibi_fusion_attn_type not in [0, 2, 3]: - raise AssertionError('--alibi-fusion-attn-type only support for `0, 2, 3`') + if (args.alibi_fusion_attn_type is not None and + args.alibi_fusion_attn_type not in [0, 2, 3] + ): + raise AssertionError( + '--alibi-fusion-attn-type only \support for `0, 2, 3`' + ) + if args.alibi_fusion_attn_type == 0: + raise AssertionError( + 'fa_v2 only support compress model currently.\ + please use 2 or 3' + ) # alibi is only support FA2 if args.alibi_fusion_attn_type in [2, 3]: args.use_fusion_attn_v2 = True if args.use_fusion_attn_v2: args.use_flash_attn = True - print("[WARNING] \"use_fusion_attn_v2\" is not recommended. This feature is not officially released.") + print( + "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ + This feature is not officially released." + ) def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.flash_attention.alibi.adaptor import dot_product_attention_init_wrapper, dot_product_attention_forward_impl - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.__init__', dot_product_attention_init_wrapper) - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_impl) \ No newline at end of file + from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention', + MindSpeedDotProductAttention + ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index a47b5db2..df71abc8 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -11,24 +11,43 @@ class FusionAttentionV2Feature(MindSpeedFeature): ''' def __init__(self): - super().__init__('use-fusion-attn-v2', 0) + super().__init__( + 'use-fusion-attn-v2', + optimization_level=2 + ) def register_args(self, parser: ArgumentParser): group = parser.add_argument_group(title='fusion attention v2') - group.add_argument('--use-fusion-attn-v2', - action='store_true', - default=False, - help='enalbe fusion attention v2') - group.add_argument('--pre-tockens', - type=int, - default=65536, - help='pre-tockens is used by Flash attention') - group.add_argument('--next-tockens', - type=int, - default=0, - help='next-tockens is used by Flash attention') + group.add_argument( + '--use-fusion-attn-v2', + action='store_true', + default=False, + help='enalbe fusion attention v2' + ) + group.add_argument( + '--pre-tockens', + type=int, + default=65536, + help='pre-tockens is used by Flash attention' + ) + group.add_argument( + '--next-tockens', + type=int, + default=0, + help='next-tockens is used by Flash attention' + ) + group.add_argument( + '--sparse-mode', + type=int, + default=0, + choices=[0,1,2,3,4,5,6,7,8], + help='mask type for fusion attention' + ) def validate_args(self, args): if args.use_fusion_attn_v2: args.use_flash_attn = True - print("[WARNING] \"use_fusion_attn_v2\" is 
not recommended. This feature is not officially released.") \ No newline at end of file + print( + "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ + This feature is not officially released." + ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 3dfaa074..5815e49c 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,20 +1,29 @@ -from argparse import ArgumentParser - from mindspeed.features_manager.feature import MindSpeedFeature class GenerateMaskFeature(MindSpeedFeature): def __init__(self): - super().__init__('no-create-attention-mask-in-dataloader', 0) + super().__init__( + 'no-create-attention-mask-in-dataloader', + optimization_level=2 + ) + + def is_need_apply(self, args): + """Check the feature is need to apply.""" + need_apply = False + + # can't find feature name, need to enable + if getattr(args, self.feature_name, None): + need_apply = True - def register_args(self, parser: ArgumentParser): - group = parser.add_argument_group(title='fusion attention') - group.add_argument('--sparse-mode', - type=int, - default=0, - choices=[0,1,2,3,4,5,6,7,8], - help='mask type for fusion attention') + return ( + self.optimization_level <= args.optimization_level and + need_apply + ) or self.default_patches def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch('megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper) + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper + ) -- Gitee From 0c7b9e01f5096d74ffab3a331d6ecf92a4ffee95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 09:36:47 +0800 Subject: [PATCH 11/18] bugfix --- .../transformer/flash_attention/alibi_feature.py | 10 +++++----- .../flash_attention/fusion_attention_v2_feature.py | 4 ---- .../flash_attention/generate_mask_feature.py | 4 ++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 6943a875..595ae476 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -69,12 +69,12 @@ class AlibiFeature(MindSpeedFeature): args.alibi_fusion_attn_type not in [0, 2, 3] ): raise AssertionError( - '--alibi-fusion-attn-type only \support for `0, 2, 3`' + '--alibi-fusion-attn-type only support for `0, 2, 3`' ) if args.alibi_fusion_attn_type == 0: raise AssertionError( - 'fa_v2 only support compress model currently.\ - please use 2 or 3' + 'fa v2 only support compress model currently.' \ + 'please use 2 or 3' ) # alibi is only support FA2 @@ -83,8 +83,8 @@ class AlibiFeature(MindSpeedFeature): if args.use_fusion_attn_v2: args.use_flash_attn = True print( - "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ - This feature is not officially released." + '[WARNING] \"use_fusion_attn_v2\" is not recommended.' 
\ + 'This feature is not officially released.' ) def register_patches(self, patch_manager, args): diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index df71abc8..5158970d 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -47,7 +47,3 @@ class FusionAttentionV2Feature(MindSpeedFeature): def validate_args(self, args): if args.use_fusion_attn_v2: args.use_flash_attn = True - print( - "[WARNING] \"use_fusion_attn_v2\" is not recommended. \ - This feature is not officially released." - ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 5815e49c..e45b2662 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -10,11 +10,11 @@ class GenerateMaskFeature(MindSpeedFeature): def is_need_apply(self, args): """Check the feature is need to apply.""" - need_apply = False + need_apply = True # can't find feature name, need to enable if getattr(args, self.feature_name, None): - need_apply = True + need_apply = False return ( self.optimization_level <= args.optimization_level and -- Gitee From 17e1ee5162f675a1199fe6c72ac1840098937a05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 09:53:48 +0800 Subject: [PATCH 12/18] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=20&=20=E6=B7=BB=E5=8A=A0license?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 3 +- .../flash_attention/alibi/alibi.py | 20 ++++++------ .../flash_attention/alibi/alibi_utils.py | 14 ++++++++ .../alibi/dot_product_attention.py | 9 ++++-- .../flash_attention/generate_mask/adaptor.py | 6 ++-- .../generate_mask/generate_mask.py | 3 ++ .../flash_attention/alibi_feature.py | 32 ++++++++++--------- .../fusion_attention_v2_feature.py | 3 ++ .../flash_attention/generate_mask_feature.py | 6 +++- 9 files changed, 65 insertions(+), 31 deletions(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index f30cc0ca..da62f19f 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -1,6 +1,5 @@ -# coding=utf-8 -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Optional diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 7614269f..1a6891a8 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -1,9 +1,12 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ import threading import torch -from megatron.core import parallel_state -from .alibi_utils import get_slopes +from megatron.core import parallel_state +from mindspeed.core.transformer.flash_attention.alibi.alibi_utils import get_slopes class Alibi: @@ -11,15 +14,14 @@ class Alibi: alibi = None matmul_result = None output_size = None - lock = threading.Lock() + _lock = threading.Lock() def __new__(cls, *args, **kwargs): - if cls._instance: - return cls._instance - else: - with cls.lock: - cls._instance = super().__new__(cls) - return cls._instance + if cls._instance is None: + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance class AlibiForFusionAttnSingleton: diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 24a898c9..800369aa 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import math import torch @@ -5,6 +8,17 @@ from megatron.core import parallel_state def get_slopes(n): + """ + Generate ALiBi slopes for n attention heads. + The slopes are computed based on the number of heads and follow a power-of-2 pattern. + + Args: + n (int): Number of attention heads. + + Returns: + List[float]: A list of slopes for each attention head. + """ + def get_slopes_power_of_2(n): start = (2 ** (-2 ** -(math.log2(n) - 3))) ratio = start diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py index 67485ba0..b4699876 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import math from typing import Optional @@ -7,8 +10,7 @@ from megatron.core.transformer.enums import AttnMaskType from megatron.core.packed_seq_params import PackedSeqParams from mindspeed.ops.fusion_attention_v2 import npu_fusion_attention - -from .alibi import AlibiForFusionAttnSingleton +from mindspeed.core.transformer.flash_attention.alibi.alibi import AlibiForFusionAttnSingleton try: from einops import rearrange @@ -17,6 +19,9 @@ except ImportError: class DotProductAttentionImpl(): + """ + Implementation of dot product attention with ALiBi support. + """ def __init__(self): # add pse diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index 12cbc42a..24ca64d4 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -1,8 +1,10 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from functools import wraps from megatron.core.transformer.enums import AttnMaskType - -from .generate_mask import get_attention_mask +from mindspeed.core.transformer.flash_attention.generate_mask.generate_mask import get_attention_mask def dot_product_attention_forward_wrapper(fn): diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index d93c16a3..02295e47 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + import warnings import torch diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index 595ae476..f443c437 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from logging import getLogger from argparse import ArgumentParser @@ -65,21 +68,20 @@ class AlibiFeature(MindSpeedFeature): ) def validate_args(self, args): - if (args.alibi_fusion_attn_type is not None and - args.alibi_fusion_attn_type not in [0, 2, 3] - ): - raise AssertionError( - '--alibi-fusion-attn-type only support for `0, 2, 3`' - ) - if args.alibi_fusion_attn_type == 0: - raise AssertionError( - 'fa v2 only support compress model currently.' \ - 'please use 2 or 3' - ) - - # alibi is only support FA2 - if args.alibi_fusion_attn_type in [2, 3]: - args.use_fusion_attn_v2 = True + if args.alibi_fusion_attn_type is not None: + if args.alibi_fusion_attn_type not in [0, 2, 3]: + raise AssertionError( + '--alibi-fusion-attn-type only support for `0, 2, 3`' + ) + if args.alibi_fusion_attn_type == 0: + raise AssertionError( + 'fa v2 only support compress model currently. ' + 'please use 2 or 3' + ) + # alibi is only support FA2 + if args.alibi_fusion_attn_type in [2, 3]: + args.use_fusion_attn_v2 = True + if args.use_fusion_attn_v2: args.use_flash_attn = True print( diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index 5158970d..2496f90c 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -1,3 +1,6 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ from argparse import ArgumentParser from mindspeed.features_manager.feature import MindSpeedFeature diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index e45b2662..23c52684 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,5 +1,9 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + from mindspeed.features_manager.feature import MindSpeedFeature + class GenerateMaskFeature(MindSpeedFeature): def __init__(self): @@ -8,7 +12,7 @@ class GenerateMaskFeature(MindSpeedFeature): optimization_level=2 ) - def is_need_apply(self, args): + def is_need_apply(self, args: Any) -> bool: """Check the feature is need to apply.""" need_apply = True -- Gitee From 94b7fa555d43af620654fdc3493c225900baafd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 10:18:53 +0800 Subject: [PATCH 13/18] bugfix --- mindspeed/features_manager/__init__.py | 2 +- .../transformer/flash_attention/generate_mask_feature.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mindspeed/features_manager/__init__.py b/mindspeed/features_manager/__init__.py index d3549e20..6cdb8a87 100644 --- a/mindspeed/features_manager/__init__.py +++ b/mindspeed/features_manager/__init__.py @@ -62,7 +62,7 @@ FEATURES_LIST_V2 = ( # Transformer flash attention features FusionAttentionV2Feature(), AlibiFeature(), - GenerateMaskFeature() + GenerateMaskFeature(), # MoeExperts use gemm MoEGmmFeature(), diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 23c52684..4ff3050a 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,6 +1,8 @@ # Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
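
# Illustrative sketch of the is_need_apply gate that GenerateMaskFeature implements
# (type-annotated above): patches are registered when the requested optimization level
# is high enough and the controlling flag is not set on args. The class below is a
# stand-in; MindSpeedFeature's real base class is not shown in this patch.
from argparse import Namespace


class FeatureSketch:
    def __init__(self, feature_name, optimization_level, default_patches=False):
        self.feature_name = feature_name
        self.optimization_level = optimization_level
        self.default_patches = default_patches

    def is_need_apply(self, args) -> bool:
        # Mirrors the getattr check in the hunk above: if the flag attribute exists
        # and is truthy, the feature steps aside.
        need_apply = not getattr(args, self.feature_name, None)
        return (self.optimization_level <= args.optimization_level and need_apply) \
            or self.default_patches


feat = FeatureSketch('no_create_attention_mask_in_dataloader', optimization_level=2)
print(feat.is_need_apply(Namespace(optimization_level=2)))  # True: flag absent, level reached
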
+from typing import Any + from mindspeed.features_manager.feature import MindSpeedFeature -- Gitee From 69956541d29173dbc8c616cfaf4c8bc59b9ad052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Fri, 11 Apr 2025 11:34:53 +0800 Subject: [PATCH 14/18] clean code --- .../flash_attention/fusion_attention_v2_feature.py | 2 +- .../flash_attention/generate_mask_feature.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py index 2496f90c..c274b7b3 100644 --- a/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/fusion_attention_v2_feature.py @@ -43,7 +43,7 @@ class FusionAttentionV2Feature(MindSpeedFeature): '--sparse-mode', type=int, default=0, - choices=[0,1,2,3,4,5,6,7,8], + choices=[0, 1, 2, 3, 4, 5, 6, 7, 8], help='mask type for fusion attention' ) diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 4ff3050a..332d41f9 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -20,7 +20,7 @@ class GenerateMaskFeature(MindSpeedFeature): # can't find feature name, need to enable if getattr(args, self.feature_name, None): - need_apply = False + need_apply = False return ( self.optimization_level <= args.optimization_level and @@ -28,8 +28,8 @@ class GenerateMaskFeature(MindSpeedFeature): ) or self.default_patches def register_patches(self, patch_manager, args): - from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper - patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', - dot_product_attention_forward_wrapper - ) + from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper + patch_manager.register_patch( + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + dot_product_attention_forward_wrapper + ) -- Gitee From 514078b5fa757f2a8a1d4269711da5c5e6747e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 12 Apr 2025 17:12:34 +0800 Subject: [PATCH 15/18] bugfix --- mindspeed/core/transformer/flash_attention/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mindspeed/core/transformer/flash_attention/__init__.py diff --git a/mindspeed/core/transformer/flash_attention/__init__.py b/mindspeed/core/transformer/flash_attention/__init__.py new file mode 100644 index 00000000..e69de29b -- Gitee From eefec4039dcee921c0514f22e894452f7d965fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Sat, 12 Apr 2025 17:18:36 +0800 Subject: [PATCH 16/18] add alibi ut test example --- requirements.txt | 1 + .../features/flash_attention/test_alibi.py | 74 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py diff --git a/requirements.txt b/requirements.txt index 1044aea4..b7e8f1c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ einops scipy sentencepiece 
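
# Illustrative sketch of the argparse pattern behind the '--sparse-mode' option cleaned
# up above: 'choices' makes the parser reject any value outside the allowed set, so the
# feature code never re-validates it. The parser here is a stand-in for Megatron's.
from argparse import ArgumentParser

parser = ArgumentParser()
group = parser.add_argument_group(title='fusion attention v2')
group.add_argument('--sparse-mode', type=int, default=0,
                   choices=[0, 1, 2, 3, 4, 5, 6, 7, 8],
                   help='mask type for fusion attention')

print(parser.parse_args(['--sparse-mode', '2']).sparse_mode)  # 2
# parser.parse_args(['--sparse-mode', '9']) would exit with "invalid choice: 9"
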
pytest +pytest-mock tokenizers<=0.20.3 transformers>=4.43.2 gpytorch diff --git a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py new file mode 100644 index 00000000..86bf889c --- /dev/null +++ b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py @@ -0,0 +1,74 @@ +import pytest +import torch +import torch_npu + +from mindspeed import megatron_adaptor_v2 + +from megatron.training.global_vars import set_args +from megatron.training.arguments import parse_args +from megatron.core.transformer.transformer_config import TransformerConfig + +from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention + + +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +def run_fusion_attn_with_pse_alibi(bs, seq_len, dtype): + from megatron.core.transformer.enums import AttnMaskType + + args = parse_args(None, True) + set_args(args) + + config = TransformerConfig( + num_layers=2, + hidden_size=32, + num_attention_heads=4, + attention_dropout=0.0, + params_dtype = dtype + ) + + # extra arguments mindspeed needed + config.use_flash_attn = True + config.use_fusion_attn_v2 = True + config.alibi_fusion_attn_type = 2 + config.sparse_mode = 2 + config.seq_length = seq_len + config.alibi_diagonal_opposite = False + + attn = MindSpeedDotProductAttention( + config=config, + layer_number=1, + attn_mask_type=AttnMaskType.causal, + attention_type='self' + ) + + # attn.pse should exist and not be None + assert attn.pse is not None + + b, n, s, d = bs, 4, seq_len, 8 + + q = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + k = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + v = torch.randn(s, b, n, d, dtype=dtype, device='npu', requires_grad=True) + + # global attn mask will be generated at DotProductAttention forward wrapper + out = attn(q, k, v, None, None, None, None) + assert isinstance(out, torch.Tensor) + + +class TestAlibi(): + + @pytest.mark.skipif(DEVICE_NAME != 'Ascend910B', reason='device type is not supported, skip this UT!') + def test_alibi(self, mocker): + mock_world_size = mocker.patch( + "megatron.core.parallel_state.get_tensor_model_parallel_world_size", + return_value=1 + ) + mock_rank = mocker.patch( + "megatron.core.parallel_state.get_tensor_model_parallel_rank", + return_value=0 + ) + run_fusion_attn_with_pse_alibi(2, 256, torch.bfloat16) + mock_world_size.assert_called() + mock_rank.assert_called_once() -- Gitee From cd2edfdf89cd8ffe6cb7ac96f6a8cdb129fffd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Mon, 14 Apr 2025 09:44:26 +0800 Subject: [PATCH 17/18] clean code --- .../unit_tests/features/flash_attention/test_alibi.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py index 86bf889c..dd002ad8 100644 --- a/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py +++ b/tests_extend_v2/unit_tests/features/flash_attention/test_alibi.py @@ -2,15 +2,13 @@ import pytest import torch import torch_npu -from mindspeed import megatron_adaptor_v2 - from megatron.training.global_vars import set_args from megatron.training.arguments import parse_args from megatron.core.transformer.transformer_config import TransformerConfig +from mindspeed import megatron_adaptor_v2 from mindspeed.core.transformer.flash_attention.alibi.adaptor 
import MindSpeedDotProductAttention - DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -25,7 +23,7 @@ def run_fusion_attn_with_pse_alibi(bs, seq_len, dtype): hidden_size=32, num_attention_heads=4, attention_dropout=0.0, - params_dtype = dtype + params_dtype=dtype ) # extra arguments mindspeed needed -- Gitee From 1eea1395843ad173228aa98b741f6cf4cbf67914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8C=83=E6=96=87=E7=84=98?= Date: Thu, 17 Apr 2025 14:28:50 +0800 Subject: [PATCH 18/18] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../flash_attention/alibi/adaptor.py | 24 +------------------ .../flash_attention/alibi/alibi.py | 2 +- .../flash_attention/alibi/alibi_utils.py | 6 ++--- .../alibi/dot_product_attention.py | 2 +- .../flash_attention/generate_mask/adaptor.py | 10 ++++---- .../generate_mask/generate_mask.py | 2 +- .../flash_attention/alibi_feature.py | 12 +++++----- .../flash_attention/generate_mask_feature.py | 8 +++---- 8 files changed, 22 insertions(+), 44 deletions(-) diff --git a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py index da62f19f..24be3b40 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/alibi/adaptor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Optional @@ -38,25 +38,3 @@ class MindSpeedDotProductAttention(DotProductAttentionImpl, MegatronDotProductAt # add pse DotProductAttentionImpl.__init__(self) - def forward( - self, - query: Tensor, - key: Tensor, - value: Tensor, - attention_mask: Tensor, - attn_mask_type: AttnMaskType = None, - attention_bias: Tensor = None, - packed_seq_params: Optional[PackedSeqParams] = None, - ): - output = DotProductAttentionImpl.forward( - self, - query, - key, - value, - attention_mask, - attn_mask_type, - attention_bias, - packed_seq_params, - ) - - return output diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi.py b/mindspeed/core/transformer/flash_attention/alibi/alibi.py index 1a6891a8..df138999 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import threading diff --git a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py index 800369aa..31d879a2 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py +++ b/mindspeed/core/transformer/flash_attention/alibi/alibi_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
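
# Illustrative sketch of the pytest-mock pattern used by the new TestAlibi test:
# mocker.patch swaps a callable for a MagicMock with a fixed return value and records
# how it was called. os.cpu_count is patched here only to keep the sketch self-contained;
# the real test patches the megatron.core.parallel_state helpers instead.
import os


def test_patched_cpu_count(mocker):  # the 'mocker' fixture comes from pytest-mock
    fake = mocker.patch("os.cpu_count", return_value=1)
    assert os.cpu_count() == 1   # the patched value is returned
    fake.assert_called_once()    # and the call was recorded
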
import math @@ -29,7 +29,7 @@ def get_slopes(n): else: closest_power_of_2 = 2 ** math.floor(math.log2(n)) return ( - get_slopes_power_of_2(closest_power_of_2) + + get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] ) @@ -75,7 +75,7 @@ def _build_alibi_tensor( alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point else: alibi = ( - slopes.unsqueeze(1).unsqueeze(1) * + slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( num_attention_heads, -1, -1 ) diff --git a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py index b4699876..4d930c27 100644 --- a/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py +++ b/mindspeed/core/transformer/flash_attention/alibi/dot_product_attention.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import math diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py index 24ca64d4..f27dae85 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/adaptor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from functools import wraps @@ -10,10 +10,10 @@ from mindspeed.core.transformer.flash_attention.generate_mask.generate_mask impo def dot_product_attention_forward_wrapper(fn): @wraps(fn) def wrapper( - self, query, key, value, - attention_mask, - attn_mask_type, - attention_bias, + self, query, key, value, + attention_mask, + attn_mask_type, + attention_bias, packed_seq_params ): if ( diff --git a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py index 02295e47..b4bad6d0 100644 --- a/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py +++ b/mindspeed/core/transformer/flash_attention/generate_mask/generate_mask.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. import warnings diff --git a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py index f443c437..a190780f 100644 --- a/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/alibi_feature.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
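
# Illustrative sketch of what the get_slopes / _build_alibi_tensor code touched above
# computes: per-head slopes on the ALiBi power-of-2 schedule, multiplied by token
# positions to give a (num_heads, 1, seq_len) additive bias. Toy sizes, torch only;
# the body of get_slopes follows the standard ALiBi formulation.
import math

import torch


def get_slopes(n):
    # For n a power of two: a geometric sequence starting at 2 ** (-8 / n).
    def power_of_2(n):
        start = 2 ** (-2 ** -(math.log2(n) - 3))
        return [start * (start ** i) for i in range(n)]

    if math.log2(n).is_integer():
        return power_of_2(n)
    # Otherwise interleave the two nearest power-of-two schedules.
    closest = 2 ** math.floor(math.log2(n))
    return power_of_2(closest) + get_slopes(2 * closest)[0::2][:n - closest]


num_heads, seq_len = 4, 8
slopes = torch.tensor(get_slopes(num_heads))                 # (num_heads,)
positions = torch.arange(seq_len).unsqueeze(0).unsqueeze(0)  # (1, 1, seq_len)
alibi = slopes.unsqueeze(1).unsqueeze(1) * positions         # (num_heads, 1, seq_len)
print(alibi.shape)  # torch.Size([4, 1, 8])
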
from logging import getLogger @@ -20,7 +20,7 @@ class AlibiFeature(MindSpeedFeature): def __init__(self): super().__init__( - 'position-embedding-type', + 'position-embedding-type', optimization_level=2 ) @@ -30,14 +30,14 @@ class AlibiFeature(MindSpeedFeature): if pse == 'alibi': need_apply = True return ( - self.optimization_level <= args.optimization_level and + self.optimization_level <= args.optimization_level and need_apply ) or self.default_patches def register_args(self, parser: ArgumentParser): self.add_parser_argument_choices_value( - parser, - "--position-embedding-type", + parser, + "--position-embedding-type", 'alibi' ) @@ -92,6 +92,6 @@ class AlibiFeature(MindSpeedFeature): def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.alibi.adaptor import MindSpeedDotProductAttention patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention', + 'megatron.core.transformer.dot_product_attention.DotProductAttention', MindSpeedDotProductAttention ) \ No newline at end of file diff --git a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py index 332d41f9..16705975 100644 --- a/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py +++ b/mindspeed/features_manager/transformer/flash_attention/generate_mask_feature.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) 2025, Huawei Technologies Co., Ltd. All rights reserved. # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. from typing import Any @@ -10,7 +10,7 @@ class GenerateMaskFeature(MindSpeedFeature): def __init__(self): super().__init__( - 'no-create-attention-mask-in-dataloader', + 'no-create-attention-mask-in-dataloader', optimization_level=2 ) @@ -23,13 +23,13 @@ class GenerateMaskFeature(MindSpeedFeature): need_apply = False return ( - self.optimization_level <= args.optimization_level and + self.optimization_level <= args.optimization_level and need_apply ) or self.default_patches def register_patches(self, patch_manager, args): from mindspeed.core.transformer.flash_attention.generate_mask.adaptor import dot_product_attention_forward_wrapper patch_manager.register_patch( - 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', + 'megatron.core.transformer.dot_product_attention.DotProductAttention.forward', dot_product_attention_forward_wrapper ) -- Gitee
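
Patch 18 deletes the explicit forward override from MindSpeedDotProductAttention; this is safe because Python's method resolution order already prefers DotProductAttentionImpl.forward over the Megatron base class. A minimal sketch of that lookup rule, with stand-in class names (the real bases are not reproduced here):

class MegatronBase:
    def forward(self):
        return "megatron forward"


class Impl:
    def forward(self):
        return "mindspeed impl forward"


class Adaptor(Impl, MegatronBase):
    # No forward here: the MRO (Adaptor -> Impl -> MegatronBase) resolves it.
    pass


print([c.__name__ for c in Adaptor.__mro__])  # ['Adaptor', 'Impl', 'MegatronBase', 'object']
print(Adaptor().forward())                    # 'mindspeed impl forward'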