From ce9db7720e8c6e62312d90532feda49d19e207dd Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 18:57:09 +0800 Subject: [PATCH 1/5] bug: mla remove nz for ring --- .../test_multi_latent_attention_lse.py | 470 ++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py new file mode 100644 index 00000000..27bba90a --- /dev/null +++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# + +import logging +import sys +import os +import unittest +import math +import numpy as np +import torch +import random +import json +import torch.nn.functional as F +import torch_npu +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +import operation_test +from precision_calcu import * + +torch.set_printoptions(precision=4, sci_mode=False) +# torch_npu.npu.set_device() + +class TestPagedAttentionMLA(operation_test.OperationTest): + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + logging.info(f"maxDiff {max_diff}") + logging.info("1/1000 Accuracy is %f", 1 - float(error_count) / len) + logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len) + if self.data_type == torch.bfloat16 or self.is_int8_flag: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2]) + else: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[0]) + calc_times = self.head_size_qk * self.max_context_len + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2 ** (-7) + else: + error = 2 ** (-6) + error_threshold = torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + else: + if calc_times < 2048: + error = 2 ** (-8) + else: + error = 2 ** (-7) + error_threshold = torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + + 
def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** (-4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + # slopes = torch.ones(n_heads) + return slopes + + def group_mm_torch(self, heads, group_num, A, B, is_k): + group_head = heads // group_num + score_high = None + for i in range(group_num): + if self.is_int8_flag: + int8_B = B[i: (i + 1), :, :, ] + head_dim = int8_B.shape[2] + int32_B = torch.matmul(torch.eye(int8_B.shape[1]).to(torch.float32), int8_B.to(torch.float32)).to( + torch.int32) + if is_k: + if self.has_bias: + int32_B = int32_B + self.offset1[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale1_fp32[i * head_dim:(i + 1) * head_dim] + fp32_B = torch.permute(fp32_B, (0, 2, 1)) + else: + if self.has_bias: + int32_B = int32_B + self.offset2[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale2_fp32[i * head_dim:(i + 1) * head_dim] + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + fp32_B) + else: + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + B[i:(i + 1), :, :].to(torch.float32)) + if score_high is None: + score_high = group_score_high + else: + score_high = torch.cat((score_high, group_score_high), 0) + return score_high + + def process_deq_scale(self, deq_scale) -> np.ndarray: + new_deq_scale = np.frombuffer(deq_scale.tobytes(), dtype=np.uint32) + return new_deq_scale.astype(np.uint64) + + def softmax(self, sim): + row_max = torch.max(sim, axis=-1, keepdims=True)[0] + sim_sub = sim - row_max + sim_sub = torch.exp(sim_sub) + row_sum = torch.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_numpy(self, sim): + sim = sim.cpu().numpy() + row_max = np.max(sim, axis=-1, keepdims=True) + sim_sub = sim - row_max + sim_sub = np.exp(sim_sub) + # print(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res, row_max + np.log(row_sum) + + def shape_nd_to_nz(self, shape, dtype='float16'): + assert len(shape) >= 2 + batch = shape[:-2] # 最后两维nd->nz + a, b = shape[-2], shape[-1] + a0, b0 = 16, 16 + return list(batch) + [math.ceil(b / b0), math.ceil(a / a0), a0, b0] + + def gen_axes_for_transpose(self,offset, base): + return [x for x in range(offset)] + [x + offset for x in base] + + def convert_nd_to_nz(self, x): + array_trans = self.gen_axes_for_transpose(len(x.shape) - 2, [2, 0, 1, 3]) # (m1, m0, n1, n0) -> (n1, m1, m0, n0) + x_shape = self.shape_nd_to_nz(x.shape, dtype=x.dtype) + *_, n1, m1, m0, n0 = x_shape + return x.reshape(x_shape[:-4] + [m1, m0, n1, n0]).permute(*array_trans) # x原始需要对齐,才能reshape + + def ref_masked_attention(self, + query, # (1, num_heads, head_size) + key, # (context_len, kv_heads, head_size) + value, + scale: float, + alibi_bias, + mask_data_type=torch.bfloat16 + ): + # Q * K.T + query = query + query = torch.permute(query, (1, 0, 2)) + if not self.is_int8_flag: + key = torch.permute(key, (1, 2, 0)) # 0 1 2 + else: + key = torch.permute(key, (1, 0, 2)) + sim_high = self.group_mm_torch(query.shape[0], key.shape[0], query, key, 1) # (head_num, q_seqlen, k_seqlen) + sim_out = sim_high.to(torch.float32) + sim_high = sim_high.to(torch.float32) * scale + if alibi_bias is not None: + sim_high = sim_high + 
alibi_bias.to(torch.float32) + # softmax + p_high, lse = self.softmax_numpy(sim_high) + p = torch.from_numpy(p_high).to(mask_data_type) + p_high = torch.from_numpy(p_high) + + lse = torch.permute(torch.from_numpy(lse).to(mask_data_type), (1, 0, 2)) # (q_seqlen, head_num, 1) + + # P * V + value = torch.permute(value, (1, 0, 2)) + out = self.group_mm_torch(query.shape[0], key.shape[0], p, value, 0) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], p_high, value, 0) + out = torch.permute(out, (1, 0, 2)) + out_high = torch.permute(out_high, (1, 0, 2)) + sim_out = torch.permute(sim_out, (1, 0, 2)) + return out, out_high, sim_out, lse + + def ref_single_query_cached_kv_attention(self, + sim, + output, + true_out, + lse, # (num_tokens, num_heads, 1) + query, + key_cache, # (num_blocks, block_size, num_heads, head_size) + value_cache, # (num_blocks, block_size, num_heads, head_size) + block_tables, + context_lens, + mask, + mask_dim=4, + mask_data_type=torch.bfloat16 + ) -> None: + mask_index_coff = 1 + if self.compressHead: + query = query.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_qk) + output = output.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_vo) + true_out = true_out.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, + self.head_size_vo) + if mask_dim == 4: + mask_shape = mask.shape + mask = mask.view(mask_shape[0] * self.kv_heads, self.num_heads // self.kv_heads, 1, + self.max_context_len) + else: + mask_index_coff = self.kv_heads + num_heads = query.shape[1] + kv_heads = value_cache.shape[2] + head_size_qk = key_cache.shape[3] + head_size_vo = value_cache.shape[3] + block_size = value_cache.shape[1] + + num_input_tokens = query.shape[0] + index = 0 + for i in range(len(context_lens)): + block_table = block_tables[i] + context_len = int(context_lens[i]) + if context_len == 0: + continue + + q = query[index].view(1, num_heads, head_size_qk) + keys = [] + values = [] + for j in range(context_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, block_offset, :, :] + k = k.reshape(kv_heads, head_size_qk) + keys.append(k) + + v = value_cache[block_number, block_offset, :, :] + v = v.reshape(kv_heads, head_size_vo) + values.append(v) + keys = torch.stack(keys, axis=0) + values = torch.stack(values, axis=0) + scale = np.float32(1.0 / (head_size_qk ** 0.5)) + if mask_dim == 4: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i, :, :, :context_len], mask_data_type) + out = out.reshape(num_heads, head_size_vo) + elif mask_dim == 3: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i // mask_index_coff, :, :context_len], + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + else: + out, out_high, sim_out, lse_i = self.ref_masked_attention(q, keys, values, scale, mask, + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + lse_i = lse_i.reshape(num_heads, 1) + lse[index] = lse_i.to(mask_data_type) + out_high = out_high.reshape(num_heads, head_size_vo) + sim_out = sim_out.reshape(1, num_heads * context_len) + output[index] = out.to(mask_data_type) + true_out[index] = out_high + sim[index] = sim_out + index = index + 1 + + def calc_data(self, num_tokens, num_heads, kv_heads, head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 0, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, 
dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, is_kv_combined = True, is_nz_in = False): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size_qk = head_size_qk + self.head_size_vo = head_size_vo + + logging.debug( + f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + q_range = 5.0 + query = torch.from_numpy(np.random.uniform(-q_range, q_range, size=(num_tokens, num_heads, head_size_qk))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + kv_range = 5.0 + kv_type = dtype + if is_int8_flag: + kv_type = torch.int8 + if not compressHead: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_vo))).to( + kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + else: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=( + num_blocks * kv_heads, block_size, 1, head_size_vo))).to(kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + self.data_type = dtype + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val in context_lens for _ in range(kv_heads)] + batch = len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for i in range(batch): + block_table = [ + i * max_num_blocks_per_seq + _ for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + + self.is_int8_flag = is_int8_flag + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = 
np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + shape_out = (num_tokens, num_heads, head_size_vo) + ref_output = torch.zeros(shape_out, dtype=dtype) + true_out = torch.zeros(shape_out, dtype=torch.float32) + sim = torch.zeros((num_tokens, num_heads * k_seqlen), dtype=torch.float32) + lse = torch.zeros((num_tokens, num_heads, 1), dtype=dtype) + self.ref_single_query_cached_kv_attention( + sim, + ref_output, + true_out, + lse, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + mask_dim, + mask_data_type + ) + + self.q_split1, self.q_split2 = torch.split(query, [512, 64], dim=2) + self.key_cache_split1, self.key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + self.value_cache = value_cache + + if (is_nz_in): + key_cache_split1, key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + key_cache_split1 = key_cache_split1.reshape(num_blocks, block_size, -1) + key_cache_split2 = key_cache_split2.reshape(num_blocks, block_size, -1) + key_cache_split1_nz = self.convert_nd_to_nz(key_cache_split1) + key_cache_split2_nz = self.convert_nd_to_nz(key_cache_split2) + self.key_cache_split1 = key_cache_split1_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + self.key_cache_split2 = key_cache_split2_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.alib_mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.lse = lse + + def golden_calc(self, in_tensors): + golden_out = torch.tensor(self.golden_out) + return [golden_out, self.lse] + + def golden_compare(self, out_tensors, golden_tensors): + go_double = compare_cv(self.true_out, golden_tensors[0].cpu(), out_tensors[0].cpu()) + result_old = self.compare_output_data(out_tensors[0].npu(), golden_tensors[0].npu(), [0.001, 0.001, 0.005, 0.005]) + lse_double = True + lse_old = True + if self.is_ring: + lse_double = compare_cv(golden_tensors[1].npu(), golden_tensors[1].npu(), out_tensors[1].npu()) + lse_old = self.compare_output_data(out_tensors[1].npu(), golden_tensors[1].npu(), [0.001, 0.001, 0.005, 0.005]) + return (result_old) and (lse_double or lse_old) + + def test_paged_mla_combine_cache_norm_128_nz(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 32 + num_heads = 32 + kv_heads = 1 + block_size = 128 + head_size_qk = 576 + head_size_vo = 512 + num_blocks = 64 + k_seqlen = 256 + tor = 1.0 / (head_size_qk ** 0.5) + mask_dim = 0 + dtype = torch.float16 + is_kv_combined = True + self.is_ring = 1 + is_nz_in = True + + self.calc_data(num_tokens, num_heads, kv_heads, head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen, + dtype, mask_dim, dtype, + is_kv_combined=is_kv_combined, is_nz_in=is_nz_in) + + OP_NAME = "MLAOperation" + OP_PARAM = {"type": 0, "kvHead": kv_heads, "headSize": num_heads, "tor": tor, + "kvSeqLen": self.contex_lens.tolist(), "isRing": self.is_ring} + logging.debug(f"blcok_tables shape: {self.block_tables}") + 
logging.debug(f"contex_lens shape: {self.contex_lens}") + logging.debug(f"numTokens: {num_tokens}, numHeads: {num_heads}, kvHead: {kv_heads}" + f", blockSize: {block_size}, headSizeQK: {head_size_qk}, headSizeVO: {head_size_vo}, numBlocks: {num_blocks}") + logging.info(f"Q1 shape: {self.q_split1.shape}") + logging.info(f"Q2 shape: {self.q_split2.shape}") + logging.info(f"K1 shape: {self.key_cache_split1.shape}") + logging.info(f"K2 shape: {self.key_cache_split2.shape}") + + OP_NAME = "MultiLatentAttentionOperation" + PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "maskType": 0, "cacheMode": 3, "calcType": 2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q_split1.npu(), + self.q_split2.npu(), + torch.tensor(self.key_cache_split1).npu(), + torch.tensor(self.key_cache_split2).npu(), + torch.tensor(self.block_tables).int().npu(), + torch.tensor(self.contex_lens).npu() + ]) + +if __name__ == '__main__': + unittest.main() -- Gitee From d053ce7b87b68e85f93d2dbe240ac28ce93a82be Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 19:03:28 +0800 Subject: [PATCH 2/5] remove strict mla nz --- .../multi_latent_attention_operation.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index 5a720458..d51e896d 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -111,12 +111,6 @@ static bool ParamCheck(const infer::MultiLatentAttentionParam &opParam) ATB_LOG(ERROR) << "only mtp(CALC_TYPE_SPEC) support mask"; return false; } - if ((opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || - opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && - opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::KROPE_CTKV) { - ATB_LOG(ERROR) << "CalcType is ring only support krppe ctkv"; - return false; - } return true; } -- Gitee From c7b8acddf659901bc7de4e89ddd509a932e5202e Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 19:28:00 +0800 Subject: [PATCH 3/5] fix check --- .../multi_latent_attention_operation.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index d51e896d..bbd059ee 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -111,6 +111,13 @@ static bool ParamCheck(const infer::MultiLatentAttentionParam &opParam) ATB_LOG(ERROR) << "only mtp(CALC_TYPE_SPEC) support mask"; return false; } + if ((opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || + opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && + opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::KROPE_CTKV && + opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::NZCACHE) { + ATB_LOG(ERROR) << "CalcType is ring only support krppe ctkv and nzcache"; + return false; + } return true; } -- Gitee From 079773c9cfec3d724f2adb83fc64e4a2584b4bcf Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 
Sep 2025 19:40:35 +0800 Subject: [PATCH 4/5] remove ODR --- src/torch_atb/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/torch_atb/CMakeLists.txt b/src/torch_atb/CMakeLists.txt index 3e525a8b..37a7f1a6 100644 --- a/src/torch_atb/CMakeLists.txt +++ b/src/torch_atb/CMakeLists.txt @@ -8,6 +8,8 @@ # See LICENSE in the root of the software repository for the full text of the License. # +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") + file(GLOB_RECURSE pybind11_source_files "*.cpp") pybind11_add_module(_C ${pybind11_source_files}) set_target_properties(_C PROPERTIES OUTPUT_NAME "_C" SUFFIX ".so") -- Gitee From 1fcd8f35362203575dff6a25506e2abbb6af9faf Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 20:12:21 +0800 Subject: [PATCH 5/5] remove test case --- tests/apitest/opstest/csv/multi_latent_attention.csv | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/apitest/opstest/csv/multi_latent_attention.csv b/tests/apitest/opstest/csv/multi_latent_attention.csv index 7de546d8..ef8654c9 100644 --- a/tests/apitest/opstest/csv/multi_latent_attention.csv +++ b/tests/apitest/opstest/csv/multi_latent_attention.csv @@ -1,14 +1,12 @@ CaseNum |CaseName |OpName |OpParam |InNum |InDType |InFormat |InShape |OutNum |OutDType |OutFormat |OutShape |DataGenType |DataGenRange |InTensorFile |OutTensorFile |TestType |TestLevel |FromModel |SocVersion |ExpectedError 1 |MultiLatentAttentionBadCaseHeadNum |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2|float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 2 |MultiLatentAttentionBadCaseKvHeadNumNot1 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":8,"kvHeadNum":2} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -3 |MultiLatentAttentionBadCaseInt8NzCacheHeadNum128 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":2,"headNum":128,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 4 |MultiLatentAttentionErrorQkScale |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1,"qkScale":100} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 5 |MultiLatentAttentionErrorMaskType |MultiLatentAttentionOperation |{"maskType":3,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | 
| |Ascend910B |C:ERROR_INVALID_PARAM 6 |MultiLatentAttentionInvalidCalcType |MultiLatentAttentionOperation |{"maskType":0,"calcType":5,"cacheMode":3,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 7 |MultiLatentAttentionInvalidCacheType |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":4,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 8 |MultiLatentAttentionBadCaseNotSupportedKvCache |MultiLatentAttentionOperation |{"maskType":0,"calcType":0,"cacheMode":0, "headNum": 8, "kvHeadNum": 1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 9 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -10 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":2,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 11 |MultiLatentAttentionWrongDimNum |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM_NUM 12 |MultiLatentAttentionErrorBatchExceeded |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;8200,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM 13 |MultiLatentAttentionBatchNotSame |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;64,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B 
|I:ERROR_INVALID_TENSOR_DIM -- Gitee