From 5b0b600ab317cdee116f2ebc52227f4426ff7738 Mon Sep 17 00:00:00 2001
From: zhuhaozhecool <zhuhaozhe2@huawei.com>
Date: Mon, 22 Sep 2025 20:11:59 +0800
Subject: [PATCH] fix illegal memory read in pp matmul I8 kernel

---
 .../pp_matmul_i8_kernel/op_kernel/pp_matmul.cce     |  9 ++++++++-
 .../op_kernel/pp_matmul_bf16.cce                    |  7 ++++---
 .../op_kernel/pp_matmul_i8_weight_nz.cce            | 13 ++++++++++---
 .../op_kernel/pp_matmul_nz_m300.cce                 | 13 ++++++++++---
 4 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce
index 1cd15bfb..1beff8e7 100644
--- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce
+++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce
@@ -24,6 +24,7 @@
 
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB
+constexpr uint32_t BLOCK_SIZE_8 = 8;
 constexpr uint32_t BLOCK_SIZE_16 = 16;
 constexpr uint32_t BLOCK_SIZE_32 = 32;
 constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256;           // 16 * 16
@@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val)
     return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16;
 }
 
+__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val)
+{
+    return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; // smallest multiple of 8 that is >= val
+}
+
 template <bool TA, bool TB, bool SPLIT_K = false, bool HAVE_BIAS = false, bool IS_INT8 = false, typename InDtype = half,
           typename OutDtype = half, typename BiasDtype = float, typename ScaleDtype = float>
 class PpMatmulInt {
@@ -163,6 +169,7 @@ public:
             uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
             uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
             uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+            uint32_t bias_n_round = RoundUp8(n_actual);
             uint32_t m_round = 0;
             uint32_t n_round = 0;
             uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0;
@@ -212,7 +219,7 @@ public:
                 WAIT_FLAG(MTE1, MTE2, EVENT_ID7);
                 gm_to_l1<ArchType::ASCEND_V220, BiasDtype, DataFormat::ND, DataFormat::ND>(bias_l1,              // dst
                                                                                            gm_bias[offset_bias], // src
-                                                                                           1, RoundUp16(1), 1, n_round,
+                                                                                           1, RoundUp16(1), 1, bias_n_round,
                                                                                            RoundUp16(n_round), n_round);
                 SET_FLAG(MTE2, MTE1, EVENT_ID6);
             }
diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce
index 85d802c1..44e2475b 100644
--- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce
+++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce
@@ -180,6 +180,7 @@ public:
             uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
             uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
             uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+            uint32_t bias_n_round = RoundUp8(n_actual);
             uint32_t m_round = 0;
             uint32_t n_round = 0;
             uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0;
@@ -240,7 +241,7 @@ public:
                 WAIT_FLAG(MTE1, MTE2, EVENT_ID7);
                 gm_to_l1<ArchType::ASCEND_V220, BiasDtype, DataFormat::ND, DataFormat::ND>(bias_l1,              // dst
                                                                                            gm_bias[offset_bias], // src
-                                                                                           1, RoundUp16(1), 1, n_round,
+                                                                                           1, RoundUp16(1), 1, bias_n_round,
                                                                                            RoundUp16(n_round), n_round);
                 SET_FLAG(MTE2, MTE1, EVENT_ID6);
             }
@@ -279,7 +280,7 @@ public:
                         l1_buf_b, gm_b[offset_b], k_actual, k_round, k, n_actual, n_round, n);
                 } else {
                     gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(
-                        l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n));
+                        l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n));
                 }
             }
             SET_FLAG(MTE2, MTE1, event_id + CONST_2);
@@ -372,7 +373,7 @@ public:
                             gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,
                                                                                                      gm_b[offset_b_next],
                                                                                                      k_actual_next,
-                                                                                                     k_round_next,
+                                                                                                     RoundUp16(k_actual_next),
                                                                                                      RoundUp16(k),
                                                                                                      n_actual,
                                                                                                      n_round,
diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce
index d3d58a48..a498351b 100644
--- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce
+++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce
@@ -23,6 +23,7 @@
 
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB
+constexpr uint32_t BLOCK_SIZE_8 = 8;
 constexpr uint32_t BLOCK_SIZE_16 = 16;
 constexpr uint32_t BLOCK_SIZE_32 = 32;
 constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256;           // 16 * 16
@@ -70,6 +71,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val)
     return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16;
 }
 
+__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val)
+{
+    return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; // smallest multiple of 8 that is >= val
+}
+
 template <bool TA,
           bool TB,
           bool SPLIT_K = false,
@@ -186,6 +192,7 @@ public:
             uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
             uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
             uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+            uint32_t bias_n_round = RoundUp8(n_actual);
             uint32_t m_round = 0;
             uint32_t n_round = 0;
             uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0;
@@ -236,7 +243,7 @@ public:
                                                                                            1,
                                                                                            RoundUp16(1),
                                                                                            1,
-                                                                                           n_round,
+                                                                                           bias_n_round,
                                                                                            RoundUp16(n_round),
                                                                                            n_round);
                 SET_FLAG(MTE2, MTE1, EVENT_ID6);
@@ -271,7 +278,7 @@ public:
                     l1_buf_b, gm_b[offset_b], n_actual, n_round, RoundUp16(n), k_actual, k_round, RoundUp32(k));
             } else {
                 gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(
-                    l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n));
+                    l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n));
             }
             SET_FLAG(MTE2, MTE1, event_id + CONST_2);
 
@@ -383,7 +390,7 @@ public:
                         gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(l1_buf_b_next,
                                                                                                  gm_b[offset_b_next],
                                                                                                  k_actual_next,
-                                                                                                 k_round_next,
+                                                                                                 RoundUp16(k_actual_next),
                                                                                                  RoundUp16(k),
                                                                                                  n_actual,
                                                                                                  n_round,
diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce
index c21ac739..c373e68f 100644
--- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce
+++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce
@@ -24,6 +24,7 @@
 
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB
 constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB
+constexpr uint32_t BLOCK_SIZE_8 = 8;
 constexpr uint32_t BLOCK_SIZE_16 = 16;
 constexpr uint32_t BLOCK_SIZE_32 = 32;
 constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256;           // 16 * 16
@@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val)
     return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16;
 }
 
+__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val)
+{
+    return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; // smallest multiple of 8 that is >= val
+}
+
 template <bool TA, bool TB, bool SPLIT_K = false, bool HAVE_BIAS = false, bool IS_INT8 = false, typename InDtype = half,
           typename OutDtype = half, typename BiasDtype = float, typename ScaleDtype = float>
 class PpMatmulInt {
@@ -163,6 +169,7 @@ public:
             uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
             uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
             uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+            uint32_t bias_n_round = RoundUp8(n_actual);
             uint32_t m_round = 0;
             uint32_t n_round = 0;
             uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0;
@@ -212,7 +219,7 @@ public:
                 WAIT_FLAG(MTE1, MTE2, EVENT_ID7);
                 gm_to_l1<ArchType::ASCEND_V220, BiasDtype, DataFormat::ND, DataFormat::ND>(bias_l1,              // dst
                                                                                            gm_bias[offset_bias], // src
-                                                                                           1, RoundUp16(1), 1, n_round,
+                                                                                           1, RoundUp16(1), 1, bias_n_round,
                                                                                            RoundUp16(n_round), n_round);
                 SET_FLAG(MTE2, MTE1, EVENT_ID6);
             }
@@ -242,7 +249,7 @@ public:
                     l1_buf_b, gm_b[offset_b], n_actual, n_round, RoundUp16(n), k_actual, k_round, RoundUp32(k));
             } else {
                 gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(
-                    l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n));
+                    l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n));
             }
             SET_FLAG(MTE2, MTE1, event_id + CONST_2);
 
@@ -323,7 +330,7 @@ public:
                             l1_buf_b_next, gm_b[offset_b_next], n_actual, n_round, RoundUp16(n), k_actual_next, k_round_next, RoundUp32(k));
                     } else {
                         gm_to_l1<ArchType::ASCEND_V220, InDtype, DataFormat::NZ, DataFormat::NZ>(
-                            l1_buf_b_next, gm_b[offset_b_next], k_actual_next, k_round_next, RoundUp16(k), n_actual, n_round, RoundUp32(n));
+                            l1_buf_b_next, gm_b[offset_b_next], k_actual_next, RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, RoundUp32(n));
                     }
                     SET_FLAG(MTE2, MTE1, event_id_next + CONST_2);
                 }
-- 
Gitee