From 5b0b600ab317cdee116f2ebc52227f4426ff7738 Mon Sep 17 00:00:00 2001 From: zhuhaozhecool Date: Mon, 22 Sep 2025 20:11:59 +0800 Subject: [PATCH] fix pp matmul I8 Kernel memory illegal read --- .../pp_matmul_i8_kernel/op_kernel/pp_matmul.cce | 9 ++++++++- .../op_kernel/pp_matmul_bf16.cce | 7 ++++--- .../op_kernel/pp_matmul_i8_weight_nz.cce | 13 ++++++++++--- .../op_kernel/pp_matmul_nz_m300.cce | 13 ++++++++++--- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce index 1cd15bfb..1beff8e7 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce index 85d802c1..44e2475b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce @@ -180,6 +180,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0; @@ -240,7 +241,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -279,7 +280,7 @@ public: l1_buf_b, gm_b[offset_b], k_actual, k_round, k, n_actual, n_round, n); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -372,7 +373,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce index d3d58a48..a498351b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce @@ -23,6 +23,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -70,6 +71,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template ( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -383,7 +390,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce index c21ac739..c373e68f 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -242,7 +249,7 @@ public: l1_buf_b, gm_b[offset_b], n_actual, n_round, RoundUp16(n), k_actual, k_round, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -323,7 +330,7 @@ public: l1_buf_b_next, gm_b[offset_b_next], n_actual, n_round, RoundUp16(n), k_actual_next, k_round_next, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b_next, gm_b[offset_b_next], k_actual_next, k_round_next, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b_next, gm_b[offset_b_next], k_actual_next, RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id_next + CONST_2); } -- Gitee