From c1c6835f9b689bb4b82c2993e14d1e5cd6236fa5 Mon Sep 17 00:00:00 2001 From: willow Date: Tue, 16 Sep 2025 14:31:33 +0800 Subject: [PATCH 1/3] bugfix:mla --- .../mixkernels/multi_latent_attention/op_kernel/mla.cce | 4 ++-- .../op_kernel/unpad_flash_attention_razor_fusion.cce | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce index db5c4a35..0e01495d 100644 --- a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce +++ b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce @@ -179,10 +179,10 @@ struct UbufAlloc static constexpr uint32_t ls32_quant_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lp_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lp32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; - static constexpr uint32_t mask_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; + static constexpr uint32_t mask32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; + static constexpr uint32_t mask_ubuf_offset = 3 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_ping_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; - static constexpr uint32_t mask32_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t ls16_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lm32_ubuf_offset = 6 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t hm32_ubuf_offset = 6 * UB_UINT8_BLOCK_SIZE_MLA + 1 * UB_UINT8_LINE_SIZE; diff --git a/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce b/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce index 2a7bd6ee..dbee1f95 100644 --- a/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce +++ b/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce @@ -941,6 +941,7 @@ public: if (sub_m <= 16) { PIPE_BARRIER(V); } + PIPE_BARRIER(V); for (uint32_t ms_idx = 0; ms_idx < loop_m; ms_idx++) { uint32_t nowm = (ms_idx == (loop_m - 1))? sub_m - ms_idx * sub_km : sub_km; uint32_t now_roundm = (nowm + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; -- Gitee From 7233bac5484153ff88baed0827e1adcddae85e87 Mon Sep 17 00:00:00 2001 From: willow Date: Wed, 17 Sep 2025 09:33:32 +0800 Subject: [PATCH 2/3] bugfix:razor --- .../mixkernels/multi_latent_attention/op_kernel/mla.cce | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce index 0e01495d..5885f3df 100644 --- a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce +++ b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce @@ -180,9 +180,10 @@ struct UbufAlloc static constexpr uint32_t lp_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lp32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t mask32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; - static constexpr uint32_t mask_ubuf_offset = 3 * UB_UINT8_BLOCK_SIZE_MLA; + static constexpr uint32_t mask_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_ping_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; + static constexpr uint32_t mask32_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t ls16_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lm32_ubuf_offset = 6 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t hm32_ubuf_offset = 6 * UB_UINT8_BLOCK_SIZE_MLA + 1 * UB_UINT8_LINE_SIZE; -- Gitee From 325e517dcb75f7cdd320ab77f91f86f6f8980686 Mon Sep 17 00:00:00 2001 From: willow Date: Wed, 17 Sep 2025 09:34:37 +0800 Subject: [PATCH 3/3] bugfix:razor --- src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce index 5885f3df..db5c4a35 100644 --- a/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce +++ b/src/kernels/mixkernels/multi_latent_attention/op_kernel/mla.cce @@ -179,7 +179,6 @@ struct UbufAlloc static constexpr uint32_t ls32_quant_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lp_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lp32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; - static constexpr uint32_t mask32_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t mask_ubuf_offset = 2 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_ping_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; static constexpr uint32_t lo_ubuf_offset = 4 * UB_UINT8_BLOCK_SIZE_MLA; -- Gitee