diff --git a/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce b/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce index 2a7bd6ee4d699f33a4393e3f932412d2a638272a..dbee1f9523ae4350b8b0404071cb4b19bf39ba27 100644 --- a/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce +++ b/src/kernels/mixkernels/unpad_flash_attention/op_kernel/unpad_flash_attention_razor_fusion.cce @@ -941,6 +941,7 @@ public: if (sub_m <= 16) { PIPE_BARRIER(V); } + PIPE_BARRIER(V); for (uint32_t ms_idx = 0; ms_idx < loop_m; ms_idx++) { uint32_t nowm = (ms_idx == (loop_m - 1))? sub_m - ms_idx * sub_km : sub_km; uint32_t now_roundm = (nowm + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;