// Patch summary. The unified diff below is flattened onto two physical lines; its text is unchanged.
//   1. coc_const_args.cce: adds the constants DATA_COPY_MAX_BLOCK_COUNT (4095) and
//      DATA_COPY_MAX_BLOCK_LEN (65535) after FLAG_OFFSET.
//   2. coc_preprocessor.cce: in two hunks (starting at original lines 200 and 451), caps
//      max_rows_per_loop at DATA_COPY_MAX_BLOCK_COUNT and max_cols_per_loop at
//      DATA_COPY_MAX_BLOCK_LEN immediately after both are derived from MAX_LEN and n_cols_round.
// NOTE(review): the constant names suggest per-call limits of a data-copy instruction
//   (block count / block length); the patch does not show where they are consumed - confirm.
// NOTE(review): the unit of DATA_COPY_MAX_BLOCK_LEN is not visible here; verify it matches the
//   unit of max_cols_per_loop before relying on the clamp.
diff --git a/src/kernels/lcal/src/kernels/coc_const_args.cce b/src/kernels/lcal/src/kernels/coc_const_args.cce index 9832d2943253aa9c6a2cd49292dcb56714513edd..b6dabdbb5fe4dc745b6c0942d1b53ab8cf9236bb 100644 --- a/src/kernels/lcal/src/kernels/coc_const_args.cce +++ b/src/kernels/lcal/src/kernels/coc_const_args.cce @@ -83,6 +83,9 @@ constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; // 2.5MB constexpr int32_t FLAG_OFFSET = (LCAL_BUFF_BYTES - FLAG_BUFF_BYTES) / sizeof(int32_t); // 201.5 * 1024 * 1024 +constexpr int32_t DATA_COPY_MAX_BLOCK_COUNT = 4095; +constexpr int32_t DATA_COPY_MAX_BLOCK_LEN = 65535; + enum QuantGranularity : int { QUANT_GRANULARITY_UNDEFINED = -1, PER_TENSOR = 0, diff --git a/src/kernels/lcal/src/kernels/coc_preprocessor.cce b/src/kernels/lcal/src/kernels/coc_preprocessor.cce index ce0de5678d532836bbae2c734c71a10e863df20d..ea6e92866c951ee2dfa4fce689589d1b170dc1b8 100644 --- a/src/kernels/lcal/src/kernels/coc_preprocessor.cce +++ b/src/kernels/lcal/src/kernels/coc_preprocessor.cce @@ -200,6 +200,8 @@ protected: int32_t n_cols_round = Block32B::AlignUp(n_cols); int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + max_rows_per_loop = max_rows_per_loop > DATA_COPY_MAX_BLOCK_COUNT ? DATA_COPY_MAX_BLOCK_COUNT : max_rows_per_loop; + max_cols_per_loop = max_cols_per_loop > DATA_COPY_MAX_BLOCK_LEN ? DATA_COPY_MAX_BLOCK_LEN : max_cols_per_loop; auto ub_base = reinterpret_cast<__ubuf__ MmadDtype *>((uintptr_t)0); @@ -451,6 +453,8 @@ private: int32_t n_cols_round = Block32B::AlignUp(n_cols); int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + max_rows_per_loop = max_rows_per_loop > DATA_COPY_MAX_BLOCK_COUNT ? 
DATA_COPY_MAX_BLOCK_COUNT : max_rows_per_loop; + max_cols_per_loop = max_cols_per_loop > DATA_COPY_MAX_BLOCK_LEN ? DATA_COPY_MAX_BLOCK_LEN : max_cols_per_loop; auto ub_vconv = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); auto ub_muls = reinterpret_cast<__ubuf__ half *>((uintptr_t)(MAX_LEN * sizeof(int8_t)));