diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 78db7293716c941633750d64aaca0064fbb10f64..780de9f7d9ee657427e0a41c7175905a52ab81f3 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -10127,10 +10127,11 @@ template ::CopyCo22GMNZ2NDOnTheFly( const GlobalTensor& gmC, const LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); const int oneBlockCount = ONE_BLK_SIZE / sizeof(DstT); int calcWidth = var.baseUseN_ / blockCount; - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; int blockLen = blockCount * sizeof(DstT) / ONE_BLK_SIZE; int srcRepeatGap = (var.blockUseM_ * BLOCK_CUBE * blockCount - blockCount) * sizeof(DstT) / ONE_BLK_SIZE; int tail = var.baseUseN_ % blockCount; @@ -10142,7 +10143,7 @@ __aicore__ inline void MatmulImpl::CopyCo22GMNZ2ND( const GlobalTensor& gmC, LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); int width = var.blockUseN_ * blockCount; if constexpr (IsSameType::value || IsSameType::value) { @@ -10427,15 +10429,15 @@ __aicore__ inline void MatmulImpl= width), - { KERNEL_LOG(KERNEL_ERROR, "N_ is %d, width is %d, N_ should be no less than width", N_, width); }); - int dstStride = (N_ - width) * sizeof(DstT) / ONE_BLK_SIZE; - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; - int offset = N_; + ASCENDC_ASSERT((dimN >= width), + { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); + int dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; + int offset = dimN; if (enSequentialWrite) { isGmAligned = (var.baseUseN_ % blockCount) == 0; dstStride = 0; @@ -10451,7 +10453,7 @@ __aicore__ inline void MatmulImpl::value) { CopyToGMForNotAligned(gmC, trans, blocklen, enSequentialWrite, isTragetAligned); } else { @@ -10522,9 +10524,10 @@ template ::CopyCo22UBNZ2ND( const LocalTensor& dst, const LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; - int offset = Ceil(N_, blockCount) * blockCount; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; + int offset = Ceil(dimN, blockCount) * blockCount; if (enSequentialWrite) { dstOffset = 0; offset = var.tiling_->baseN; @@ -10924,8 +10927,8 @@ __aicore__ inline MatmulImpl 16 or m,n<16 - const int32_t m0 = min(minMNSize, min(coreStatus.m, minTotalSize / n0)); + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); const int32_t k0 = min(min(minKSize / m0, minKSize / n0), coreStatus.k); const int32_t dbBuffer = 2; diff --git a/version.info b/version.info index c1fd1fdd851ff93873b70743661fd3accf13efdf..41cbf7525f4d5a79085f5431fe26e3c83b0f58f6 100644 --- a/version.info +++ b/version.info @@ -1 +1 @@ -Version=7.5.T6.0 +Version=7.5.T11.0