From 054bfa3e4a17db8995ae3b2738d9f8c195bb31cb Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Fri, 20 Sep 2024 02:04:36 +0000 Subject: [PATCH 1/3] fix sc problem Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl.h | 31 ++++++++++++++----------- impl/matmul/matmul_tiling_algorithm.cpp | 13 ++++++----- version.info | 2 +- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 78db7293..780de9f7 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -10127,10 +10127,11 @@ template ::CopyCo22GMNZ2NDOnTheFly( const GlobalTensor& gmC, const LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); const int oneBlockCount = ONE_BLK_SIZE / sizeof(DstT); int calcWidth = var.baseUseN_ / blockCount; - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; int blockLen = blockCount * sizeof(DstT) / ONE_BLK_SIZE; int srcRepeatGap = (var.blockUseM_ * BLOCK_CUBE * blockCount - blockCount) * sizeof(DstT) / ONE_BLK_SIZE; int tail = var.baseUseN_ % blockCount; @@ -10142,7 +10143,7 @@ __aicore__ inline void MatmulImpl::CopyCo22GMNZ2ND( const GlobalTensor& gmC, LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); int width = var.blockUseN_ * blockCount; if constexpr (IsSameType::value || IsSameType::value) { @@ -10427,15 +10429,15 @@ __aicore__ inline void MatmulImpl= width), - { KERNEL_LOG(KERNEL_ERROR, "N_ is %d, width is %d, N_ should be no less than width", N_, width); }); - int dstStride = (N_ - width) * sizeof(DstT) / ONE_BLK_SIZE; - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; - int offset = N_; + ASCENDC_ASSERT((dimN >= width), + { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); + int dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; + int offset = dimN; if (enSequentialWrite) { isGmAligned = (var.baseUseN_ % blockCount) == 0; dstStride = 0; @@ -10451,7 +10453,7 @@ __aicore__ inline void MatmulImpl::value) { CopyToGMForNotAligned(gmC, trans, blocklen, enSequentialWrite, isTragetAligned); } else { @@ -10522,9 +10524,10 @@ template ::CopyCo22UBNZ2ND( const LocalTensor& dst, const LocalTensor& src, bool enSequentialWrite) { + uint32_t dimN = (Kc_ != 0) ? Kc_ : N_; const int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - int dstOffset = var.curM_ * var.tiling_->baseM * N_ + var.curN_ * var.tiling_->baseN; - int offset = Ceil(N_, blockCount) * blockCount; + int dstOffset = var.curM_ * var.tiling_->baseM * dimN + var.curN_ * var.tiling_->baseN; + int offset = Ceil(dimN, blockCount) * blockCount; if (enSequentialWrite) { dstOffset = 0; offset = var.tiling_->baseN; @@ -10924,8 +10927,8 @@ __aicore__ inline MatmulImpl 16 or m,n<16 - const int32_t m0 = min(minMNSize, min(coreStatus.m, minTotalSize / n0)); - const int32_t k0 = min(min(minKSize / m0, minKSize / n0), coreStatus.k); + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); + const int32_t k0 = (m0 != 0 && n0 != 0) ? + min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; const int32_t dbBuffer = 2; // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) @@ -1670,12 +1671,12 @@ int MatmulTilingAlgorithm::GetBigPackageCondition(CoreStatusPack &coreStatus, void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes, const MatmulRunParas& params) { - blockDimRes.kNum = params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 + blockDimRes.kNum = (blockDim.k == 0) ? 0 : params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); - coreStatus.k = params.k32 / blockDim.k; + coreStatus.k = (blockDim.k == 0) ? 0 : params.k32 / blockDim.k; if (tilingIns_->enableSplitK_) { if (params.kMapped != params.k32) { // need check--splitK blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; @@ -1805,7 +1806,7 @@ bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreS coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); coreStatus.k = coreStatus.kDim == 1 ? params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); - params.nonFactorK = params.k32 % coreStatus.kDim == 0 ? false : true; + params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? false : true); return true; } return false; @@ -1859,7 +1860,7 @@ void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const M // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); if (mnCore > 1) { - const float optPoint = sqrt((params.m32 + 0.0) / params.n32 * mnCore); + const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); const int32_t mdim = static_cast(ceil(optPoint)); const int32_t ndim = static_cast(ceil(mnCore / optPoint)); MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); diff --git a/version.info b/version.info index c1fd1fdd..41cbf752 100644 --- a/version.info +++ b/version.info @@ -1 +1 @@ -Version=7.5.T6.0 +Version=7.5.T11.0 -- Gitee From 7bf9f9af527fff08126c64cb6888a08d19f7d34c Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Fri, 20 Sep 2024 02:17:32 +0000 Subject: [PATCH 2/3] fix Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 780de9f7..04772e66 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -10927,8 +10927,8 @@ __aicore__ inline MatmulImpl Date: Fri, 20 Sep 2024 02:27:13 +0000 Subject: [PATCH 3/3] FIX Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl.h | 4 ++-- impl/matmul/matmul_tiling_algorithm.cpp | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 04772e66..780de9f7 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -10927,8 +10927,8 @@ __aicore__ inline MatmulImpl 16 or m,n<16 const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? - min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; + const int32_t k0 = min(min(minKSize / m0, minKSize / n0), coreStatus.k); const int32_t dbBuffer = 2; // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) @@ -1671,12 +1670,12 @@ int MatmulTilingAlgorithm::GetBigPackageCondition(CoreStatusPack &coreStatus, void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes, const MatmulRunParas& params) { - blockDimRes.kNum = (blockDim.k == 0) ? 0 : params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 + blockDimRes.kNum = params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); - coreStatus.k = (blockDim.k == 0) ? 0 : params.k32 / blockDim.k; + coreStatus.k = params.k32 / blockDim.k; if (tilingIns_->enableSplitK_) { if (params.kMapped != params.k32) { // need check--splitK blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; @@ -1806,7 +1805,7 @@ bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreS coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); coreStatus.k = coreStatus.kDim == 1 ? params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); - params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? false : true); + params.nonFactorK = params.k32 % coreStatus.kDim == 0 ? false : true; return true; } return false; @@ -1860,7 +1859,7 @@ void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const M // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); if (mnCore > 1) { - const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); + const float optPoint = sqrt((params.m32 + 0.0) / params.n32 * mnCore); const int32_t mdim = static_cast(ceil(optPoint)); const int32_t ndim = static_cast(ceil(mnCore / optPoint)); MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); -- Gitee