From 632306efe23ffbe2ef14eb7ced0873f7aa3635ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5?= Date: Mon, 28 Jul 2025 11:58:40 +0800 Subject: [PATCH 1/2] add double buffer with odd batchNum --- .../iterator/batch_loop/batch_loop_multi.h | 69 +++++++++++++++---- .../copy_cube_in/batch/batch_copy_cube_in.h | 17 ++--- .../batch/batch_copy_cube_in_params.h | 5 ++ 3 files changed, 70 insertions(+), 21 deletions(-) diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h index 1a8379e1..410f2faf 100644 --- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h @@ -71,6 +71,16 @@ public: { outerIdx_++; dstOffset_ += batchCalcSize_; + if (newProcess_ && outerIdx_ == batchOuter_ - 1) { + const int32_t tail = inputBatchNum_ % batchA_; + batchA_ = tail == 0 ? mainBatchInner_ : tail; + batchB_ = tail == 0 ? mainBatchInner_ : tail; + batchNum_ = batchA_; + batchCalcSize_ = batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() * + MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + splitSize_ = (batchA_ >= DB_FACTOR) ? 
DB_FACTOR : 1; + splitBatchNum_ = batchNum_ / splitSize_; + } } __aicore__ inline bool OuterEnd() @@ -78,6 +88,11 @@ public: return outerIdx_ >= batchOuter_; } + __aicore__ inline int32_t GetMainBatchBlock() const + { + return mainBatchInner_; // batchNum main block in outLoop + } + __aicore__ inline uint32_t GetOuterIndex() const { return outerIdx_; @@ -161,7 +176,15 @@ public: __aicore__ inline bool InnerEnd() { - return innerIdx_ >= splitBatchNum_ || splitOuterIdx_ * splitBatchNum_ >= batchNum_; + if ((!newProcess_) || (batchNum_ % DB_FACTOR == 0) || (splitSize_ < DB_FACTOR)) { + return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_); + } + const auto firstBatchNum = batchNum_ / splitSize_; + if (splitOuterIdx_ < 1) { + return innerIdx_ >= firstBatchNum; + } else { + return innerIdx_ >= batchNum_ - firstBatchNum; + } } __aicore__ inline uint32_t GetInnerIndex() const @@ -274,26 +297,42 @@ private: int32_t multiples = batchNumLarge / batchNumLess; int32_t singleBatchSize = multiples * largeMatrixSingleBatchSize + lessMatrixSingleBatchSize; int32_t batchInner = TOTAL_L1_SIZE / singleBatchSize; + inputBatchNum_ = batchNumLarge; + ASSERT(batchInner > 0); - while (batchNumLess % batchInner != 0 && batchInner > 0) { - --batchInner; + newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0) && (inputBatchNum_ >= batchInner); + if (newProcess_) { + mainBatchInner_ = batchInner; + batchOuter_ = CeilT(batchNumLess, batchInner); + batchA_ = batchInner; + batchB_ = batchInner; + } else { + while (batchNumLess % batchInner != 0 && batchInner > 0) { + --batchInner; + } + mainBatchInner_ = batchInner; + batchOuter_ = batchNumLess / batchInner; + batchA_ = multiples * batchInner; + batchB_ = batchInner; } - batchOuter_ = batchNumLess / batchInner; - batchA_ = multiples * batchInner; - batchB_ = batchInner; } __aicore__ inline void UpdateBatchNumParams() { batchNum_ = batchA_ > batchB_ ? 
batchA_ : batchB_; - splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) && - (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; - splitBatchNum_ = batchNum_ / splitSize_; + if (!newProcess_ || batchA_ != batchB_) { + splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) && + (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; + splitBatchNum_ = batchNum_ / splitSize_; + } else { + splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; + splitBatchNum_ = batchNum_ / splitSize_; + } } __aicore__ inline void UpdateSplitParams() { - splitBatchIdx_ += batchNum_ / splitSize_; + splitBatchIdx_ += splitBatchNum_; } __aicore__ inline void UpdateInnerParams() @@ -301,9 +340,9 @@ private: innerBatchIdx_ = innerIdx_ + splitBatchIdx_; } - int32_t batchA_; - int32_t batchB_; - int32_t batchNum_; + int32_t batchA_; // outerLoop main/tail block + int32_t batchB_; // outerLoop main/tail block + int32_t batchNum_; // outerLoop main/tail block int32_t batchOuter_ = 1; constexpr static int32_t c0Size_ = AuxGetC0Size(); @@ -327,6 +366,10 @@ private: int32_t nBatchOutNum_ = 1; int32_t batchOutCacheNum_ = 0; int32_t batchOutOffsetNum_ = 0; + + int32_t inputBatchNum_ = 0; + bool newProcess_ = false; // new logical judgment condition for handling odd batchNum + int32_t mainBatchInner_ = 0; // outerLoop main block }; } // namespace Detail } // namespace Impl diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h index 809aab80..c9e86c49 100644 --- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -75,13 +75,13 @@ private: // Calculate batch outer loop offset // the parameter false means don't need to use constant parameters int64_t batchOffset = outerIdx * GetSingleSize() * - MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchMainBlock(); // Calculate iter numbers by line of 
BSNGD layout int32_t batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); // batchA_ or batchB_ int32_t iterNum = 1; + int32_t tmpBatchNum = batchNum / splitSize; UpdataBatchNum(batchNum, iterNum); - batchNum /= splitSize; // Calculate srcDValue for ND copy auto srcDValue = MATMUL_MODULE(BatchCopyCubeInParams)->template GetBatchOrgWidth(); @@ -90,12 +90,13 @@ private: // if user input matrixStride, use matrixStride as srcStride auto srcStride = matrixStride != 0 ? matrixStride : GetSrcStride(); auto dstStride = MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleSizeAlign(); - int64_t srcOffset = batchNum * splitIdx * srcStride; - int64_t dstOffset = batchNum * splitIdx * dstStride; + int64_t srcOffset = tmpBatchNum * splitIdx * srcStride; + int64_t dstOffset = tmpBatchNum * splitIdx * dstStride; + auto paramsBatchNum = splitIdx == 0 ? tmpBatchNum : batchNum - tmpBatchNum; // Calculate src and dst stride of one line - auto iterSrcStride = batchNum * GetSingleSize(); - auto iterDstStride = batchNum * GetSingleSize(); + auto iterSrcStride = paramsBatchNum * GetSingleSize(); + auto iterDstStride = paramsBatchNum * GetSingleSize(); // Complete datacopy by line GlobalTensor srcGlobal; @@ -103,7 +104,7 @@ private: srcGlobal.SetAddr(batchOffset); for (int32_t idx = 0; idx < iterNum; ++idx) { if (srcStride >= UINT16_MAX) { - for (int i = 0; i < batchNum; ++i) { + for (int i = 0; i < paramsBatchNum; ++i) { MATMUL_MODULE(DataCopyWrapper)->CopyND2NZ( dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleHeight(), @@ -116,7 +117,7 @@ private: dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleHeight(), MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleWidth(), - srcDValue, batchNum, srcStride, dstStride); + srcDValue, paramsBatchNum, srcStride, dstStride); } dstOffset += iterDstStride; srcOffset += iterSrcStride; diff --git 
a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h index fb2cfe19..6e3a04a9 100644 --- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h +++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h @@ -36,6 +36,11 @@ public: } } + __aicore__ inline uint32_t GetBatchMainBlock() + { + return MATMUL_MODULE(BatchLoop)->GetMainBatchBlock(); + } + template __aicore__ inline int32_t GetBatchOrgWidth() { -- Gitee From 778147409636dabb41a6732a328e2bac681b5c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5?= Date: Mon, 28 Jul 2025 15:05:01 +0800 Subject: [PATCH 2/2] Fix bias ground and data enter with NZ Precision errors --- .../iterator/batch_loop/batch_loop_multi.h | 14 +++++++------- .../stage/copy_cube_in/batch/batch_copy_cube_in.h | 13 ++++++++----- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h index 410f2faf..48f391b0 100644 --- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h @@ -74,7 +74,7 @@ public: if (newProcess_ && outerIdx_ == batchOuter_ - 1) { const int32_t tail = inputBatchNum_ % batchA_; batchA_ = tail == 0 ? mainBatchInner_ : tail; - batchB_ = tail == 0 ?
mainBatchInner_ : tail; + batchB_ = batchA_; batchNum_ = batchA_; batchCalcSize_ = batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); @@ -120,7 +120,7 @@ public: __aicore__ inline int32_t GetBiasBatchSrcOffset() const { - return outerIdx_ * batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + return outerIdx_ * mainBatchInner_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); } // Double Buffer Loop @@ -300,7 +300,7 @@ private: inputBatchNum_ = batchNumLarge; ASSERT(batchInner > 0); - newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0) && (inputBatchNum_ >= batchInner); + newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0); if (newProcess_) { mainBatchInner_ = batchInner; batchOuter_ = CeilT(batchNumLess, batchInner); @@ -320,12 +320,12 @@ private: __aicore__ inline void UpdateBatchNumParams() { batchNum_ = batchA_ > batchB_ ? batchA_ : batchB_; - if (!newProcess_ || batchA_ != batchB_) { - splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) && - (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; + if (batchOuter_ > 1 && batchA_ == batchB_) { + splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; splitBatchNum_ = batchNum_ / splitSize_; } else { - splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; + splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) && + (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; splitBatchNum_ = batchNum_ / splitSize_; } } diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h index c9e86c49..31edec87 100644 --- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -255,19 +255,22 @@ private: auto alignWidth = CeilAlign(MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleWidth(), c0Size_); // 2.
Calculate src and dst stride of one step - auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() / splitSize; + auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); + int32_t tmpBatchNum = batchNum / splitSize; + int64_t srcStride = alignWidth * alignHeight; int64_t dstStride = MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleSizeAlign(); - int64_t srcOffset = batchNum * splitIdx * srcStride; - int64_t dstOffset = batchNum * splitIdx * dstStride; + int64_t srcOffset = tmpBatchNum * splitIdx * srcStride; + int64_t dstOffset = tmpBatchNum * splitIdx * dstStride; + auto paramsBatchNum = splitIdx == 0 ? tmpBatchNum : batchNum - tmpBatchNum; // 3. loop copy NZ data by batch bool iskRowDirec = IS_KROW && IsSupportB8(); - auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride; + auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchMainBlock() * srcStride; GlobalTensor srcGlobal; srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); srcGlobal.SetAddr(batchOffset); - for (int i = 0; i < batchNum; ++i) { + for (int i = 0; i < paramsBatchNum; ++i) { MATMUL_MODULE(DataCopyWrapper)->CopyNZ2NZ( dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, alignHeight, alignWidth, alignHeight, iskRowDirec); -- Gitee