From 632306efe23ffbe2ef14eb7ced0873f7aa3635ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5?=
 <huangjiaxuan4@huawei.com>
Date: Mon, 28 Jul 2025 11:58:40 +0800
Subject: [PATCH 1/2] add double buffer with odd batchNum

---
 .../iterator/batch_loop/batch_loop_multi.h    | 69 +++++++++++++++----
 .../copy_cube_in/batch/batch_copy_cube_in.h   | 17 ++---
 .../batch/batch_copy_cube_in_params.h         |  5 ++
 3 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
index 1a8379e1..410f2faf 100644
--- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
+++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
@@ -71,6 +71,16 @@ public:
     {
         outerIdx_++;
         dstOffset_ += batchCalcSize_;
+        if (newProcess_ && outerIdx_ == batchOuter_ - 1) {
+            const int32_t tail = inputBatchNum_ % batchA_;
+            batchA_ = tail == 0 ? mainBatchInner_ : tail;
+            batchB_ = tail == 0 ? mainBatchInner_ : tail;
+            batchNum_ = batchA_;
+            batchCalcSize_ = batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() *
+                MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN();
+            splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1;
+            splitBatchNum_ = batchNum_ / splitSize_;
+        }
     }
 
     __aicore__ inline bool OuterEnd()
@@ -78,6 +88,11 @@ public:
         return outerIdx_ >= batchOuter_;
     }
 
+    __aicore__ inline int32_t GetMainBatchBlock() const
+    {
+        return mainBatchInner_; // batch count of one main (non-tail) block of the outer loop
+    }
+
     __aicore__ inline uint32_t GetOuterIndex() const
     {
         return outerIdx_;
@@ -161,7 +176,15 @@ public:
 
     __aicore__ inline bool InnerEnd()
     {
-        return innerIdx_ >= splitBatchNum_ || splitOuterIdx_ * splitBatchNum_ >= batchNum_;
+        if ((batchNum_ % DB_FACTOR == 0) || (splitSize_ < DB_FACTOR)) { // even or unsplit batch: original check
+            return (innerIdx_ >= splitBatchNum_) || (splitOuterIdx_ * splitBatchNum_ >= batchNum_);
+        }
+        const auto firstBatchNum = batchNum_ / splitSize_; // integer division: share of the first split
+        if (splitOuterIdx_ < 1) {
+            return innerIdx_ >= firstBatchNum;
+        } else {
+            return innerIdx_ >= batchNum_ - firstBatchNum; // later split takes the rest of batchNum_
+        }
     }
 
     __aicore__ inline uint32_t GetInnerIndex() const
@@ -274,26 +297,42 @@ private:
         int32_t multiples = batchNumLarge / batchNumLess;
         int32_t singleBatchSize = multiples * largeMatrixSingleBatchSize + lessMatrixSingleBatchSize;
         int32_t batchInner = TOTAL_L1_SIZE / singleBatchSize;
+        inputBatchNum_ = batchNumLarge;
+
         ASSERT(batchInner > 0);
-        while (batchNumLess % batchInner != 0 && batchInner > 0) {
-            --batchInner;
+        newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0) && (inputBatchNum_ >= batchInner);
+        if (newProcess_) {
+            mainBatchInner_ = batchInner;
+            batchOuter_ = CeilT(batchNumLess, batchInner);
+            batchA_ = batchInner;
+            batchB_ = batchInner;
+        } else {
+            while (batchNumLess % batchInner != 0 && batchInner > 0) {
+                --batchInner;
+            }
+            mainBatchInner_ = batchInner;
+            batchOuter_ = batchNumLess / batchInner;
+            batchA_ = multiples * batchInner;
+            batchB_ = batchInner;
         }
-        batchOuter_ = batchNumLess / batchInner;
-        batchA_ = multiples * batchInner;
-        batchB_ = batchInner;
     }
 
     __aicore__ inline void UpdateBatchNumParams()
     {
         batchNum_ = batchA_ > batchB_ ? batchA_ : batchB_;
-        splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) &&
-            (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1;
-        splitBatchNum_ = batchNum_ / splitSize_;
+        if (!newProcess_ || batchA_ != batchB_) {
+            splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) &&
+                (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1;
+            splitBatchNum_ = batchNum_ / splitSize_;
+        } else {
+            splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1;
+            splitBatchNum_ = batchNum_ / splitSize_;
+        }
     }
 
     __aicore__ inline void UpdateSplitParams()
     {
-        splitBatchIdx_ += batchNum_ / splitSize_;
+        splitBatchIdx_ += splitBatchNum_; // advance by the cached batchNum_ / splitSize_
     }
 
     __aicore__ inline void UpdateInnerParams()
@@ -301,9 +340,9 @@ private:
         innerBatchIdx_ = innerIdx_ + splitBatchIdx_;
     }
 
-    int32_t batchA_;
-    int32_t batchB_;
-    int32_t batchNum_;
+    int32_t batchA_; // outerLoop main/tail block
+    int32_t batchB_; // outerLoop main/tail block
+    int32_t batchNum_; // outerLoop main/tail block
     int32_t batchOuter_ = 1;
     constexpr static int32_t c0Size_ = AuxGetC0Size<typename INPUT_TYPE::T>();
 
@@ -327,6 +366,10 @@ private:
     int32_t nBatchOutNum_ = 1;
     int32_t batchOutCacheNum_ = 0;
     int32_t batchOutOffsetNum_ = 0;
+
+    int32_t inputBatchNum_ = 0; // batchNumLarge recorded during batch calculation; used to size the tail block
+    bool newProcess_ = false; // enables the main/tail outer-loop split for batchNum not divisible by DB_FACTOR
+    int32_t mainBatchInner_ = 0; // batch count of one main (non-tail) outer-loop block
 };
 }  // namespace Detail
 }  // namespace Impl
diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
index 809aab80..c9e86c49 100644
--- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
+++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
@@ -75,13 +75,13 @@ private:
         // Calculate batch outer loop offset
         // the parameter false means don't need to use constant parameters
         int64_t batchOffset = outerIdx * GetSingleSize<IS_TRANS, false>() *
-                              MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum();
+                              MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchMainBlock();
 
         // Calculate iter numbers by line of BSNGD layout
         int32_t batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); // batchA_ or batchB_
         int32_t iterNum = 1;
+        int32_t tmpBatchNum = batchNum / splitSize;
         UpdataBatchNum(batchNum, iterNum);
-        batchNum /= splitSize;
 
         // Calculate srcDValue for ND copy
         auto srcDValue = MATMUL_MODULE(BatchCopyCubeInParams)->template GetBatchOrgWidth<IS_TRANS>();
@@ -90,12 +90,13 @@ private:
         // if user input matrixStride, use matrixStride as srcStride
         auto srcStride = matrixStride != 0 ? matrixStride : GetSrcStride<IS_TRANS, false>();
         auto dstStride =  MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleSizeAlign<IS_TRANS>();
-        int64_t srcOffset = batchNum * splitIdx * srcStride;
-        int64_t dstOffset = batchNum * splitIdx * dstStride;
+        int64_t srcOffset = tmpBatchNum * splitIdx * srcStride;
+        int64_t dstOffset = tmpBatchNum * splitIdx * dstStride;
+        auto paramsBatchNum = splitIdx == 0 ? tmpBatchNum : batchNum - tmpBatchNum;
 
         // Calculate src and dst stride of one line
-        auto iterSrcStride = batchNum * GetSingleSize<IS_TRANS, false>();
-        auto iterDstStride = batchNum * GetSingleSize<IS_TRANS>();
+        auto iterSrcStride = paramsBatchNum * GetSingleSize<IS_TRANS, false>();
+        auto iterDstStride = paramsBatchNum * GetSingleSize<IS_TRANS>();
 
         // Complete datacopy by line
         GlobalTensor<SrcT> srcGlobal;
@@ -103,7 +104,7 @@ private:
         srcGlobal.SetAddr(batchOffset);
         for (int32_t idx = 0; idx < iterNum; ++idx) {
             if (srcStride >= UINT16_MAX) {
-                for (int i = 0; i < batchNum; ++i) {
+                for (int i = 0; i < paramsBatchNum; ++i) {
                     MATMUL_MODULE(DataCopyWrapper)->CopyND2NZ(
                         dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0,
                         MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleHeight<IS_TRANS>(),
@@ -116,7 +117,7 @@ private:
                     dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0,
                     MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleHeight<IS_TRANS>(),
                     MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleWidth<IS_TRANS>(),
-                    srcDValue, batchNum, srcStride, dstStride);
+                    srcDValue, paramsBatchNum, srcStride, dstStride);
             }
             dstOffset += iterDstStride;
             srcOffset += iterSrcStride;
diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h
index fb2cfe19..6e3a04a9 100644
--- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h
+++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h
@@ -36,6 +36,11 @@ public:
         }
     }
 
+    __aicore__ inline uint32_t GetBatchMainBlock() // main-block batch count used for outer-loop offsets
+    { // NOTE(review): one value for both A and B, unlike GetBatchNum(); verify when batchA_ != batchB_
+        return MATMUL_MODULE(BatchLoop)->GetMainBatchBlock();
+    }
+
     template <bool IS_TRANS = false>
     __aicore__ inline int32_t GetBatchOrgWidth()
     {
-- 
Gitee


From 778147409636dabb41a6732a328e2bac681b5c9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5=EF=A3=B5?=
 <huangjiaxuan4@huawei.com>
Date: Mon, 28 Jul 2025 15:05:01 +0800
Subject: [PATCH 2/2] Fix bias ground and data enter with NZ Precision errors

---
 .../iterator/batch_loop/batch_loop_multi.h         | 14 +++++++-------
 .../stage/copy_cube_in/batch/batch_copy_cube_in.h  | 13 ++++++++-----
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
index 410f2faf..48f391b0 100644
--- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
+++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h
@@ -74,7 +74,7 @@ public:
         if (newProcess_ && outerIdx_ == batchOuter_ - 1) {
             const int32_t tail = inputBatchNum_ % batchA_;
             batchA_ = tail == 0 ? mainBatchInner_ : tail;
-            batchB_ = tail == 0 ? mainBatchInner_ : tail;
+            batchB_ = batchA_; // A and B use the same block size on this path
             batchNum_ = batchA_;
             batchCalcSize_ = batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() *
                 MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN();
@@ -120,7 +120,7 @@ public:
 
     __aicore__ inline int32_t GetBiasBatchSrcOffset() const
     {
-        return outerIdx_ * batchNum_ * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN();
+        return outerIdx_ * (newProcess_ ? mainBatchInner_ : batchNum_) * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN();
     }
 
     // Double Buffer Loop
@@ -300,7 +300,7 @@ private:
         inputBatchNum_ = batchNumLarge;
 
         ASSERT(batchInner > 0);
-        newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0) && (inputBatchNum_ >= batchInner);
+        newProcess_ = (multiples == 1) && (inputBatchNum_ % DB_FACTOR != 0);
         if (newProcess_) {
             mainBatchInner_ = batchInner;
             batchOuter_ = CeilT(batchNumLess, batchInner);
@@ -320,12 +320,12 @@ private:
     __aicore__ inline void UpdateBatchNumParams()
     {
         batchNum_ = batchA_ > batchB_ ? batchA_ : batchB_;
-        if (!newProcess_ || batchA_ != batchB_) {
-            splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) &&
-                (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1;
+        if (batchOuter_ > 1 && batchA_ == batchB_) { // several outer blocks with equal A/B batch counts
+            splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1; // split even if batchA_ % DB_FACTOR != 0
             splitBatchNum_ = batchNum_ / splitSize_;
         } else {
-            splitSize_ = (batchA_ >= DB_FACTOR) ? DB_FACTOR : 1;
+            splitSize_ = (batchNum_ >= DB_FACTOR) && (batchA_ % DB_FACTOR == 0) &&
+                (batchB_ % DB_FACTOR == 0) ? DB_FACTOR : 1; // otherwise split only exact multiples of DB_FACTOR
             splitBatchNum_ = batchNum_ / splitSize_;
         }
     }
diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
index c9e86c49..31edec87 100644
--- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
+++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h
@@ -255,19 +255,22 @@ private:
         auto alignWidth = CeilAlign(MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleWidth<IS_TRANS>(), c0Size_);
 
         // 2. Calculate src and dst stride of one step
-        auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() / splitSize;
+        auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum();
+        int32_t tmpBatchNum = batchNum / splitSize;
+
         int64_t srcStride = alignWidth * alignHeight;
         int64_t dstStride = MATMUL_MODULE(BatchCopyCubeInParams)->template GetSingleSizeAlign<IS_TRANS, IS_KROW>();
-        int64_t srcOffset = batchNum * splitIdx * srcStride;
-        int64_t dstOffset = batchNum * splitIdx * dstStride;
+        int64_t srcOffset = tmpBatchNum * splitIdx * srcStride;
+        int64_t dstOffset = tmpBatchNum * splitIdx * dstStride;
+        auto paramsBatchNum = splitIdx == 0 ? tmpBatchNum : batchNum - tmpBatchNum;
 
         // 3. loop copy NZ data by batch
         bool iskRowDirec = IS_KROW && IsSupportB8<TransT>();
-        auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride;
+        auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchMainBlock() * srcStride;
         GlobalTensor<SrcT> srcGlobal;
         srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_);
         srcGlobal.SetAddr(batchOffset);
-        for (int i = 0; i < batchNum; ++i) {
+        for (int i = 0; i < paramsBatchNum; ++i) {
             MATMUL_MODULE(DataCopyWrapper)->CopyNZ2NZ(
                 dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0,
                 alignHeight, alignWidth, alignHeight, iskRowDirec);
-- 
Gitee