diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h
index 3b75e829ab92d5081c2769f268b35152c6fe5476..fa2eb3877d4d6c09008b541ec9ff2935c41bddbf 100644
--- a/impl/matmul/matmul_impl.h
+++ b/impl/matmul/matmul_impl.h
@@ -1861,8 +1861,17 @@ __aicore__ inline void MatmulImplBase<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG,
     if (enAtomic == 1) {
         SetAtomicAdd<DstT>();
     }
-    MATMUL_MODULE(CopyCubeOut)->Copy(gm, co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_,
-        var.baseUseN_, var.blockUseM_, var.blockUseN_, enSequentialWrite);
+
+    if (enSequentialWrite) {
+        MATMUL_MODULE(CopyCubeOut)
+            ->template Copy<true>(gm, co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
+                                  var.blockUseM_, var.blockUseN_);
+    } else {
+        MATMUL_MODULE(CopyCubeOut)
+            ->template Copy<false>(gm, co2Local, co1Local, var.curM_, var.curN_, var.baseUseM_, var.baseUseN_,
+                                   var.blockUseM_, var.blockUseN_);
+    }
+
     if (enAtomic != 0) {
         SetAtomicNone();
     }
diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h
index 73a35ec314d0329ac3b52052c5557c255f7df798..087a98f9de527b401ed98411c12aab4f465c83fc 100644
--- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h
+++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h
@@ -37,25 +37,17 @@ public:
      * @return: void
      */
     __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) {}
+
     /**
      * @description: Reset all should be called when matmul end
      * @param: void
      * @return: void
      */
     __aicore__ inline void Destroy() {}
-    /**
-     * @description: Get current index of iteration
-     * @param: curRow: current row index
-     * @param: curCol: current col index
-     * @return: current index of iteration
-     */
-    __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol)
-    {
-        return 0;
-    }
+
     /**
      * @description: Judge if data of current iteration is already in buffer
-     * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex
+     * @param: iterIndex: current index of iteration
      * @param: bufferPos: current buffer position
      * @return: true if already in buffer, else false
      */
@@ -63,9 +55,10 @@ public:
     {
         return false;
     }
+
     /**
      * @description: Get buffer only when hit
-     * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex
+     * @param: iterIndex: current index of iteration
      * @param: bufferPos: current buffer position
      * @return: tensor on L1
      */
@@ -73,6 +66,7 @@ public:
     {
         return NULL_TENSOR<TransT>;
     }
+
     /**
      * @description: Allocate one block of buffer, should be called only when current iterindex does not hit
      * @param: bufferPos: current buffer position
@@ -82,6 +76,7 @@ public:
     {
         return NULL_TENSOR<TransT>;
     }
+
     /**
      * @description: Free tensor, should be called after AllocTensor
      * @param: bufferPos: current buffer position
@@ -89,18 +84,21 @@ public:
      * @return: void
      */
     __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor<TransT>& tensor = NULL_TENSOR<TransT>) {}
+
     /**
      * @description: Reset the status of que in CubeInBuffer
      * @return: void
      */
     __aicore__ inline void Reset() {}
+
     /**
      * @description: Put tensor to buffer que
      * @param: tensor: target tensor on L1
-     * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex
+     * @param: iterIndex: current index of iteration (NOTE(review): EnQue below takes only tensor - confirm)
      * @return: void
      */
     __aicore__ inline void EnQue(LocalTensor<TransT>& tensor) {}
+
     /**
      * @description: Fetch tensor from que
      * @param: void
diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h
index 680fb949542058065f404c23f4152e1f67a3ab2a..ac45de199665874dd94063411fb755121fafddbe 100644
--- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h
+++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h
@@ -86,11 +86,6 @@ public:
         (void) qid_.DeQue();
     }
 
-    __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol)
-    {
-        return 0;
-    }
-
 private:
     typename CubeInQueType<INPUT_TYPE>::QUE qid_;
     LocalTensor<TransT> cacheHead_;
diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h
index 71935f7cf51d48647ae060518345b670a22b8060..4133f7e45afeff36d8d0db34bc2ed90e0dd1a9df 100644
--- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h
+++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h
@@ -47,6 +47,7 @@ public:
 
     /**
      * @description: Load input data to L1
+     * @param: ScheduleContext: generic type that will be used later to store the matmul context
      * @param: curRow: The row index of the matrixA/B to be loaded at current iterate
      * @param: curCol: The column index of the matrixA/B to be loaded at current iterate
      * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate
diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h
index ce966450edfd5d97f8ac91fc31aa9517372ffc95..7fb304cc3c022b2b03d30b2df10beef45a9203eb 100644
--- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h
+++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h
@@ -37,6 +37,7 @@ public:
      * @param: baseBlockHeight: The current block number of the matrixA tiles
      * @param: baseBlockWidth: The current handled block number of the matrixB tiles 
      * @param: enSequentialWrite: The data's write type on dst address, continue or flat write
+     * @param: ScheduleContext: generic type that will be used later to store the matmul context
      * @return: void
      */
     template <bool enSequentialWrite = false, typename ScheduleContext = int>