diff --git a/impl/matmul/math_util.cpp b/impl/matmul/math_util.cpp deleted file mode 100644 index 0d81640fd32f9685568034c163e486239574b738..0000000000000000000000000000000000000000 --- a/impl/matmul/math_util.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file math_util.cpp - * \brief - */ - -#include "math_util.h" -#include -#include -#include -namespace matmul_tiling { -constexpr static int32_t SEED_MAP_MIN = 16; -constexpr static int32_t SEED_MAP_MAX = 1024; -constexpr static int32_t FACTOR_NUM_LIMIT = 4; -constexpr static int32_t L0_FACTOR_NUM_LIMIT = 2; -constexpr static int32_t L1_FACTOR_NUM_LIMIT = 4; -constexpr static int32_t MIN_FACTOR_LIMIT = 8; -constexpr static int32_t L0_FACTOR_LIMIT = 64; -constexpr static int32_t L1_FACTOR_LIMIT = 128; - -bool MathUtil::IsEqual(float leftValue, float rightValue) -{ - return std::fabs(leftValue - rightValue) <= std::numeric_limits::epsilon(); -} - -int32_t MathUtil::CeilDivision(int32_t num1, int32_t num2) -{ - if (num2 == 0) { - return 0; - } - return (num1 + num2 - 1) / num2; -} - -int32_t MathUtil::Align(int32_t num1, int32_t num2) -{ - return CeilDivision(num1, num2) * num2; -} - -int32_t MathUtil::AlignDown(int32_t num1, int32_t num2) -{ - if (num2 == 0) { - return 0; - } - return (num1 / num2) * num2; -} - -bool MathUtil::CheckMulOverflow(int32_t a, int32_t b, int32_t &c) -{ - if (a > 0 && b > 0) { - if (a > (INT32_MAX / b)) { - return false; - } - } else { - return false; - } - c = a * b; - return true; -} - -int32_t MathUtil::MapShape(int32_t shape, bool roundUpFlag) -{ - // map numbers between 32 to 1024 to number of power of 2, and map numbers greater than 1024 to 1024. 
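// Illustrative sketch (helper name is hypothetical; assumes the same SEED_MAP_MIN/SEED_MAP_MAX
// constants of 16 and 1024): MapShape above rounds a shape to a nearby power of two in that range,
// with roundUpFlag selecting the next power of two instead of the previous one. A minimal
// standalone approximation of that behaviour:
static inline int32_t MapShapeSketch(int32_t shape, bool roundUp)
{
    if (shape < 16) {
        return shape;                      // small shapes pass through unchanged
    }
    int32_t seed = 16;
    while (seed < 1024 && (seed << 1) < shape) {
        seed <<= 1;                        // largest power of two strictly below shape
    }
    return roundUp ? (seed << 1) : seed;   // e.g. MapShapeSketch(100, true) == 128
}
// MapShapeSketch(2000, false) returns 1024, i.e. oversized shapes saturate at SEED_MAP_MAX.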
- uint32_t seed = static_cast(SEED_MAP_MIN); - if (shape < static_cast(seed)) { - return shape; - } - while (static_cast(seed) < SEED_MAP_MAX) { - if (static_cast(seed) < shape && static_cast(seed << 1U) >= shape) { - break; - } - seed = seed << 1U; - } - if (roundUpFlag) { - seed = seed << 1U; - } - return static_cast(seed); -} - -void MathUtil::GetFactors(std::vector &factorList, int32_t srcNum, int32_t minFactor, int32_t maxFactor) -{ - for (int32_t factor = maxFactor; factor >= minFactor; factor--) { - if (srcNum % factor == 0) { - factorList.push_back(factor); - } - } -} - -void MathUtil::GetFactors(std::vector &factorList, int32_t srcNum, int32_t maxFactor) -{ - int32_t maxNum = std::min(srcNum, maxFactor); - for (int32_t factor = 1; factor <= maxNum; factor++) { - if (srcNum % factor == 0) { - factorList.push_back(factor); - } - } -} - -void MathUtil::GetFactorCnt(const int32_t shape, int32_t &factorCnt, const int32_t factorStart, const int32_t factorEnd) -{ - for (int32_t i = factorStart; i <= factorEnd; i++) { - if (shape < i) { - return; - } - if (shape % i == 0) { - ++factorCnt; - } - } -} - -void MathUtil::GetFactorLayerCnt(const int32_t shape, int32_t &factorCnt, const int32_t factorStart, - const int32_t factorEnd) -{ - std::vector factorList; - MathUtil::GetFactors(factorList, shape, factorStart, factorEnd); - for (const auto factor : factorList) { - int32_t fcnt = 0; - GetFactorCnt(factor, fcnt, 1, factor + 1); - factorCnt = fcnt >= factorCnt ? fcnt : factorCnt; - } -} - -void MathUtil::AddFactor(std::vector &dimsFactors, int32_t dim) -{ - dimsFactors.push_back(dim); - sort(dimsFactors.begin(), dimsFactors.end()); - (void)dimsFactors.erase(unique(dimsFactors.begin(), dimsFactors.end()), dimsFactors.cend()); -} - -int32_t MathUtil::GetNonFactorMap(std::vector &factorList, int32_t srcNum, int32_t maxFactor) -{ - int32_t factorCnt = 0; - int32_t mapFactor = srcNum; - MathUtil::GetFactorLayerCnt(srcNum, factorCnt, 1, maxFactor); - if (srcNum > 1 && factorCnt <= FACTOR_NUM_LIMIT) { - mapFactor = MathUtil::MapShape(srcNum, true); - } - GetFactors(factorList, mapFactor, maxFactor); - return mapFactor; -} - -void MathUtil::GetBlockFactors(std::vector &factorList, const int32_t oriShape, const int32_t mpShape, - const int32_t coreNum, const int32_t maxNum) -{ - // get all factors of ori_shape/mapshape/coreNum which smaller or equal to maxNum - for (int32_t i = 1; i <= maxNum; ++i) { - if ((oriShape % i == 0) || (mpShape % i == 0) || (coreNum % i == 0)) { - factorList.push_back(i); - } - } -} - -bool MathUtil::CheckFactorNumSatisfy(const int32_t dim) -{ - if (dim <= MIN_FACTOR_LIMIT) { - return true; - } - int32_t factorL0Cnt = 0; - int32_t factorL1Cnt = 0; - MathUtil::GetFactorLayerCnt(dim, factorL0Cnt, 1, L0_FACTOR_LIMIT); - if (dim > L1_FACTOR_LIMIT) { - MathUtil::GetFactorLayerCnt(dim, factorL1Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT); - } - bool factorNumNotSatisfied = (factorL0Cnt <= L0_FACTOR_NUM_LIMIT) || - ((dim > L1_FACTOR_LIMIT) && (factorL0Cnt + factorL1Cnt <= L1_FACTOR_NUM_LIMIT)); - return !factorNumNotSatisfied; -} - -int32_t MathUtil::FindBestSingleCore(const int32_t oriShape, const int32_t mappedShape, const int32_t coreNum, - bool isKDim) -{ - int32_t bestSingleCore = oriShape; - int32_t realSingleCore = MathUtil::CeilDivision(oriShape, coreNum); - int32_t mappedSingleCore = MathUtil::CeilDivision(mappedShape, coreNum); - - if (isKDim) { - int32_t bestShape = (oriShape % coreNum == 0) ? 
oriShape : mappedShape; - bestSingleCore = MathUtil::CeilDivision(bestShape, coreNum); - return bestSingleCore; - } - - if (coreNum == 1 && CheckFactorNumSatisfy(bestSingleCore)) { - return bestSingleCore; - } - - bestSingleCore = realSingleCore; - while (bestSingleCore != mappedSingleCore) { - if (CheckFactorNumSatisfy(bestSingleCore)) { - return bestSingleCore; - } - if (bestSingleCore < mappedSingleCore) { - ++bestSingleCore; - } else { - --bestSingleCore; - } - } - return bestSingleCore; -} -} // namespace matmul_tiling diff --git a/lib/matmul/matmul_call_back.h b/impl/matmul/matmul_call_back.h similarity index 98% rename from lib/matmul/matmul_call_back.h rename to impl/matmul/matmul_call_back.h index e2b7217dede5288193ebe4ad02ffe3ff2f4d0498..074cb3b7532a61949d4106a5cfc516a1b0aa7a0d 100644 --- a/lib/matmul/matmul_call_back.h +++ b/impl/matmul/matmul_call_back.h @@ -1,36 +1,36 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file matmul_call_back.h - * \brief - */ -#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H -#define LIB_MATMUL_MATMUL_CALL_BACK_H - -namespace matmul { -using namespace AscendC; -template &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> -struct MatmulCallBackFunc { - constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; - constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, - int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; - constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, - int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; -}; - -} // namespace matmul -#endif +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_call_back.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H +#define LIB_MATMUL_MATMUL_CALL_BACK_H + +namespace matmul { +using namespace AscendC; +template &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> +struct MatmulCallBackFunc { + constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; + constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, + int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; + constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, + int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; +}; + +} // namespace matmul +#endif \ No newline at end of file diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index bad9046a77c0ecff8247c5e7c97ebccb7babc863..06c975a9bd1210d329e7ad91814d69d6dce9b863 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -697,9 +697,9 @@ __aicore__ inline void MatmulImplsingleCoreK <= var.tiling_->baseK, { KERNEL_LOG(KERNEL_ERROR, - "When singleCoreK is larger than baseK, the parameter scheduleMode of MM_CFG should not be L0_MN_DB");}); + "When singleCoreK is larger than baseK, the parameter scheduleType of MM_CFG should not be OUTER_PRODUCT");}); } #endif var.tpipe_ = tpipe; @@ -836,7 +836,7 @@ __aicore__ inline void MatmulImpl= 220 - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { if constexpr (EnUnitFlag(MM_CFG)) { var.tpipe_->InitBuffer(var.CO1_, 2 * var.baseMN_ * sizeof(L0cT)); } else { @@ -884,10 +884,6 @@ __aicore__ inline void MatmulImplInitBuffer(var.qidCO2_, 1, var.baseMN_ * sizeof(DstT)); } - if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND || - !PhyPosIsUB(C_TYPE::pos)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->transLength); - } #endif InitShareBufEnd(var.tpipe_); @@ -1018,9 +1014,7 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - if constexpr (!MM_CFG.enVecND2NZ && (!PhyPosIsUB(C_TYPE::pos) || C_TYPE::format == CubeFormat::NZ)) { - shareUbSize = 0; - } + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1125,29 +1119,16 @@ __aicore__ inline void MatmulImpl= 220 if (var.tiling_->isBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } -#else - if constexpr (!MM_CFG.enVecND2NZ) { - if (var.tiling_->isBias) { - var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); - } - } #endif if constexpr ((IsSameType::value && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); } -#if __CCE_AICORE__ < 220 - // need extra ub when SetQuantTensor, may not use - if constexpr (C_TYPE::format 
== CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->baseN * sizeof(uint64_t)); - } -#endif #if (__CCE_AICORE__ < 200) var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_); var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_); @@ -1268,15 +1249,9 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); + uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - if constexpr (!MM_CFG.enVecND2NZ && (!PhyPosIsUB(C_TYPE::pos) || C_TYPE::format == CubeFormat::NZ)) { - shareUbSize = 0; - if constexpr (C_TYPE::format == CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - shareUbSize = var.tiling_->baseN * sizeof(uint64_t); - } - } + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1364,16 +1339,10 @@ uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); } #endif -#if __CCE_AICORE__ == 220 +#if __CCE_AICORE__ >= 220 if (var.tiling_->isBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } -#else - if constexpr (!MM_CFG.enVecND2NZ) { - if (var.tiling_->isBias) { - var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); - } - } #endif #if __CCE_AICORE__ == 220 @@ -1391,13 +1360,6 @@ uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); } #endif -#if __CCE_AICORE__ < 220 - // need extra ub when SetQuantTensor, may not use - if constexpr (C_TYPE::format == CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->baseN * sizeof(uint64_t)); - } -#endif #if (__CCE_AICORE__ < 200) var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_); var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_); @@ -2222,6 +2184,11 @@ __aicore__ inline void MatmulImpl(); LocalTensor co2Local; + if constexpr (C_TYPE::format == CubeFormat::NZ) { + event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); + SetFlag(eventIDMte3ToV); + WaitFlag(eventIDMte3ToV); + } if constexpr (MM_CFG.enVecND2NZ) { if constexpr (!MM_CFG.enableUBReuse) { co2Local = var.localWorkspace[var.tiling_->transLength * 2].template ReinterpretCast(); @@ -4218,7 +4185,7 @@ __aicore__ inline void MatmulImpl(); } - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { if (var.sMadNStep_ > var.tiling_->baseN) { // Means L0 N db, need to excute twice FixpipeOutToGm FixpipeOutToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite); var.baseUseN_ = (var.curN_ + 2 == var.nIter_) ? 
var.tailN_ : var.tiling_->baseN; // update next var.curN_ baseUseN_ @@ -4482,9 +4449,15 @@ __aicore__ inline void MatmulImpl& biasGlobal, LocalTensor& cMatrix, int col) { auto bias = var.qidBias_.template AllocTensor(); - auto blockLen = Ceil(var.baseUseN_ * sizeof(BiasT), ONE_BLK_SIZE); - DataCopy(bias, biasGlobal[col * var.tiling_->baseN], { (uint16_t)1, - (uint16_t)blockLen, (uint16_t)0, (uint16_t)0 }); + // if var.baseUseN_ is not 32B align, use DataCopy Nd2Nz + if ((var.baseUseN_ * sizeof(BiasT)) % ONE_BLK_SIZE != 0) { + Nd2NzParams intriParams{ 1, 1, (uint16_t)var.baseUseN_, 0, (uint16_t)var.baseUseN_, 1, 1, 1 }; + DataCopy(bias, biasGlobal[col * var.tiling_->baseN], intriParams); + } else { + auto blockLen = Ceil(var.baseUseN_ * sizeof(BiasT), ONE_BLK_SIZE); + DataCopy(bias, biasGlobal[col * var.tiling_->baseN], + { (uint16_t)1, (uint16_t)blockLen, (uint16_t)0, (uint16_t)0 }); + } // delete after tpipe supports bias queue var.qidBias_.EnQue(bias); } @@ -4511,7 +4484,7 @@ __aicore__ inline void MatmulImpl(); #endif - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { co1Local.SetSize(var.blockUseM_ * var.blockUseN_ * CUBE_MAX_SIZE * 2); } else { if constexpr (DoMatmulSpecialMDL(MM_CFG)) { @@ -5940,18 +5913,18 @@ __aicore__ inline void MatmulImplbaseN]; matmulInstr_.biasType_ = IsSameType::value ? 2 : 1; // 2:f32, 1:f16 matmulInstr_.sL1BiasOffset_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); if constexpr (A_TYPE::layout == LayoutMode::NONE || MM_CFG.batchMode == BatchMode::SINGLE_LARGE_THAN_L1) { var.qidBias_.FreeTensor(bias); } } else { matmulInstr_.biasType_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); } } else { matmulInstr_.biasType_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); } } @@ -6789,7 +6762,7 @@ template __aicore__ inline bool MatmulImpl::IterateNorm(bool enPartialSum) { - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { return IterateNormL0DB(enPartialSum); } if (unlikely(var.isFirstIter_)) { @@ -7150,8 +7123,8 @@ template __aicore__ inline bool MatmulImpl::IterateNormL0DB(bool enPartialSum) { - ASCENDC_ASSERT((MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB), { - KERNEL_LOG(KERNEL_ERROR, "IterateNormL0DB only support scheduleMode == L0_MN_DB"); + ASCENDC_ASSERT((MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT), { + KERNEL_LOG(KERNEL_ERROR, "IterateNormL0DB only support scheduleType == OUTER_PRODUCT"); }); if (unlikely(var.isFirstIter_)) { var.isFirstIter_ = false; @@ -8389,7 +8362,8 @@ __aicore__ inline void MatmulImpl quantLocalTensor; if constexpr (C_TYPE::format == CubeFormat::NZ) { - quantLocalTensor = var.calcBuf_.template Get(); + quantLocalTensor = + var.localWorkspace[var.tiling_->transLength].template ReinterpretCast(); } else if constexpr (MM_CFG.enVecND2NZ) { if constexpr (!MM_CFG.enableUBReuse) { quantLocalTensor = @@ -8420,7 +8394,8 @@ __aicore__ inline void MatmulImpl quantLocalTensor; if constexpr (C_TYPE::format == CubeFormat::NZ) { - quantLocalTensor = var.calcBuf_.template Get(); + quantLocalTensor = + var.localWorkspace[var.tiling_->transLength].template ReinterpretCast(); } else if constexpr (MM_CFG.enVecND2NZ) { if constexpr 
(!MM_CFG.enableUBReuse) { quantLocalTensor = @@ -8840,10 +8815,7 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventID); - WaitFlag(eventID); - eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); SetFlag(eventID); WaitFlag(eventID); } @@ -8864,19 +8836,15 @@ __aicore__ inline void MatmulImpl(); DataCopy(gmC[dstOffset + i * offset + var.baseUseN_], trans, { 1, 1, 0, 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventID); - WaitFlag(eventID); - } if (i < var.baseUseM_ - 1) { SetFlag(eventIDMte3ToMte2); } } } } - + event_t eventID = static_cast(GetTPipePtr()->AllocEventID()); + SetFlag(eventID); + WaitFlag(eventID); // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); } @@ -8901,15 +8869,8 @@ __aicore__ inline void MatmulImpl::value || IsSameType::value) { - if (var.baseUseN_ % 2 > 0) { - isOdd = true; - } - } - bool needDataCopyPad = !isTragetAligned && (M_ > var.singleCoreM_ || N_ > var.singleCoreN_ || isOdd); int gmOffset = blockCount * (blocklen - 2); - if (needDataCopyPad && blocklen == 1) { + if (!isTragetAligned && blocklen == 1) { auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); SetFlag(eventIDVToS); WaitFlag(eventIDVToS); @@ -8936,7 +8897,7 @@ __aicore__ inline void MatmulImpl 1) { + } else if (!isTragetAligned && blocklen > 1) { if constexpr (IsSameType::value || IsSameType::value) { LocalTensor transAligin; if constexpr (!MM_CFG.enableUBReuse) { @@ -9196,19 +9157,46 @@ __aicore__ inline void MatmulImplbaseN * M_ + var.curM_ * var.tiling_->baseM * BLOCK_CUBE; - int blockLen = var.blockUseM_ * BLOCK_CUBE * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - int dstStride = (M_ - var.baseUseM_) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - if (dstStride >= UINT16_MAX) { - int srcOffset = var.baseUseM_ * BLOCK_CUBE; - for (int i = 0; i < var.blockUseN_; ++i) { - DataCopy(gmC[dstOffset + i * M_ * BLOCK_CUBE], src[i * srcOffset], - { 1, static_cast(blockLen), 0, 0 }); - } + int64_t alignM; + int alignBaseUseM; + if constexpr (C_TYPE::format == CubeFormat::NZ){ + alignM = Ceil(M_, BLOCK_CUBE) * BLOCK_CUBE; + alignBaseUseM = Ceil(var.baseUseM_, BLOCK_CUBE) * BLOCK_CUBE; + } else { + alignM = M_; + alignBaseUseM = var.baseUseM_; + } + if constexpr (IsSameType::value || IsSameType::value) { + int64_t dstOffset = var.curN_ * var.tiling_->baseN * alignM + var.curM_ * var.tiling_->baseM * ONE_BLK_SIZE; + int blockLen = var.blockUseM_ * BLOCK_CUBE * sizeof(DstT); + int64_t dstStride = (alignM - alignBaseUseM) * sizeof(DstT); + int blockCount = Ceil(var.blockUseN_, 2); + if (dstStride >= UINT16_MAX) { + int srcOffset = var.baseUseM_ * ONE_BLK_SIZE; + for (int i = 0; i < blockCount; ++i) { + DataCopy(gmC[dstOffset + i * alignM * ONE_BLK_SIZE], src[i * srcOffset], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(gmC[dstOffset], src, + { static_cast(blockCount), static_cast(blockLen), 0, + static_cast(dstStride) }); + } } else { - DataCopy(gmC[dstOffset], src, - { static_cast(var.blockUseN_), static_cast(blockLen), 0, - static_cast(dstStride) }); + int64_t dstOffset = var.curN_ * var.tiling_->baseN * alignM + var.curM_ * var.tiling_->baseM * BLOCK_CUBE; + int blockLen = var.blockUseM_ * BLOCK_CUBE * BLOCK_CUBE * 
sizeof(DstT) / ONE_BLK_SIZE; + int64_t dstStride = (alignM - alignBaseUseM) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; + if (dstStride >= UINT16_MAX) { + int srcOffset = var.baseUseM_ * BLOCK_CUBE; + for (int i = 0; i < var.blockUseN_; ++i) { + DataCopy(gmC[dstOffset + i * alignM * BLOCK_CUBE], src[i * srcOffset], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(gmC[dstOffset], src, + { static_cast(var.blockUseN_), static_cast(blockLen), 0, + static_cast(dstStride) }); + } } } } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { diff --git a/impl/matmul/matmul_macro_v220_impl.h b/impl/matmul/matmul_macro_v220_impl.h index 9615936c389067d2ed26e3418203908b6b03be7d..1daafedd2a3659b551998d1cae6dd568ff3b29a7 100644 --- a/impl/matmul/matmul_macro_v220_impl.h +++ b/impl/matmul/matmul_macro_v220_impl.h @@ -106,7 +106,7 @@ public: inline __aicore__ void Init(); inline __aicore__ void Release(); template + ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, IterateOrder iterateOrder = IterateOrder::UNDEF> inline __aicore__ void Compute(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb = 0, uint8_t subIdx = 0); @@ -416,10 +416,6 @@ inline __aicore__ void MacroMatmul::value) { - l0bSrcAddrStride = l0bSrcAddrStride / 2; - l0bDstAddrStride = l0bDstAddrStride / 2; - } uint64_t l0bOffset = 0; for (uint64_t i = 0; i < l0bLoop; i++) { #if __CCE_AICORE__ >= 220 @@ -565,7 +561,7 @@ inline __aicore__ void MacroMatmul -template +template inline __aicore__ void MacroMatmul::Compute( const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb, uint8_t subIdx) @@ -588,12 +584,12 @@ inline __aicore__ void MacroMatmul(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail); return; } // n db - if constexpr (scheduleMode == ScheduleMode::L0_MN_DB && iterateOrder == IterateOrder::ORDER_M) { + if constexpr (scheduleType == ScheduleType::OUTER_PRODUCT && iterateOrder == IterateOrder::ORDER_M) { ComputeWithNdb(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail); return; } diff --git a/lib/matmul/matmul_server.h b/impl/matmul/matmul_sever.h similarity index 96% rename from lib/matmul/matmul_server.h rename to impl/matmul/matmul_sever.h index 5a671203562849324542ec453ec9cac85b7fd005..caf8a3def2834cb5dea8f5a4aa63e8cfb54c3488 100644 --- a/lib/matmul/matmul_server.h +++ b/impl/matmul/matmul_sever.h @@ -1,838 +1,859 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file matmul_server.h - * \brief - */ -#ifndef LIB_MATMUL_MATMUL_SERVER_H -#define LIB_MATMUL_MATMUL_SERVER_H - -#include "lib/matmul/matmul.h" -#include "kernel_operator.h" - -namespace matmul { -using namespace AscendC; -template struct IBShareCache { - __aicore__ inline IBShareCache() {}; -}; - -template <> -struct IBShareCache { - __aicore__ inline IBShareCache() {}; - using ShareCache = uint16_t; -}; - -template <> -struct IBShareCache { - __aicore__ inline IBShareCache() {}; - using ShareCache = GlobalCache; -}; -template __aicore__ constexpr bool IsIBShare() -{ - if (A_TYPE::ibShare == true) { - return true; - } - if (B_TYPE::ibShare == true) { - return true; - } - return false; -} - -struct MatmulMsg { - uint32_t setOrgShape : 1; - uint32_t orgM; - uint32_t orgN; - uint32_t orgKa; - uint32_t orgKb; - uint32_t orgKc; -}; - -struct ShareMatmulBase { - __aicore__ inline ShareMatmulBase() {}; -}; - -struct ShareMatmul : ShareMatmulBase { - __aicore__ inline ShareMatmul(){}; - MatmulMsg msg0; - MatmulMsg msg1; -}; - -template -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; -}; - -template <> -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; - using MSG = ShareMatmulBase; -}; - -template <> -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; - using MSG = ShareMatmul; -}; - -template > -class MatmulService { - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - -public: - __aicore__ inline MatmulService() {} - __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) - { - ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); - ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); - ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); - ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); - this->instID = instID; - this->kfcCommSrv = kfc; - this->tpipe = tpipe; - this->workspace = workspace; - mul.SetSubBlockIdx(kfcCommSrv->subBlockID); - if constexpr (!MM_CFG.enableInit) { - msgAux.msg0.setOrgShape = false; - msgAux.msg1.setOrgShape = false; - } - this->devEvtID = instID; - if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { - if (kfcCommSrv->subBlockID == 0) { - if (tiling) { - tiling_ = (TCubeTiling *)tiling; - gCache.template Init(tiling_, tpipe); - } - } - } - if (tiling) { - tiling_ = (TCubeTiling *)tiling; - mul.Init(tiling_, tpipe); - } - } - - __aicore__ inline void Init(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - return; - } else { - ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); - ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); - auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); - tiling_ = &tmpTiling_; - auto temp2 = (uint32_t*)tiling_; - - constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; - GlobalTensor tilingGlobal; - for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { - Barrier(); - tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); - DataCacheCleanAndInvalid(tilingGlobal); - } - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.Init(this->tiling_, this->tpipe); - } - } - - __aicore__ inline void 
SetSubBlockIdx(uint8_t idx) - { - mul.SetSubBlockIdx(idx); - } - - __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - if (mul.GetSubBlockIdx() == 0) { - msgAux.msg0.orgM = msg->orgShape.orgM; - msgAux.msg0.orgN = msg->orgShape.orgN; - msgAux.msg0.orgKa = msg->orgShape.orgKa; - msgAux.msg0.orgKb = msg->orgShape.orgKb; - msgAux.msg0.orgKc = msg->orgShape.orgKc; - msgAux.msg0.setOrgShape = true; - } else { - msgAux.msg1.orgM = msg->orgShape.orgM; - msgAux.msg1.orgN = msg->orgShape.orgN; - msgAux.msg1.orgKa = msg->orgShape.orgKa; - msgAux.msg1.orgKb = msg->orgShape.orgKb; - msgAux.msg1.orgKc = msg->orgShape.orgKc; - msgAux.msg1.setOrgShape = true; - } - } else { - mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, - msg->orgShape.orgKc); - } - } - - __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) - { - if (msg->body.setTail) { - mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); - } - } - - __aicore__ inline void SetTail(__gm__ KfcMsg* msg) - { - if (msg->body.setTail) { - mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); - } - } - - __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) - { - mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); - } - - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) - { - if (!msg->body.setTensorA) - return; - if constexpr (A_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorA(scalar); - return; - } - const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); - if constexpr (PhyPosIsL1(A_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); - } else { - GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr), size); - mul.SetTensorA(aGlobal, msg->body.isTransA); - } - } - - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) - { - if (!msg->body.setTensorA) { - return; - } - if constexpr (A_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorA(scalar); - return; - } - if constexpr (PhyPosIsL1(A_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); - } else { - GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr + offset), size); - mul.SetTensorA(aGlobal, msg->body.isTransA); - } - } - - __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) - { - if (!msg->body.setQuant) { - return; - } - int quantMode = msg->body.quantMode; - if (quantMode == 1) { - uint64_t quantScalar = msg->body.quantScalar; - mul.SetQuantScalar(quantScalar); - } else if (quantMode == 2) { - const uint64_t size = static_cast(msg->body.quantSize); - GlobalTensor quantGlobal; - quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); - mul.SetQuantVector(quantGlobal); - } - } - - __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout == 
LayoutMode::NONE) { - return; - } - if (!msg->body.setBatch) { - return; - } - mul.SetBatchNum(msg->body.batchA, msg->body.batchB); - } - - __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) - { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - mul.SetSelfDefineData(msg->body.dataPtr); - if constexpr (!MM_CFG.enableReuse) { - GlobalTensor dataGlobal; - dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); - DataCacheCleanAndInvalid(dataGlobal); - } - } - - __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) - { - mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); - } - - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) - { - if (!msg->body.setTensorB) - return; - if constexpr (B_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorB(scalar); - return; - } - const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); - } else { - GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); - } - } - - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) - { - if (!msg->body.setTensorB) { - return; - } - if constexpr (B_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorB(scalar); - return; - } - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); - } else { - GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr + offset), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); - } - } - - __aicore__ inline void SetBias(__gm__ KfcMsg* msg) - { - if (msg->body.setTensorBias) { - const uint64_t size = (uint64_t)tiling_->singleCoreN; - if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); - mul.SetBias(scmLocal); - } else { - GlobalTensor biasGlobal; - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size); - mul.SetBias(biasGlobal); - } - } else if (msg->body.setClearBias) { - mul.ClearBias(); - } - } - - __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) - { - if (msg->body.setTensorBias) { - const uint64_t size = (uint64_t)tiling_->singleCoreN; - if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); - mul.SetBias(scmLocal); - } else { - GlobalTensor biasGlobal; - biasGlobal.SetGlobalBuffer( - reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); - mul.SetBias(biasGlobal); - } - } else if (msg->body.setClearBias) { - mul.ClearBias(); - } - } - - __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - uint64_t 
size; - if constexpr (MM_CFG.baseMN != 0) { - size = MM_CFG.baseMN; - } else { - size = tiling_->baseM * tiling_->baseN; - } - if constexpr (PhyPosIsL1(C_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); - mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); - } else { - GlobalTensor cGlobal; - - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); - } - // Now release UB - if constexpr (PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - if (msg->body.sync == 1) { // Synchronize - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - return false; - } - - __aicore__ inline uint16_t GetInstID() - { - return instID; - } - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { - mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, - msgAux.msg0.orgKb, msgAux.msg0.orgKc); - } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { - mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, - msgAux.msg1.orgKb, msgAux.msg1.orgKc); - } - } - if (msg->body.isFirstIter) { - SetTensorA(msg); - SetTensorB(msg); - if constexpr (MM_CFG.enableSetBias) { - SetBias(msg); - } - if constexpr (MM_CFG.enableSetTail) { - SetTail(msg); - } - if constexpr (MM_CFG.enableQuantVector) { - SetQuantVector(msg); - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || - ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { - SetBatchNum(msg); - } - if constexpr (MM_CFG.enableSetDefineData) { - SetSelfDefineData(msg); - } - } - } - - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, - const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) - { - if (msg->body.isFirstIter) { - SetTensorA(msg, batchASize, offsetA); - SetTensorB(msg, batchBSize, offsetB); - SetBias(msg, offsetBias); - SetTail(msg); - SetQuantVector(msg); - SetBatchNum(msg); - } - } - - __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout == LayoutMode::NONE) { - return true; - } - // In the batch scenario, messages occupy 128 bytes. After the update, messages occupy 64 bytes. 
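// Illustrative sketch (hypothetical helper name): completion of a server-side request is signalled
// on a per-instance event ID computed exactly as in GetTensorC above — two IDs per matmul
// instance, one for each vector sub-block:
static inline uint16_t KfcEventIdSketch(uint16_t devEvtID, uint8_t subBlockIdx)
{
    // instance 0 uses events {0, 1}, instance 1 uses {2, 3}, and so on
    return static_cast<uint16_t>(devEvtID * 2 + subBlockIdx);
}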
- GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM - IterateSetMessage(msg); - uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); - - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - if (msg->body.sync || msg->body.waitIterateBatch) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - return true; - } - - __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.isNBatch) { - return true; - } - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM - const uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; - const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; - uint64_t batchAOffset = tiling_->ALayoutInfoD * msg->body.batchA; - if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { - batchAOffset = batchAOffset * tiling_->ALayoutInfoS; - } - const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; - uint64_t batchBOffset = tiling_->BLayoutInfoD * msg->body.batchB; - if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { - batchBOffset = batchBOffset * tiling_->BLayoutInfoS; - } - const uint64_t batchCOffset = tiling_->CLayoutInfoS2; - const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; - bool layoutGCondition = tiling_->CLayoutInfoG == 1 && - (tiling_->BLayoutInfoG != 1 || tiling_->ALayoutInfoG != 1); - int32_t layoutG = tiling_->BLayoutInfoG > tiling_->ALayoutInfoG ? 
tiling_->BLayoutInfoG : tiling_->ALayoutInfoG; - int32_t batchOffsetBias = tiling_->CLayoutInfoS2 * batchC; - if (layoutGCondition) { - batchOffsetBias = batchOffsetBias / layoutG; - } - int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); - if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { - batchOffsetC = batchOffsetC * tiling_->CLayoutInfoS1; - } - uint64_t offset = 0; - uint32_t cntIterator = 0; - for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { - const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); - const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); - const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); - IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); - cntIterator++; - if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - offset += batchOffsetC; - } - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - if (msg->body.sync || msg->body.waitIterateBatch) { - NotifyEvent(eventID); - } else if (cntIterator >= INC_PROCESS_CHECK) { - NotifyEvent(eventID); - } - return true; - } - - __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { - if (msg->body.iterateFakeMsg) { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg - uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); - NotifyEvent(eventID); - return true; - } - } - } else { - ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); - } - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && - IsSameType::value) || - (IsSameType::value && (IsSameType::value || - IsSameType::value))) { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - } - IterateSetMessage(msg); - uint64_t size; - if constexpr (MM_CFG.singleCoreMN != 0) { - size = MM_CFG.singleCoreMN; - } else { - size = tiling_->singleCoreM * tiling_->singleCoreN; - } - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); - uint64_t offset = 0; - uint64_t offsetSize = 0; - auto enSequentialWrite = msg->body.enSequentialWrite; - auto enAtomic = msg->body.enAtomic; - auto sync = msg->body.sync; - auto enPartialSum = msg->body.enPartialSum; - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { - ASSERT(msg->body.cAddr != 0); // The output address must be configured. 
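// Illustrative sketch (hypothetical helper): with enSequentialWrite set, each Iterate() call yields
// one baseM x baseN tile, and the iterate loop below advances the output address by offsetSize
// elements per tile, so tile i lands at:
static inline uint64_t SequentialTileOffsetSketch(uint64_t tileIdx, uint64_t baseM, uint64_t baseN)
{
    return tileIdx * baseM * baseN;  // element offset accumulated via `offset += offsetSize`
}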
- if constexpr (MM_CFG.baseMN != 0) { - offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; - } else { - offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; - } - } else { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { - ASSERT(msg->body.cAddr != 0); // The output address must be configured. - if constexpr (MM_CFG.baseMN != 0) { - offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; - } else { - offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; - } - } else if (sync == 0) { - // For asynchronous Iterate, the offset must be used for address calculation and - // the size is baseM x baseN. - if constexpr (MM_CFG.baseMN != 0) { - offsetSize = MM_CFG.baseMN; - } else { - offsetSize = tiling_->baseM * tiling_->baseN; - } - enSequentialWrite = 1; - } - } - uint32_t cntIterator = 0; - TRACE_START(TraceId::MatMul_CALC); - // Asynchronous and configure the workspace - while (mul.Iterate(enPartialSum)) { - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { - if (unlikely(cntIterator == 0)) { - if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { - TRACE_STOP(TraceId::MatMul_CALC); - return false; // The queue is not switched, and no message needs to be returned. - } - } - } - if constexpr (PhyPosIsL1(C_TYPE::pos)) { - mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); - } else { - mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); - } - cntIterator++; - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { - if (cntIterator < INC_PROCESS_CHECK) { - if (funID == KFC_Enum::MMFUN_ITERATE) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - } - } - offset += offsetSize; - } - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - if (sync || msg->body.waitIterateAll) { - ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); - NotifyEvent(eventID); - } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { - NotifyEvent(eventID); - } - mul.End(); - TRACE_STOP(TraceId::MatMul_CALC); - return true; - } - - __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && - IsSameType::value) || - (IsSameType::value && (IsSameType::value || - IsSameType::value))) { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - } - IterateSetMessage(msg); - if (mul.GetSubBlockIdx() == 0) { - return true; - } - uint64_t size; - if constexpr (MM_CFG.singleCoreMN != 0) { - size = MM_CFG.singleCoreMN; - } else { - size = tiling_->singleCoreM * tiling_->singleCoreN; - } - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, - msg->body.waitIterateAll, msg->body.iterateFakeMsg); - - uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); - uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); - if (msg->body.sync || 
msg->body.waitIterateAll) { - ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); - NotifyEvent(eventID0); - NotifyEvent(eventID1); - } - if (!msg->body.iterateFakeMsg) { - mul.End(); - } - TRACE_STOP(TraceId::MatMul_CALC); - return true; - } - - __aicore__ inline bool IsSharedMatmul() - { - if constexpr (MM_CFG.enableInit) { - return false; - } else { - return true; - } - } - - __aicore__ inline bool ProcessIbShareSync(KFC_Enum funID, bool& freeMsg, - int &lastMsgId, const int subBlockID) - { - if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || MM_CFG.intraBlockPartSum) { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { - if (lastMsgId == subBlockID) { - freeMsg = false; - return true; - } - lastMsgId = subBlockID; - return false; - } - return false; - } else { - return false; - } - } - - __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || - ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { - if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == - static_cast(KFC_Enum::MMFUN_MASK)) { - if constexpr (MM_CFG.intraBlockPartSum) { - return IterateIntraBlockPartSum(msg, funID); - } else { - return Iterate(msg, funID); - } - } - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0)) { - if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { - return IterateBatch(msg); - } - } - if constexpr (MM_CFG.enableEnd) { - if (funID == KFC_Enum::MMFUN_END) { - mul.End(); - } - } - if constexpr (MM_CFG.enableGetTensorC) { - if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { - return GetTensorC(msg); - } - } - if constexpr (MM_CFG.enableSetOrgShape) { - if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { - SetOrgShape(msg); - return true; - } - } - if constexpr (MM_CFG.enableInit) { - if (funID == KFC_Enum::MMFUN_INIT) { - Init(msg); - return true; - } - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { - if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { - return IterateNBatch(msg); - } - } - if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { - SetUserDefInfo(msg); - return true; - } - if (funID == KFC_Enum::MMFUN_SET_HF32) { - SetHF32(msg); - return true; - } - ASSERT("illegal function ID."); - return true; - } - - template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) - { - LocalTensor scmLocal; - TBuffAddr scmTbuf; - scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); - scmTbuf.dataLen = size * sizeof(DstT); - scmTbuf.bufferAddr = addr; -#if ASCENDC_CPU_DEBUG - scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; -#endif - scmLocal.SetAddr(scmTbuf); - return scmLocal; - } - -private: - MatmulImpl mul; - GM_ADDR workspace; - KfcCommServer* kfcCommSrv; - TPipe* tpipe; - TCubeTiling* tiling_; - TCubeTiling tmpTiling_; // for compatible with init interface - typename IBShareCache()>::ShareCache gCache; - typename ShareMatmulAux::MSG msgAux; - uint16_t instID; - uint16_t devEvtID; -}; -} // namespace matmul +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_server.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_SERVER_H +#define LIB_MATMUL_MATMUL_SERVER_H + +#include "../../lib/matmul/matmul.h" +#include "kernel_operator.h" + +namespace matmul { +using namespace AscendC; +template struct IBShareCache { + __aicore__ inline IBShareCache() {}; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = uint16_t; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = GlobalCache; +}; +template __aicore__ constexpr bool IsIBShare() +{ + if (A_TYPE::ibShare == true) { + return true; + } + if (B_TYPE::ibShare == true) { + return true; + } + return false; +} + +struct MatmulMsg { + uint32_t setOrgShape : 1; + uint32_t orgM; + uint32_t orgN; + uint32_t orgKa; + uint32_t orgKb; + uint32_t orgKc; +}; + +struct ShareMatmulBase { + __aicore__ inline ShareMatmulBase() {}; +}; + +struct ShareMatmul : ShareMatmulBase { + __aicore__ inline ShareMatmul(){}; + MatmulMsg msg0; + MatmulMsg msg1; +}; + +template +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmulBase; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmul; +}; + +__aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) +{ + SetAtomicNone(); +#if __CCE_AICORE__ == 220 + if ASCEND_IS_AIC { + SetMaskNorm(); + SetLoadDataBoundary((uint64_t)0); + SetLoadDataPaddingValue((uint64_t)0); + } else { + AscendCUtils::SetMask((uint64_t)-1, (uint64_t)-1); + SetMaskNorm(); + } +#endif + +#ifdef __DAV_C220_CUBE__ + ClearWorkspaceImpl(workspace); + uint16_t eventID = 3; + NotifyEvent(eventID); +#endif +} + +template > +class MatmulService { + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: + __aicore__ inline MatmulService() {} + __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) + { + ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); + ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); + ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); + ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); + this->instID = instID; + this->kfcCommSrv = kfc; + this->tpipe = tpipe; + this->workspace = workspace; + mul.SetSubBlockIdx(kfcCommSrv->subBlockID); + if constexpr (!MM_CFG.enableInit) { + msgAux.msg0.setOrgShape = false; + msgAux.msg1.setOrgShape = false; + } + this->devEvtID = instID; + if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { + if (kfcCommSrv->subBlockID == 0) { + if (tiling) { + tiling_ = (TCubeTiling *)tiling; + gCache.template Init(tiling_, tpipe); + } + } + } + if (tiling) { + tiling_ = (TCubeTiling *)tiling; + mul.Init(tiling_, tpipe); + } + } + + __aicore__ inline void Init(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + return; + } else { + ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); + 
ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); + auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); + tiling_ = &tmpTiling_; + auto temp2 = (uint32_t*)tiling_; + + constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + GlobalTensor tilingGlobal; + for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { + Barrier(); + tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); + DataCacheCleanAndInvalid(tilingGlobal); + } + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.Init(this->tiling_, this->tpipe); + } + } + + __aicore__ inline void SetSubBlockIdx(uint8_t idx) + { + mul.SetSubBlockIdx(idx); + } + + __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + if (mul.GetSubBlockIdx() == 0) { + msgAux.msg0.orgM = msg->orgShape.orgM; + msgAux.msg0.orgN = msg->orgShape.orgN; + msgAux.msg0.orgKa = msg->orgShape.orgKa; + msgAux.msg0.orgKb = msg->orgShape.orgKb; + msgAux.msg0.orgKc = msg->orgShape.orgKc; + msgAux.msg0.setOrgShape = true; + } else { + msgAux.msg1.orgM = msg->orgShape.orgM; + msgAux.msg1.orgN = msg->orgShape.orgN; + msgAux.msg1.orgKa = msg->orgShape.orgKa; + msgAux.msg1.orgKb = msg->orgShape.orgKb; + msgAux.msg1.orgKc = msg->orgShape.orgKc; + msgAux.msg1.setOrgShape = true; + } + } else { + mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, + msg->orgShape.orgKc); + } + } + + __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetTail(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) + { + mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorA) + return; + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorA) { + return; + } + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + 
aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr + offset), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) + { + if (!msg->body.setQuant) { + return; + } + int quantMode = msg->body.quantMode; + if (quantMode == 1) { + uint64_t quantScalar = msg->body.quantScalar; + mul.SetQuantScalar(quantScalar); + } else if (quantMode == 2) { + const uint64_t size = static_cast(msg->body.quantSize); + GlobalTensor quantGlobal; + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); + mul.SetQuantVector(quantGlobal); + } + } + + __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return; + } + if (!msg->body.setBatch) { + return; + } + mul.SetBatchNum(msg->body.batchA, msg->body.batchB); + } + + __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) + { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + mul.SetSelfDefineData(msg->body.dataPtr); + if constexpr (!MM_CFG.enableReuse) { + GlobalTensor dataGlobal; + dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); + DataCacheCleanAndInvalid(dataGlobal); + } + } + + __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) + { + mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorB) + return; + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorB) { + return; + } + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr + offset), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_->singleCoreN; + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size); + mul.SetBias(biasGlobal); + } + } else if 
(msg->body.setClearBias) { + mul.ClearBias(); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_->singleCoreN; + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer( + reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.ClearBias(); + } + } + + __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + uint64_t size; + if constexpr (MM_CFG.baseMN != 0) { + size = MM_CFG.baseMN; + } else { + size = tiling_->baseM * tiling_->baseN; + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } else { + GlobalTensor cGlobal; + + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } + // Now release UB + if constexpr (PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync == 1) { // Synchronize + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return false; + } + + __aicore__ inline uint16_t GetInstID() + { + return instID; + } + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { + mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, + msgAux.msg0.orgKb, msgAux.msg0.orgKc); + } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { + mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, + msgAux.msg1.orgKb, msgAux.msg1.orgKc); + } + } + if (msg->body.isFirstIter) { + SetTensorA(msg); + SetTensorB(msg); + if constexpr (MM_CFG.enableSetBias) { + SetBias(msg); + } + if constexpr (MM_CFG.enableSetTail) { + SetTail(msg); + } + if constexpr (MM_CFG.enableQuantVector) { + SetQuantVector(msg); + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || + ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + SetBatchNum(msg); + } + if constexpr (MM_CFG.enableSetDefineData) { + SetSelfDefineData(msg); + } + } + } + + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, + const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) + { + if (msg->body.isFirstIter) { + SetTensorA(msg, batchASize, offsetA); + SetTensorB(msg, batchBSize, offsetB); + SetBias(msg, offsetBias); + SetTail(msg); + SetQuantVector(msg); + SetBatchNum(msg); + } + } + + __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return true; + } + // In the batch scenario, messages occupy 128 bytes. After the update, messages occupy 64 bytes. 
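+        // "+ sizeof(int64_t)" below is int64_t pointer arithmetic (8 elements x 8 bytes = 64 bytes),
+        // so the clean/invalidate refreshes the second 64-byte cache line of the message before the
+        // batch fields are read from GM.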
+ GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM + IterateSetMessage(msg); + uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync || msg->body.waitIterateBatch) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.isNBatch) { + return true; + } + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM + const uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; + const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; + uint64_t batchAOffset = tiling_->ALayoutInfoD * msg->body.batchA; + if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { + batchAOffset = batchAOffset * tiling_->ALayoutInfoS; + } + const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; + uint64_t batchBOffset = tiling_->BLayoutInfoD * msg->body.batchB; + if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { + batchBOffset = batchBOffset * tiling_->BLayoutInfoS; + } + const uint64_t batchCOffset = tiling_->CLayoutInfoS2; + const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; + bool layoutGCondition = tiling_->CLayoutInfoG == 1 && + (tiling_->BLayoutInfoG != 1 || tiling_->ALayoutInfoG != 1); + int32_t layoutG = tiling_->BLayoutInfoG > tiling_->ALayoutInfoG ? 
tiling_->BLayoutInfoG : tiling_->ALayoutInfoG; + int32_t batchOffsetBias = tiling_->CLayoutInfoS2 * batchC; + if (layoutGCondition) { + batchOffsetBias = batchOffsetBias / layoutG; + } + int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); + if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { + batchOffsetC = batchOffsetC * tiling_->CLayoutInfoS1; + } + uint64_t offset = 0; + uint32_t cntIterator = 0; + for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { + const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); + const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); + const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); + IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + cntIterator++; + if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + offset += batchOffsetC; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (msg->body.sync || msg->body.waitIterateBatch) { + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK) { + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { + if (msg->body.iterateFakeMsg) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg + uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); + NotifyEvent(eventID); + return true; + } + } + } else { + ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); + } + if constexpr ((IsSameType::value && IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + uint64_t size; + if constexpr (MM_CFG.singleCoreMN != 0) { + size = MM_CFG.singleCoreMN; + } else { + size = tiling_->singleCoreM * tiling_->singleCoreN; + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + uint64_t offset = 0; + uint64_t offsetSize = 0; + auto enSequentialWrite = msg->body.enSequentialWrite; + auto enAtomic = msg->body.enAtomic; + auto sync = msg->body.sync; + auto enPartialSum = msg->body.enPartialSum; + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. 
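+            // offsetSize is the per-Iterate stride into the output: baseM * baseN elements when tiles
+            // are written back sequentially, and 0 when GetTensorC is left to derive each tile's
+            // position from the iteration state (enSequentialWrite == false).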
+ if constexpr (MM_CFG.baseMN != 0) { + offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; + } + } else { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (MM_CFG.baseMN != 0) { + offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; + } + } else if (sync == 0) { + // For asynchronous Iterate, the offset must be used for address calculation and + // the size is baseM x baseN. + if constexpr (MM_CFG.baseMN != 0) { + offsetSize = MM_CFG.baseMN; + } else { + offsetSize = tiling_->baseM * tiling_->baseN; + } + enSequentialWrite = 1; + } + } + uint32_t cntIterator = 0; + TRACE_START(TraceId::MatMul_CALC); + // Asynchronous and configure the workspace + while (mul.Iterate(enPartialSum)) { + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (unlikely(cntIterator == 0)) { + if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { + TRACE_STOP(TraceId::MatMul_CALC); + return false; // The queue is not switched, and no message needs to be returned. + } + } + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } else { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } + cntIterator++; + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (cntIterator < INC_PROCESS_CHECK) { + if (funID == KFC_Enum::MMFUN_ITERATE) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + } + } + offset += offsetSize; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { + NotifyEvent(eventID); + } + mul.End(); + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((IsSameType::value && IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + if (mul.GetSubBlockIdx() == 0) { + return true; + } + uint64_t size; + if constexpr (MM_CFG.singleCoreMN != 0) { + size = MM_CFG.singleCoreMN; + } else { + size = tiling_->singleCoreM * tiling_->singleCoreN; + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, + msg->body.waitIterateAll, msg->body.iterateFakeMsg); + + uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); + uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); + if (msg->body.sync || 
msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID0); + NotifyEvent(eventID1); + } + if (!msg->body.iterateFakeMsg) { + mul.End(); + } + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IsSharedMatmul() + { + if constexpr (MM_CFG.enableInit) { + return false; + } else { + return true; + } + } + + __aicore__ inline bool ProcessIbShareSync(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || MM_CFG.intraBlockPartSum) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + if (lastMsgId == subBlockID) { + freeMsg = false; + return true; + } + lastMsgId = subBlockID; + return false; + } + return false; + } else { + return false; + } + } + + __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || + ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { + if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == + static_cast(KFC_Enum::MMFUN_MASK)) { + if constexpr (MM_CFG.intraBlockPartSum) { + return IterateIntraBlockPartSum(msg, funID); + } else { + return Iterate(msg, funID); + } + } + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0)) { + if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { + return IterateBatch(msg); + } + } + if constexpr (MM_CFG.enableEnd) { + if (funID == KFC_Enum::MMFUN_END) { + mul.End(); + } + } + if constexpr (MM_CFG.enableGetTensorC) { + if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { + return GetTensorC(msg); + } + } + if constexpr (MM_CFG.enableSetOrgShape) { + if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { + SetOrgShape(msg); + return true; + } + } + if constexpr (MM_CFG.enableInit) { + if (funID == KFC_Enum::MMFUN_INIT) { + Init(msg); + return true; + } + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { + return IterateNBatch(msg); + } + } + if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { + SetUserDefInfo(msg); + return true; + } + if (funID == KFC_Enum::MMFUN_SET_HF32) { + SetHF32(msg); + return true; + } + ASSERT("illegal function ID."); + return true; + } + + template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) + { + LocalTensor scmLocal; + TBuffAddr scmTbuf; + scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); + scmTbuf.dataLen = size * sizeof(DstT); + scmTbuf.bufferAddr = addr; +#if ASCENDC_CPU_DEBUG + scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; +#endif + scmLocal.SetAddr(scmTbuf); + return scmLocal; + } + +private: + MatmulImpl mul; + GM_ADDR workspace; + KfcCommServer* kfcCommSrv; + TPipe* tpipe; + TCubeTiling* tiling_; + TCubeTiling tmpTiling_; // for compatible with init interface + typename IBShareCache()>::ShareCache gCache; + typename ShareMatmulAux::MSG msgAux; + uint16_t instID; + uint16_t devEvtID; +}; +} // namespace matmul #endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 2f9013ed759f9be7695532831390a71a61766a83..53fb12a4e06802738aa95b727568ed3f229beb4f 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1290,11 +1290,6 @@ void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32 // for ascend910b1 bias: gm -> l1 -> bt, need extra l1 
space, 支持bias随路转换 l1Size += tilingIns_->tiling_.get_baseN() * biasTypeSize; } - if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B && - tilingIns_->biasType_.pos != TPosition::VECCALC) { - // case1: in v100/v200 输入bias, 需要放到ub 上面参与运算, 空间大小为baseN * sizeof(biasType) - ubSize += tilingIns_->tiling_.get_baseN() * biasTypeSize; - } } // in v100/v200, nd2nz and nz2nd was simulated with intrins, need extra ub space @@ -1449,10 +1444,14 @@ void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) { biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); } + // quant tensor + if (tilingIns_->aType_.dataType == DataType::DT_INT8) { + int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t); + biasLength = max(quantLength, biasLength); + } } - int reuseCnt = 2; - transLength = max(max(a1Length, b1Length), max(c1Length / reuseCnt, biasLength / reuseCnt)); - return; + + transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); } bool MatmulTilingAlgorithm::CheckBaseMN() const @@ -2345,4 +2344,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling +} // namespace matmul_tiling \ No newline at end of file diff --git a/impl/matmul/matmul_tiling_algorithm.h b/impl/matmul/matmul_tiling_algorithm.h index 61c5d31b510ce35dd77c84249a8b642cfbdcd4fe..4e90f44f98d05f9b5f12d3870b310ea26b6369b3 100644 --- a/impl/matmul/matmul_tiling_algorithm.h +++ b/impl/matmul/matmul_tiling_algorithm.h @@ -395,4 +395,4 @@ private: }; } // namespace matmul_tiling -#endif // IMPL_MATMUL_MATMUL_TILING_ALGORITHM_H +#endif // IMPL_MATMUL_MATMUL_TILING_ALGORITHM_H \ No newline at end of file diff --git a/impl/matmul/matmul_utils.h b/impl/matmul/matmul_utils.h index 88ffc1c71e298e60bfd0973d73aba273dbaea85b..814890a2dbca48091fd3d6dc2f78d130256836cb 100644 --- a/impl/matmul/matmul_utils.h +++ b/impl/matmul/matmul_utils.h @@ -245,6 +245,11 @@ __aicore__ constexpr bool DoMatmulSpecialMDL(MatmulConfig mmCFG) return mmCFG.doSpecialMDL; } +__aicore__ constexpr bool IsSharedMatmul(MatmulConfig mmCFG) +{ + return !mmCFG.enableInit; +} + __aicore__ constexpr MatmulVersion GetMatmulVersion(MatmulConfig mmCFG) { if (DoMatmulNorm(mmCFG)) { diff --git a/lib/matmul/bmm_tiling.h b/lib/matmul/bmm_tiling.h index 83e27f02ad3ce26fb3c020a13066710c4cd8b284..9e044368de8ed40ea2e0f27955c6f682cfa4a1fe 100644 --- a/lib/matmul/bmm_tiling.h +++ b/lib/matmul/bmm_tiling.h @@ -79,4 +79,4 @@ int32_t MultiCoreMatmulGetTmpBufSize(optiling::TCubeTiling &tiling, matmul_tilin int32_t BatchMatmulGetTmpBufSize(optiling::TCubeTiling &tiling, matmul_tiling::SysTilingTempBufSize &bufSize); }; -#endif // LIB_MATMUL_BMM_TILING_H +#endif // LIB_MATMUL_BMM_TILING_H \ No newline at end of file diff --git a/lib/matmul/kernel_kfc.h b/lib/matmul/kernel_kfc.h index 451d2a3e225d7ba05a71b2a651f0cfb4c6152395..d850786c651dc0504ad83dfeffc7695d819efd64 100644 --- a/lib/matmul/kernel_kfc.h +++ b/lib/matmul/kernel_kfc.h @@ -1,407 +1,407 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file kernel_kfc.h - * \brief - */ -#ifndef LIB_MATMUL_KERNEL_KFC_H -#define LIB_MATMUL_KERNEL_KFC_H - -#if ASCENDC_CPU_DEBUG -#include -#include -#endif - -#include "kernel_operator.h" -#include "lib/matmul/matmul_client.h" -#include "lib/matmul/matmul_server.h" -namespace AscendC { -class KfcServer { // AIC side -public: - __aicore__ inline void Init(GM_ADDR workspaceGM) - { - ASSERT(workspaceGM != nullptr && "workspaceGM cannot be nullptr when init kfc server"); - - workspace = workspaceGM; - quitSize = 0; - for (int32_t i = 0; i < MIX_NUM; i++) { - kfcCommSrv[i].Init(workspace, i); // Initialize the message queue on the server. - } - } - - __aicore__ inline bool isRun() - { - // The function exits when all AIVs exit. The client sends a Quit message when the destructor ends. - return quitSize < MIX_NUM; - } - - template __aicore__ inline void Run(T& a, Args&&... b) - { - TRACE_START(TraceId::KFC_SERVER_RUN); - auto ptr = kfcCommSrv; - __gm__ KfcMsg* msg; - bool ret = true; - for (int i = 0; i < MIX_NUM;) { // Get messages of each AIV core in polling mode. - TRACE_START(TraceId::KFC_SERVER_REV_MSG); - msg = ptr->RcvMessage(); - TRACE_STOP(TraceId::KFC_SERVER_REV_MSG); - if (msg) { - // The check message is public - TRACE_START(TraceId::KFC_SERVER_PROCESS_MSG); - auto funID = KfcMsgGetFunID(msg->head); - auto srvID = static_cast(static_cast(funID) & - static_cast(KFC_Enum::SERVICE_ID_MASK)); - bool freeMsg = true; - if (srvID == KFC_Enum::SERVICE_ID_MATMUL) { - ret = RunAux(i, msg, funID, freeMsg, a, b...); - } else if (srvID == KFC_Enum::SERVICE_ID_SCM) { - if (funID == KFC_Enum::SCMFUN_GM2L1) { - ScmDataCopy(&msg->buffer); - } else if (funID == KFC_Enum::SCMFUN_GM2L1ND2NZ) { - ScmDataCopyND2NZ(&msg->buffer); - } - if (unlikely(msg->ubAddr >= 0)) { - ptr->FreeUB(msg->ubAddr); - } - } else if (funID == KFC_Enum::SERVICE_QUIT) { - quitSize++; - } else { - ASSERT("unsupported service id !"); - } - if (freeMsg) { - ptr->FreeMessage(msg); // Move the message backward by one after the message processed. - TRACE_STOP(TraceId::KFC_SERVER_PROCESS_MSG); - } else { - ptr->RollBackMsg(); - i++; - ptr++; - continue; - } - } - if (ret) { // =false, lock a queue and must wait for release. - i++; - ptr++; - } - } - TRACE_STOP(TraceId::KFC_SERVER_RUN); - } - - template __aicore__ inline void InitObj(TPipe* tpipe, T& a, Args&&... b) - { - if constexpr (sizeof(T) == sizeof(void*)) { // Skip previous invalid pointer for compatibility - InitObj(b...); - } else { - ASSERT(kfcCommSrv != nullptr && "kfc comm server cannot be nullptr when init obj"); - auto ptr = kfcCommSrv; - for (int i = 0; i < MIX_NUM; i++, ptr++) { - InitObjAux(tpipe, ptr, i, 0, a, b...); - } - } - } - - __aicore__ inline void Quit() - {} - - template __aicore__ static inline constexpr bool isTiling() - { - return sizeof(T) == sizeof(void*); - } - - template __aicore__ static T* GetTiling(T* t, Args&&... b) - { - return t; - } - -private: - template - __aicore__ inline bool RunAuxSkip(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, - T& a, Args&&... 
b) - { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - template - __aicore__ inline bool RunAux(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, T& a, Args&&... b) - { - ASSERT(msg != nullptr && "msg cannot be nullptr when kfc server run aux"); - ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); - if (a.mm.mm[0].IsSharedMatmul()) { - if (a.mm.mm[0].GetInstID() == KfcMsgGetInstID(msg->head)) { - if (a.mm.mm[0].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { - return true; - } - freeMsg = true; - a.mm.mm[0].SetSubBlockIdx(static_cast(subBlockID)); - return a.mm.mm[0].Process(msg, funID); - } else if constexpr (sizeof...(b) == 0) { - ASSERT(0); - return true; - } else if constexpr (isTiling()) { - if constexpr (sizeof...(b) > 1) { - return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); - } - } else if constexpr (sizeof...(b) >= 1) { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - return true; - } else { - if (a.mm.mm[subBlockID].GetInstID() == KfcMsgGetInstID(msg->head)) { - if (a.mm.mm[subBlockID].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { - return true; - } - freeMsg = true; - a.mm.mm[subBlockID].SetSubBlockIdx(static_cast(subBlockID)); - return a.mm.mm[subBlockID].Process(msg, funID); - } else if constexpr (sizeof...(b) == 0) { - ASSERT(0); - return true; - } else if constexpr (isTiling()) { - if constexpr (sizeof...(b) > 1) { - return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); - } - } else if constexpr (sizeof...(b) >= 1) { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - return true; - } - } - - template - __aicore__ inline void InitObjAuxSkip(TPipe* tpipe, KfcCommServer* kfc, int subBlockID, int instID, T* a, - Args&&... b) - { - InitObjAux(tpipe, kfc, subBlockID, instID, b...); - } - - template - __aicore__ inline void InitObjAux(TPipe *tpipe, KfcCommServer *kfc, int subBlockID, int instID, T &a, Args &&...b) - { - ASSERT(kfc != nullptr && "kfc cannot be nullptr when kfc server init obj aux"); - ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); - ASSERT(tpipe != nullptr); - ASSERT(instID >= 0 && instID < MAX_MATMUL_OBJ && "matmul instID id be [0, MAX_MATMUL_OBJ)"); - - if constexpr (sizeof...(b) == 0) { - if (a.mm.mm[0].IsSharedMatmul()) { - if (subBlockID == 0) { - a.mm.mm[0].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - } - } else if constexpr (isTiling()) { - auto tiling = GetTiling(b...); - if (a.mm.mm[0].IsSharedMatmul()) { - if (subBlockID == 0) { - a.mm.mm[0].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } else { - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - if constexpr (sizeof...(b) >= 1) { - InitObjAux(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } - - // Apply for two servers on the server. 
aic<->aiv 1:1 - KfcCommServer kfcCommSrv[MIX_NUM]; - GM_ADDR workspace; - uint8_t quitSize; - int lastMsgId = 1; -}; - -template -constexpr bool IsSharedMatmul() -{ - return !MM_CFG.enableInit; -} -template > -struct MatmulInstBase { - __aicore__ inline MatmulInstBase(){}; -}; -template -struct MatmulInstShared : MatmulInstBase { - __aicore__ inline MatmulInstShared(){}; - matmul::MatmulService mm[1]; -}; -template -struct MatmulInst : MatmulInstBase { - __aicore__ inline MatmulInst(){}; - matmul::MatmulService mm[MIX_NUM]; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; - using MATMUL = MatmulInstShared; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; - using MATMUL = MatmulInst; -}; - -template > -class MatmulServiceAux { - using SrcT = typename A_TYPE::T; - using SrcAT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - using handle = __gm__ MsgGroupSyncAux*; - -public: - __aicore__ inline MatmulServiceAux() {} - typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; - - // stub functions for MatmulImpl - __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; - - __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; - __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; - __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; - __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; - - __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTranspose = false){}; - - __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, - bool isTranspose = false){}; - __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTranspose = false){}; - - __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, - bool isTranspose = false){}; - __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; - __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTranspose = false){}; - __aicore__ inline void SetTensorB(const LocalTensor& righMatrix, bool isTranspose = false){}; - __aicore__ inline void SetBias(const LocalTensor& inputBias){}; - __aicore__ inline void SetTensorA(SrcAT aScalar){}; - __aicore__ inline void SetTensorB(SrcBT bScalar){}; - __aicore__ inline void ClearBias(){}; - __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} - __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} - __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} - __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} - template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; - template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; - __aicore__ inline void End(){}; - __aicore__ inline void SetHF32(bool enHF32 = false, int32_t transMode = 0){}; - - template __aicore__ inline bool Iterate(bool enPartialSum = false) - { - return false; - }; - template - __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, - bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; - template - __aicore__ inline void IterateAll(const LocalTensor& cMatrix, uint8_t enAtomic = 0){}; - __aicore__ 
inline void WaitIterateAll() {}; - template - __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, - bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, - uint32_t dstGap = 0) {}; - template - __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, - bool enSequentialWrite = false){}; - template - __aicore__ inline void GetTensorC(const GlobalTensor &c, const LocalTensor &cLocal, - uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; - template - __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) - { - GlobalTensor global; - return global; - }; - template - __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; - template - __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, - uint32_t dstGap = 0) {}; - __aicore__ inline void WaitIterateBatch() {}; - __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; - __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; - __aicore__ inline void WaitGetTensorC(){}; - template - __aicore__ inline MatrixOffset GetOffsetC() - { - if constexpr (isTurnOnDebug) { - static_assert(!isTurnOnDebug, "unsupported!"); - } - } -}; - -template -__aicore__ inline void SetMatrixKfcSkip(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, - T& mm, Args&&... b) -{ - SetMatrixKfc(pipe, kfcClient, instID, workspace, b...); -} - -template -__aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, - T& mm, Args&&... b) -{ - ASSERT((pipe != nullptr) && "pipe should not be nullptr."); - ASSERT((kfcClient != nullptr) && "kfcClient should not be nullptr."); - ASSERT((workspace != nullptr) && "workspace should not be nullptr."); - - if constexpr (sizeof...(b) == 0) { - InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); - } else if constexpr (KfcServer::isTiling()) { - auto tiling = KfcServer::GetTiling(b...); - InitKfcClient(mm, tiling, pipe, kfcClient, instID, workspace); - if constexpr (sizeof...(b) > 1) { - SetMatrixKfcSkip(pipe, kfcClient, instID + 1, workspace, b...); - } - } else { - InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); - if constexpr (sizeof...(b) >= 1) { - SetMatrixKfc(pipe, kfcClient, instID + 1, workspace, b...); - } - } -} -}; // namespace AscendC - -#endif +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file kernel_kfc.h + * \brief + */ +#ifndef LIB_MATMUL_KERNEL_KFC_H +#define LIB_MATMUL_KERNEL_KFC_H + +#if ASCENDC_CPU_DEBUG +#include +#include +#endif + +#include "kernel_operator.h" +#include "lib/matmul/matmul_client.h" +#include "lib/matmul/matmul_server.h" +namespace AscendC { +class KfcServer { // AIC side +public: + __aicore__ inline void Init(GM_ADDR workspaceGM) + { + ASSERT(workspaceGM != nullptr && "workspaceGM cannot be nullptr when init kfc server"); + + workspace = workspaceGM; + quitSize = 0; + for (int32_t i = 0; i < MIX_NUM; i++) { + kfcCommSrv[i].Init(workspace, i); // Initialize the message queue on the server. + } + } + + __aicore__ inline bool isRun() + { + // The function exits when all AIVs exit. The client sends a Quit message when the destructor ends. + return quitSize < MIX_NUM; + } + + template __aicore__ inline void Run(T& a, Args&&... b) + { + TRACE_START(TraceId::KFC_SERVER_RUN); + auto ptr = kfcCommSrv; + __gm__ KfcMsg* msg; + bool ret = true; + for (int i = 0; i < MIX_NUM;) { // Get messages of each AIV core in polling mode. + TRACE_START(TraceId::KFC_SERVER_REV_MSG); + msg = ptr->RcvMessage(); + TRACE_STOP(TraceId::KFC_SERVER_REV_MSG); + if (msg) { + // The check message is public + TRACE_START(TraceId::KFC_SERVER_PROCESS_MSG); + auto funID = KfcMsgGetFunID(msg->head); + auto srvID = static_cast(static_cast(funID) & + static_cast(KFC_Enum::SERVICE_ID_MASK)); + bool freeMsg = true; + if (srvID == KFC_Enum::SERVICE_ID_MATMUL) { + ret = RunAux(i, msg, funID, freeMsg, a, b...); + } else if (srvID == KFC_Enum::SERVICE_ID_SCM) { + if (funID == KFC_Enum::SCMFUN_GM2L1) { + ScmDataCopy(&msg->buffer); + } else if (funID == KFC_Enum::SCMFUN_GM2L1ND2NZ) { + ScmDataCopyND2NZ(&msg->buffer); + } + if (unlikely(msg->ubAddr >= 0)) { + ptr->FreeUB(msg->ubAddr); + } + } else if (funID == KFC_Enum::SERVICE_QUIT) { + quitSize++; + } else { + ASSERT("unsupported service id !"); + } + if (freeMsg) { + ptr->FreeMessage(msg); // Move the message backward by one after the message processed. + TRACE_STOP(TraceId::KFC_SERVER_PROCESS_MSG); + } else { + ptr->RollBackMsg(); + i++; + ptr++; + continue; + } + } + if (ret) { // =false, lock a queue and must wait for release. + i++; + ptr++; + } + } + TRACE_STOP(TraceId::KFC_SERVER_RUN); + } + + template __aicore__ inline void InitObj(TPipe* tpipe, T& a, Args&&... b) + { + if constexpr (sizeof(T) == sizeof(void*)) { // Skip previous invalid pointer for compatibility + InitObj(b...); + } else { + ASSERT(kfcCommSrv != nullptr && "kfc comm server cannot be nullptr when init obj"); + auto ptr = kfcCommSrv; + for (int i = 0; i < MIX_NUM; i++, ptr++) { + InitObjAux(tpipe, ptr, i, 0, a, b...); + } + } + } + + __aicore__ inline void Quit() + {} + + template __aicore__ static inline constexpr bool isTiling() + { + return sizeof(T) == sizeof(void*); + } + + template __aicore__ static T* GetTiling(T* t, Args&&... b) + { + return t; + } + +private: + template + __aicore__ inline bool RunAuxSkip(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, + T& a, Args&&... 
b) + { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + template + __aicore__ inline bool RunAux(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, T& a, Args&&... b) + { + ASSERT(msg != nullptr && "msg cannot be nullptr when kfc server run aux"); + ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); + if (a.mm.mm[0].IsSharedMatmul()) { + if (a.mm.mm[0].GetInstID() == KfcMsgGetInstID(msg->head)) { + if (a.mm.mm[0].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { + return true; + } + freeMsg = true; + a.mm.mm[0].SetSubBlockIdx(static_cast(subBlockID)); + return a.mm.mm[0].Process(msg, funID); + } else if constexpr (sizeof...(b) == 0) { + ASSERT(0); + return true; + } else if constexpr (isTiling()) { + if constexpr (sizeof...(b) > 1) { + return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); + } + } else if constexpr (sizeof...(b) >= 1) { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + return true; + } else { + if (a.mm.mm[subBlockID].GetInstID() == KfcMsgGetInstID(msg->head)) { + if (a.mm.mm[subBlockID].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { + return true; + } + freeMsg = true; + a.mm.mm[subBlockID].SetSubBlockIdx(static_cast(subBlockID)); + return a.mm.mm[subBlockID].Process(msg, funID); + } else if constexpr (sizeof...(b) == 0) { + ASSERT(0); + return true; + } else if constexpr (isTiling()) { + if constexpr (sizeof...(b) > 1) { + return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); + } + } else if constexpr (sizeof...(b) >= 1) { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + return true; + } + } + + template + __aicore__ inline void InitObjAuxSkip(TPipe* tpipe, KfcCommServer* kfc, int subBlockID, int instID, T* a, + Args&&... b) + { + InitObjAux(tpipe, kfc, subBlockID, instID, b...); + } + + template + __aicore__ inline void InitObjAux(TPipe *tpipe, KfcCommServer *kfc, int subBlockID, int instID, T &a, Args &&...b) + { + ASSERT(kfc != nullptr && "kfc cannot be nullptr when kfc server init obj aux"); + ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); + ASSERT(tpipe != nullptr); + ASSERT(instID >= 0 && instID < MAX_MATMUL_OBJ && "matmul instID id be [0, MAX_MATMUL_OBJ)"); + + if constexpr (sizeof...(b) == 0) { + if (a.mm.mm[0].IsSharedMatmul()) { + if (subBlockID == 0) { + a.mm.mm[0].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + } + } else if constexpr (isTiling()) { + auto tiling = GetTiling(b...); + if (a.mm.mm[0].IsSharedMatmul()) { + if (subBlockID == 0) { + a.mm.mm[0].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } else { + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + if constexpr (sizeof...(b) >= 1) { + InitObjAux(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } + + // Apply for two servers on the server. 
aic<->aiv 1:1 + KfcCommServer kfcCommSrv[MIX_NUM]; + GM_ADDR workspace; + uint8_t quitSize; + int lastMsgId = 1; +}; + +template +constexpr bool IsSharedMatmul() +{ + return !MM_CFG.enableInit; +} +template > +struct MatmulInstBase { + __aicore__ inline MatmulInstBase(){}; +}; +template +struct MatmulInstShared : MatmulInstBase { + __aicore__ inline MatmulInstShared(){}; + matmul::MatmulService mm[1]; +}; +template +struct MatmulInst : MatmulInstBase { + __aicore__ inline MatmulInst(){}; + matmul::MatmulService mm[MIX_NUM]; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInstShared; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInst; +}; + +template > +class MatmulServiceAux { + using SrcT = typename A_TYPE::T; + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + using handle = __gm__ MsgGroupSyncAux*; + +public: + __aicore__ inline MatmulServiceAux() {} + typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; + + // stub functions for MatmulImpl + __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; + __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; + __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; + + __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTranspose = false){}; + + __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, + bool isTranspose = false){}; + __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTranspose = false){}; + + __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, + bool isTranspose = false){}; + __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTranspose = false){}; + __aicore__ inline void SetTensorB(const LocalTensor& righMatrix, bool isTranspose = false){}; + __aicore__ inline void SetBias(const LocalTensor& inputBias){}; + __aicore__ inline void SetTensorA(SrcAT aScalar){}; + __aicore__ inline void SetTensorB(SrcBT bScalar){}; + __aicore__ inline void ClearBias(){}; + __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} + __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; + template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; + __aicore__ inline void End(){}; + __aicore__ inline void SetHF32(bool enHF32 = false, int32_t transMode = 0){}; + + template __aicore__ inline bool Iterate(bool enPartialSum = false) + { + return false; + }; + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; + template + __aicore__ inline void IterateAll(const LocalTensor& cMatrix, uint8_t enAtomic = 0){}; + __aicore__ 
inline void WaitIterateAll() {}; + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false){}; + template + __aicore__ inline void GetTensorC(const GlobalTensor &c, const LocalTensor &cLocal, + uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + GlobalTensor global; + return global; + }; + template + __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + __aicore__ inline void WaitIterateBatch() {}; + __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; + __aicore__ inline void WaitGetTensorC(){}; + template + __aicore__ inline MatrixOffset GetOffsetC() + { + if constexpr (isTurnOnDebug) { + static_assert(!isTurnOnDebug, "unsupported!"); + } + } +}; + +template +__aicore__ inline void SetMatrixKfcSkip(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, + T& mm, Args&&... b) +{ + SetMatrixKfc(pipe, kfcClient, instID, workspace, b...); +} + +template +__aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, + T& mm, Args&&... 
b) +{ + ASSERT((pipe != nullptr) && "pipe should not be nullptr."); + ASSERT((kfcClient != nullptr) && "kfcClient should not be nullptr."); + ASSERT((workspace != nullptr) && "workspace should not be nullptr."); + + if constexpr (sizeof...(b) == 0) { + InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); + } else if constexpr (KfcServer::isTiling()) { + auto tiling = KfcServer::GetTiling(b...); + InitKfcClient(mm, tiling, pipe, kfcClient, instID, workspace); + if constexpr (sizeof...(b) > 1) { + SetMatrixKfcSkip(pipe, kfcClient, instID + 1, workspace, b...); + } + } else { + InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); + if constexpr (sizeof...(b) >= 1) { + SetMatrixKfc(pipe, kfcClient, instID + 1, workspace, b...); + } + } +} +}; // namespace AscendC + +#endif \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index fce0ccda66827b5902be9c216cff095f593a135c..9377f7b7e336ac3a85f071371fe1f2bc0e5e4017 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -21,7 +21,7 @@ #include "../../impl//matmul/matmul_macro_v220_basic_impl.h" #include "../../impl//matmul/matmul_macro_v200_impl.h" #include "../../impl/matmul/matmul_utils.h" -#include "lib/matmul/matmul_call_back.h" +#include "../../impl/matmul/matmul_call_back.h" namespace matmul { using namespace AscendC; @@ -99,8 +99,6 @@ struct MatmulParamsNorm : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -219,8 +217,6 @@ struct MatmulParamsMDL : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -359,8 +355,6 @@ struct MatmulParamsIBShareNorm : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -554,43 +548,6 @@ struct MatmulMacroImpl; }; -template -struct IntraBlockBase { - __aicore__ inline IntraBlockBase() {}; -}; - -template -struct IntraBlock { - using SrcT = typename A_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - __aicore__ inline IntraBlock(){}; - __gm__ SrcT* aGlobal; - __gm__ SrcT* bGlobal; - __gm__ BiasT* biasGlobal; - int M; - int N; - int Ka; - int Kb; - int Kc; - int singleCoreM; - int singleCoreN; - int singleCoreK; - int mIter; - int nIter; - int kIter; - int baseUseM; - int baseUseN; - // measured in cube block - int blockUseM; - int blockUseN; - int tailM, tailK, tailN; - int cacheProcA = 0; - bool enableBias = false; - bool isTransposeA; - bool isTransposeB; - bool fakeMsg = false; -}; - template > class MatmulImpl { @@ -946,8 +903,41 @@ private: int Kc_; int32_t batchA_ = 1, batchB_ = 1; int32_t batchOuter_ = 1; - using INTRABLOCK = - typename Conditional, IntraBlockBase>::type; + + struct IntraBlockBase { + __aicore__ inline IntraBlockBase() {}; + }; + + struct IntraBlock { + __aicore__ inline IntraBlock(){}; + __gm__ SrcT* aGlobal; + __gm__ SrcT* bGlobal; + __gm__ BiasT* biasGlobal; + int M; + int N; + int Ka; + int Kb; + int Kc; + int singleCoreM; + int singleCoreN; + int singleCoreK; + int mIter; + int nIter; + int kIter; + int baseUseM; + int baseUseN; + // measured in cube block + int blockUseM; + int blockUseN; + int tailM, tailK, tailN; + int cacheProcA = 0; + bool enableBias = false; + bool isTransposeA; + bool isTransposeB; + bool fakeMsg = false; + }; + + using INTRABLOCK = typename Conditional::type; INTRABLOCK intraBlockMatmul; }; diff --git a/lib/matmul/matmul_client.h 
b/lib/matmul/matmul_client.h index 869ceadfc976461f84f46addc81efccf70b031b5..000f897b4560367c4acd9cef9498321822d5a2ed 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -16,21 +16,16 @@ #define LIB_MATMUL_MATMUL_CLIENT_H #include "lib/matmul/tiling.h" -#include "lib/matmul/matmul_call_back.h" +#include "../../impl/matmul/matmul_call_back.h" #include "../../impl/matmul/matmul_utils.h" #include "kernel_operator.h" #if ASCENDC_CPU_DEBUG -#include "lib/matmul/matmul_server.h" +#include "../../impl/matmul/matmul_server.h" #endif namespace matmul { using namespace AscendC; #if ASCENDC_CPU_DEBUG -template -constexpr bool IsSharedMatmul() -{ - return !MM_CFG.enableInit; -} template > struct MatmulInstBase { @@ -764,7 +759,13 @@ public: #if ASCENDC_CPU_DEBUG public: // this is useless code just for cpu debug - typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; + typename MatmulInstAux::MATMUL mm; #endif @@ -943,4 +944,4 @@ private: } }; } // namespace matmul -#endif +#endif \ No newline at end of file diff --git a/lib/matmul/matmul_intf.h b/lib/matmul/matmul_intf.h index 6fccd093d4ccd0ca32833cb14cf1cb1ca29776c0..cc1bbc38a2b91c985f1e63a949d7e1e14789cff7 100644 --- a/lib/matmul/matmul_intf.h +++ b/lib/matmul/matmul_intf.h @@ -15,7 +15,7 @@ #ifndef LIB_MATMUL_MATMUL_INTF_H #define LIB_MATMUL_MATMUL_INTF_H #if __CCE_AICORE__ == 220 -#include "lib/matmul/kernel_kfc.h" +#include "../impl/matmul/kernel_kfc.h" #else #include "lib/matmul/matmul.h" #endif @@ -124,27 +124,6 @@ using Matmul = matmul::MatmulImpl((uint64_t)-1, (uint64_t)-1); - SetMaskNorm(); - } -#endif - -#ifdef __DAV_C220_CUBE__ - ClearWorkspaceImpl(workspace); - uint16_t eventID = 3; - NotifyEvent(eventID); -#endif -} - #ifdef __DAV_C220_CUBE__ #ifdef ASCENDC_CUBE_ONLY template __aicore__ static T* GetCurTiling(T* t, Args&&... 
b) diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index 4cdef51fed7b9f28cbb39df0d959ca5f16b180f6..b03687525aae27d1d87ad531f2b5d132bd93c6ee 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -317,4 +317,4 @@ protected: }; } // namespace matmul_tiling -#endif // LIB_MATMUL_MATMUL_TILING_BASE_H +#endif // LIB_MATMUL_MATMUL_TILING_BASE_H \ No newline at end of file diff --git a/lib/matmul/matmul_tilingdata.h b/lib/matmul/matmul_tilingdata.h index e1886971ec3a460f09cd5db7e1cc77eeb2d09128..a2b3dca04db59acffbc8ed9a47f6f185b6b3fb99 100644 --- a/lib/matmul/matmul_tilingdata.h +++ b/lib/matmul/matmul_tilingdata.h @@ -71,4 +71,4 @@ TILING_DATA_FIELD_DEF(int32_t, BatchNum); TILING_DATA_FIELD_DEF(int32_t, reserved); END_TILING_DATA_DEF; } -#endif // LIB_MATMUL_MATMUL_TILINGDATA_H +#endif // LIB_MATMUL_MATMUL_TILINGDATA_H \ No newline at end of file diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 38ad988e411df5357235bb0789b730dc63d148b3..609f1753793726a0edbc84467685abecf121f9ba 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -55,10 +55,10 @@ enum class IterateOrder { ORDER_N, UNDEF, }; - -enum class ScheduleMode { - NONE = 0, - L0_MN_DB = 1, // NORM template, L0 m/n db + +enum class ScheduleType { + INNER_PRODUCT = 0, // k loop, default type + OUTER_PRODUCT, // m/n loop, depends on IterateOrder }; enum class MatmulVersion { @@ -124,14 +124,14 @@ struct MatmulConfig { bool intraBlockPartSum = false; // MDL support M/N db IterateOrder iterateOrder; - ScheduleMode scheduleMode; + ScheduleType scheduleType; bool enableDoubleCache; }; __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, - const ScheduleMode scheduleMode = ScheduleMode::NONE, const bool enUnitFlag = true) + const ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, const bool enUnitFlag = true) { return { .doNorm = true, @@ -172,7 +172,7 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = iterateOrder, - .scheduleMode = scheduleMode, + .scheduleType = scheduleType, .enableDoubleCache = false }; } @@ -221,7 +221,7 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals .enableL1CacheUB = enableL1CacheUB, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -269,7 +269,7 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -317,7 +317,7 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -366,7 +366,7 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - 
.scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -414,7 +414,7 @@ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimi .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = isDoubleCache }; } @@ -431,4 +431,4 @@ struct MatrixOffset { }; extern int blockidx_; -#endif // LIB_MATMUL_TILING_H +#endif // LIB_MATMUL_TILING_H \ No newline at end of file