diff --git a/impl/matmul/math_util.cpp b/impl/matmul/math_util.cpp deleted file mode 100644 index 0d81640fd32f9685568034c163e486239574b738..0000000000000000000000000000000000000000 --- a/impl/matmul/math_util.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file math_util.cpp - * \brief - */ - -#include "math_util.h" -#include -#include -#include -namespace matmul_tiling { -constexpr static int32_t SEED_MAP_MIN = 16; -constexpr static int32_t SEED_MAP_MAX = 1024; -constexpr static int32_t FACTOR_NUM_LIMIT = 4; -constexpr static int32_t L0_FACTOR_NUM_LIMIT = 2; -constexpr static int32_t L1_FACTOR_NUM_LIMIT = 4; -constexpr static int32_t MIN_FACTOR_LIMIT = 8; -constexpr static int32_t L0_FACTOR_LIMIT = 64; -constexpr static int32_t L1_FACTOR_LIMIT = 128; - -bool MathUtil::IsEqual(float leftValue, float rightValue) -{ - return std::fabs(leftValue - rightValue) <= std::numeric_limits::epsilon(); -} - -int32_t MathUtil::CeilDivision(int32_t num1, int32_t num2) -{ - if (num2 == 0) { - return 0; - } - return (num1 + num2 - 1) / num2; -} - -int32_t MathUtil::Align(int32_t num1, int32_t num2) -{ - return CeilDivision(num1, num2) * num2; -} - -int32_t MathUtil::AlignDown(int32_t num1, int32_t num2) -{ - if (num2 == 0) { - return 0; - } - return (num1 / num2) * num2; -} - -bool MathUtil::CheckMulOverflow(int32_t a, int32_t b, int32_t &c) -{ - if (a > 0 && b > 0) { - if (a > (INT32_MAX / b)) { - return false; - } - } else { - return false; - } - c = a * b; - return true; -} - -int32_t MathUtil::MapShape(int32_t shape, bool roundUpFlag) -{ - // map numbers between 32 to 1024 to number of power of 2, and map numbers greater than 1024 to 1024. 
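// Illustrative sketch (helper name is hypothetical; assumes the same SEED_MAP_MIN/SEED_MAP_MAX
// constants of 16 and 1024): MapShape above rounds a shape to a nearby power of two in that range,
// with roundUpFlag selecting the next power of two instead of the previous one. A minimal
// standalone approximation of that behaviour:
static inline int32_t MapShapeSketch(int32_t shape, bool roundUp)
{
    if (shape < 16) {
        return shape;                      // small shapes pass through unchanged
    }
    int32_t seed = 16;
    while (seed < 1024 && (seed << 1) < shape) {
        seed <<= 1;                        // largest power of two strictly below shape
    }
    return roundUp ? (seed << 1) : seed;   // e.g. MapShapeSketch(100, true) == 128
}
// MapShapeSketch(2000, false) returns 1024, i.e. oversized shapes saturate at SEED_MAP_MAX.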
- uint32_t seed = static_cast(SEED_MAP_MIN); - if (shape < static_cast(seed)) { - return shape; - } - while (static_cast(seed) < SEED_MAP_MAX) { - if (static_cast(seed) < shape && static_cast(seed << 1U) >= shape) { - break; - } - seed = seed << 1U; - } - if (roundUpFlag) { - seed = seed << 1U; - } - return static_cast(seed); -} - -void MathUtil::GetFactors(std::vector &factorList, int32_t srcNum, int32_t minFactor, int32_t maxFactor) -{ - for (int32_t factor = maxFactor; factor >= minFactor; factor--) { - if (srcNum % factor == 0) { - factorList.push_back(factor); - } - } -} - -void MathUtil::GetFactors(std::vector &factorList, int32_t srcNum, int32_t maxFactor) -{ - int32_t maxNum = std::min(srcNum, maxFactor); - for (int32_t factor = 1; factor <= maxNum; factor++) { - if (srcNum % factor == 0) { - factorList.push_back(factor); - } - } -} - -void MathUtil::GetFactorCnt(const int32_t shape, int32_t &factorCnt, const int32_t factorStart, const int32_t factorEnd) -{ - for (int32_t i = factorStart; i <= factorEnd; i++) { - if (shape < i) { - return; - } - if (shape % i == 0) { - ++factorCnt; - } - } -} - -void MathUtil::GetFactorLayerCnt(const int32_t shape, int32_t &factorCnt, const int32_t factorStart, - const int32_t factorEnd) -{ - std::vector factorList; - MathUtil::GetFactors(factorList, shape, factorStart, factorEnd); - for (const auto factor : factorList) { - int32_t fcnt = 0; - GetFactorCnt(factor, fcnt, 1, factor + 1); - factorCnt = fcnt >= factorCnt ? fcnt : factorCnt; - } -} - -void MathUtil::AddFactor(std::vector &dimsFactors, int32_t dim) -{ - dimsFactors.push_back(dim); - sort(dimsFactors.begin(), dimsFactors.end()); - (void)dimsFactors.erase(unique(dimsFactors.begin(), dimsFactors.end()), dimsFactors.cend()); -} - -int32_t MathUtil::GetNonFactorMap(std::vector &factorList, int32_t srcNum, int32_t maxFactor) -{ - int32_t factorCnt = 0; - int32_t mapFactor = srcNum; - MathUtil::GetFactorLayerCnt(srcNum, factorCnt, 1, maxFactor); - if (srcNum > 1 && factorCnt <= FACTOR_NUM_LIMIT) { - mapFactor = MathUtil::MapShape(srcNum, true); - } - GetFactors(factorList, mapFactor, maxFactor); - return mapFactor; -} - -void MathUtil::GetBlockFactors(std::vector &factorList, const int32_t oriShape, const int32_t mpShape, - const int32_t coreNum, const int32_t maxNum) -{ - // get all factors of ori_shape/mapshape/coreNum which smaller or equal to maxNum - for (int32_t i = 1; i <= maxNum; ++i) { - if ((oriShape % i == 0) || (mpShape % i == 0) || (coreNum % i == 0)) { - factorList.push_back(i); - } - } -} - -bool MathUtil::CheckFactorNumSatisfy(const int32_t dim) -{ - if (dim <= MIN_FACTOR_LIMIT) { - return true; - } - int32_t factorL0Cnt = 0; - int32_t factorL1Cnt = 0; - MathUtil::GetFactorLayerCnt(dim, factorL0Cnt, 1, L0_FACTOR_LIMIT); - if (dim > L1_FACTOR_LIMIT) { - MathUtil::GetFactorLayerCnt(dim, factorL1Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT); - } - bool factorNumNotSatisfied = (factorL0Cnt <= L0_FACTOR_NUM_LIMIT) || - ((dim > L1_FACTOR_LIMIT) && (factorL0Cnt + factorL1Cnt <= L1_FACTOR_NUM_LIMIT)); - return !factorNumNotSatisfied; -} - -int32_t MathUtil::FindBestSingleCore(const int32_t oriShape, const int32_t mappedShape, const int32_t coreNum, - bool isKDim) -{ - int32_t bestSingleCore = oriShape; - int32_t realSingleCore = MathUtil::CeilDivision(oriShape, coreNum); - int32_t mappedSingleCore = MathUtil::CeilDivision(mappedShape, coreNum); - - if (isKDim) { - int32_t bestShape = (oriShape % coreNum == 0) ? 
oriShape : mappedShape; - bestSingleCore = MathUtil::CeilDivision(bestShape, coreNum); - return bestSingleCore; - } - - if (coreNum == 1 && CheckFactorNumSatisfy(bestSingleCore)) { - return bestSingleCore; - } - - bestSingleCore = realSingleCore; - while (bestSingleCore != mappedSingleCore) { - if (CheckFactorNumSatisfy(bestSingleCore)) { - return bestSingleCore; - } - if (bestSingleCore < mappedSingleCore) { - ++bestSingleCore; - } else { - --bestSingleCore; - } - } - return bestSingleCore; -} -} // namespace matmul_tiling diff --git a/lib/matmul/matmul_call_back.h b/impl/matmul/matmul_call_back.h similarity index 98% rename from lib/matmul/matmul_call_back.h rename to impl/matmul/matmul_call_back.h index e2b7217dede5288193ebe4ad02ffe3ff2f4d0498..074cb3b7532a61949d4106a5cfc516a1b0aa7a0d 100644 --- a/lib/matmul/matmul_call_back.h +++ b/impl/matmul/matmul_call_back.h @@ -1,36 +1,36 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file matmul_call_back.h - * \brief - */ -#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H -#define LIB_MATMUL_MATMUL_CALL_BACK_H - -namespace matmul { -using namespace AscendC; -template &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> -struct MatmulCallBackFunc { - constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; - constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, - int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; - constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, - int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; -}; - -} // namespace matmul -#endif +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_call_back.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H +#define LIB_MATMUL_MATMUL_CALL_BACK_H + +namespace matmul { +using namespace AscendC; +template &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> +struct MatmulCallBackFunc { + constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; + constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, + int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; + constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, + int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; +}; + +} // namespace matmul +#endif \ No newline at end of file diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index bad9046a77c0ecff8247c5e7c97ebccb7babc863..06c975a9bd1210d329e7ad91814d69d6dce9b863 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -697,9 +697,9 @@ __aicore__ inline void MatmulImplsingleCoreK <= var.tiling_->baseK, { KERNEL_LOG(KERNEL_ERROR, - "When singleCoreK is larger than baseK, the parameter scheduleMode of MM_CFG should not be L0_MN_DB");}); + "When singleCoreK is larger than baseK, the parameter scheduleType of MM_CFG should not be OUTER_PRODUCT");}); } #endif var.tpipe_ = tpipe; @@ -836,7 +836,7 @@ __aicore__ inline void MatmulImpl= 220 - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { if constexpr (EnUnitFlag(MM_CFG)) { var.tpipe_->InitBuffer(var.CO1_, 2 * var.baseMN_ * sizeof(L0cT)); } else { @@ -884,10 +884,6 @@ __aicore__ inline void MatmulImplInitBuffer(var.qidCO2_, 1, var.baseMN_ * sizeof(DstT)); } - if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND || - !PhyPosIsUB(C_TYPE::pos)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->transLength); - } #endif InitShareBufEnd(var.tpipe_); @@ -1018,9 +1014,7 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - if constexpr (!MM_CFG.enVecND2NZ && (!PhyPosIsUB(C_TYPE::pos) || C_TYPE::format == CubeFormat::NZ)) { - shareUbSize = 0; - } + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1125,29 +1119,16 @@ __aicore__ inline void MatmulImpl= 220 if (var.tiling_->isBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } -#else - if constexpr (!MM_CFG.enVecND2NZ) { - if (var.tiling_->isBias) { - var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); - } - } #endif if constexpr ((IsSameType::value && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); } -#if __CCE_AICORE__ < 220 - // need extra ub when SetQuantTensor, may not use - if constexpr (C_TYPE::format 
== CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->baseN * sizeof(uint64_t)); - } -#endif #if (__CCE_AICORE__ < 200) var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_); var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_); @@ -1268,15 +1249,9 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); + uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - if constexpr (!MM_CFG.enVecND2NZ && (!PhyPosIsUB(C_TYPE::pos) || C_TYPE::format == CubeFormat::NZ)) { - shareUbSize = 0; - if constexpr (C_TYPE::format == CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - shareUbSize = var.tiling_->baseN * sizeof(uint64_t); - } - } + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1364,16 +1339,10 @@ uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); } #endif -#if __CCE_AICORE__ == 220 +#if __CCE_AICORE__ >= 220 if (var.tiling_->isBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } -#else - if constexpr (!MM_CFG.enVecND2NZ) { - if (var.tiling_->isBias) { - var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); - } - } #endif #if __CCE_AICORE__ == 220 @@ -1391,13 +1360,6 @@ uint32_t shareUbSize = static_cast(var.tiling_->shareUbSize); var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); } #endif -#if __CCE_AICORE__ < 220 - // need extra ub when SetQuantTensor, may not use - if constexpr (C_TYPE::format == CubeFormat::NZ && - (IsSameType::value || IsSameType::value)) { - var.tpipe_->InitBuffer(var.calcBuf_, var.tiling_->baseN * sizeof(uint64_t)); - } -#endif #if (__CCE_AICORE__ < 200) var.tpipe_->InitBuffer(var.qidA2_, 1, L0ASize_); var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_); @@ -2222,6 +2184,11 @@ __aicore__ inline void MatmulImpl(); LocalTensor co2Local; + if constexpr (C_TYPE::format == CubeFormat::NZ) { + event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); + SetFlag(eventIDMte3ToV); + WaitFlag(eventIDMte3ToV); + } if constexpr (MM_CFG.enVecND2NZ) { if constexpr (!MM_CFG.enableUBReuse) { co2Local = var.localWorkspace[var.tiling_->transLength * 2].template ReinterpretCast(); @@ -4218,7 +4185,7 @@ __aicore__ inline void MatmulImpl(); } - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { if (var.sMadNStep_ > var.tiling_->baseN) { // Means L0 N db, need to excute twice FixpipeOutToGm FixpipeOutToGm(gm, co1Local, var.curM_, var.curN_, enAtomic, enSequentialWrite); var.baseUseN_ = (var.curN_ + 2 == var.nIter_) ? 
var.tailN_ : var.tiling_->baseN; // update next var.curN_ baseUseN_ @@ -4482,9 +4449,15 @@ __aicore__ inline void MatmulImpl& biasGlobal, LocalTensor& cMatrix, int col) { auto bias = var.qidBias_.template AllocTensor(); - auto blockLen = Ceil(var.baseUseN_ * sizeof(BiasT), ONE_BLK_SIZE); - DataCopy(bias, biasGlobal[col * var.tiling_->baseN], { (uint16_t)1, - (uint16_t)blockLen, (uint16_t)0, (uint16_t)0 }); + // if var.baseUseN_ is not 32B align, use DataCopy Nd2Nz + if ((var.baseUseN_ * sizeof(BiasT)) % ONE_BLK_SIZE != 0) { + Nd2NzParams intriParams{ 1, 1, (uint16_t)var.baseUseN_, 0, (uint16_t)var.baseUseN_, 1, 1, 1 }; + DataCopy(bias, biasGlobal[col * var.tiling_->baseN], intriParams); + } else { + auto blockLen = Ceil(var.baseUseN_ * sizeof(BiasT), ONE_BLK_SIZE); + DataCopy(bias, biasGlobal[col * var.tiling_->baseN], + { (uint16_t)1, (uint16_t)blockLen, (uint16_t)0, (uint16_t)0 }); + } // delete after tpipe supports bias queue var.qidBias_.EnQue(bias); } @@ -4511,7 +4484,7 @@ __aicore__ inline void MatmulImpl(); #endif - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { co1Local.SetSize(var.blockUseM_ * var.blockUseN_ * CUBE_MAX_SIZE * 2); } else { if constexpr (DoMatmulSpecialMDL(MM_CFG)) { @@ -5940,18 +5913,18 @@ __aicore__ inline void MatmulImplbaseN]; matmulInstr_.biasType_ = IsSameType::value ? 2 : 1; // 2:f32, 1:f16 matmulInstr_.sL1BiasOffset_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); if constexpr (A_TYPE::layout == LayoutMode::NONE || MM_CFG.batchMode == BatchMode::SINGLE_LARGE_THAN_L1) { var.qidBias_.FreeTensor(bias); } } else { matmulInstr_.biasType_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); } } else { matmulInstr_.biasType_ = 0; - matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); + matmulInstr_.template Compute(a1, b1, var.cMatrix_, bias); } } @@ -6789,7 +6762,7 @@ template __aicore__ inline bool MatmulImpl::IterateNorm(bool enPartialSum) { - if constexpr (MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB) { + if constexpr (MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT) { return IterateNormL0DB(enPartialSum); } if (unlikely(var.isFirstIter_)) { @@ -7150,8 +7123,8 @@ template __aicore__ inline bool MatmulImpl::IterateNormL0DB(bool enPartialSum) { - ASCENDC_ASSERT((MM_CFG.scheduleMode == ScheduleMode::L0_MN_DB), { - KERNEL_LOG(KERNEL_ERROR, "IterateNormL0DB only support scheduleMode == L0_MN_DB"); + ASCENDC_ASSERT((MM_CFG.scheduleType == ScheduleType::OUTER_PRODUCT), { + KERNEL_LOG(KERNEL_ERROR, "IterateNormL0DB only support scheduleType == OUTER_PRODUCT"); }); if (unlikely(var.isFirstIter_)) { var.isFirstIter_ = false; @@ -8389,7 +8362,8 @@ __aicore__ inline void MatmulImpl quantLocalTensor; if constexpr (C_TYPE::format == CubeFormat::NZ) { - quantLocalTensor = var.calcBuf_.template Get(); + quantLocalTensor = + var.localWorkspace[var.tiling_->transLength].template ReinterpretCast(); } else if constexpr (MM_CFG.enVecND2NZ) { if constexpr (!MM_CFG.enableUBReuse) { quantLocalTensor = @@ -8420,7 +8394,8 @@ __aicore__ inline void MatmulImpl quantLocalTensor; if constexpr (C_TYPE::format == CubeFormat::NZ) { - quantLocalTensor = var.calcBuf_.template Get(); + quantLocalTensor = + var.localWorkspace[var.tiling_->transLength].template ReinterpretCast(); } else if constexpr (MM_CFG.enVecND2NZ) { if constexpr 
(!MM_CFG.enableUBReuse) { quantLocalTensor = @@ -8840,10 +8815,7 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventID); - WaitFlag(eventID); - eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); SetFlag(eventID); WaitFlag(eventID); } @@ -8864,19 +8836,15 @@ __aicore__ inline void MatmulImpl(); DataCopy(gmC[dstOffset + i * offset + var.baseUseN_], trans, { 1, 1, 0, 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventID); - WaitFlag(eventID); - } if (i < var.baseUseM_ - 1) { SetFlag(eventIDMte3ToMte2); } } } } - + event_t eventID = static_cast(GetTPipePtr()->AllocEventID()); + SetFlag(eventID); + WaitFlag(eventID); // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); } @@ -8901,15 +8869,8 @@ __aicore__ inline void MatmulImpl::value || IsSameType::value) { - if (var.baseUseN_ % 2 > 0) { - isOdd = true; - } - } - bool needDataCopyPad = !isTragetAligned && (M_ > var.singleCoreM_ || N_ > var.singleCoreN_ || isOdd); int gmOffset = blockCount * (blocklen - 2); - if (needDataCopyPad && blocklen == 1) { + if (!isTragetAligned && blocklen == 1) { auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); SetFlag(eventIDVToS); WaitFlag(eventIDVToS); @@ -8936,7 +8897,7 @@ __aicore__ inline void MatmulImpl 1) { + } else if (!isTragetAligned && blocklen > 1) { if constexpr (IsSameType::value || IsSameType::value) { LocalTensor transAligin; if constexpr (!MM_CFG.enableUBReuse) { @@ -9196,19 +9157,46 @@ __aicore__ inline void MatmulImplbaseN * M_ + var.curM_ * var.tiling_->baseM * BLOCK_CUBE; - int blockLen = var.blockUseM_ * BLOCK_CUBE * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - int dstStride = (M_ - var.baseUseM_) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - if (dstStride >= UINT16_MAX) { - int srcOffset = var.baseUseM_ * BLOCK_CUBE; - for (int i = 0; i < var.blockUseN_; ++i) { - DataCopy(gmC[dstOffset + i * M_ * BLOCK_CUBE], src[i * srcOffset], - { 1, static_cast(blockLen), 0, 0 }); - } + int64_t alignM; + int alignBaseUseM; + if constexpr (C_TYPE::format == CubeFormat::NZ){ + alignM = Ceil(M_, BLOCK_CUBE) * BLOCK_CUBE; + alignBaseUseM = Ceil(var.baseUseM_, BLOCK_CUBE) * BLOCK_CUBE; + } else { + alignM = M_; + alignBaseUseM = var.baseUseM_; + } + if constexpr (IsSameType::value || IsSameType::value) { + int64_t dstOffset = var.curN_ * var.tiling_->baseN * alignM + var.curM_ * var.tiling_->baseM * ONE_BLK_SIZE; + int blockLen = var.blockUseM_ * BLOCK_CUBE * sizeof(DstT); + int64_t dstStride = (alignM - alignBaseUseM) * sizeof(DstT); + int blockCount = Ceil(var.blockUseN_, 2); + if (dstStride >= UINT16_MAX) { + int srcOffset = var.baseUseM_ * ONE_BLK_SIZE; + for (int i = 0; i < blockCount; ++i) { + DataCopy(gmC[dstOffset + i * alignM * ONE_BLK_SIZE], src[i * srcOffset], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(gmC[dstOffset], src, + { static_cast(blockCount), static_cast(blockLen), 0, + static_cast(dstStride) }); + } } else { - DataCopy(gmC[dstOffset], src, - { static_cast(var.blockUseN_), static_cast(blockLen), 0, - static_cast(dstStride) }); + int64_t dstOffset = var.curN_ * var.tiling_->baseN * alignM + var.curM_ * var.tiling_->baseM * BLOCK_CUBE; + int blockLen = var.blockUseM_ * BLOCK_CUBE * BLOCK_CUBE * 
sizeof(DstT) / ONE_BLK_SIZE; + int64_t dstStride = (alignM - alignBaseUseM) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; + if (dstStride >= UINT16_MAX) { + int srcOffset = var.baseUseM_ * BLOCK_CUBE; + for (int i = 0; i < var.blockUseN_; ++i) { + DataCopy(gmC[dstOffset + i * alignM * BLOCK_CUBE], src[i * srcOffset], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(gmC[dstOffset], src, + { static_cast(var.blockUseN_), static_cast(blockLen), 0, + static_cast(dstStride) }); + } } } } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { diff --git a/impl/matmul/matmul_macro_v220_impl.h b/impl/matmul/matmul_macro_v220_impl.h index 9615936c389067d2ed26e3418203908b6b03be7d..1daafedd2a3659b551998d1cae6dd568ff3b29a7 100644 --- a/impl/matmul/matmul_macro_v220_impl.h +++ b/impl/matmul/matmul_macro_v220_impl.h @@ -106,7 +106,7 @@ public: inline __aicore__ void Init(); inline __aicore__ void Release(); template + ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, IterateOrder iterateOrder = IterateOrder::UNDEF> inline __aicore__ void Compute(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb = 0, uint8_t subIdx = 0); @@ -416,10 +416,6 @@ inline __aicore__ void MacroMatmul::value) { - l0bSrcAddrStride = l0bSrcAddrStride / 2; - l0bDstAddrStride = l0bDstAddrStride / 2; - } uint64_t l0bOffset = 0; for (uint64_t i = 0; i < l0bLoop; i++) { #if __CCE_AICORE__ >= 220 @@ -565,7 +561,7 @@ inline __aicore__ void MacroMatmul -template +template inline __aicore__ void MacroMatmul::Compute( const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb, uint8_t subIdx) @@ -588,12 +584,12 @@ inline __aicore__ void MacroMatmul(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail); return; } // n db - if constexpr (scheduleMode == ScheduleMode::L0_MN_DB && iterateOrder == IterateOrder::ORDER_M) { + if constexpr (scheduleType == ScheduleType::OUTER_PRODUCT && iterateOrder == IterateOrder::ORDER_M) { ComputeWithNdb(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail); return; } diff --git a/lib/matmul/matmul_server.h b/impl/matmul/matmul_sever.h similarity index 96% rename from lib/matmul/matmul_server.h rename to impl/matmul/matmul_sever.h index 5a671203562849324542ec453ec9cac85b7fd005..caf8a3def2834cb5dea8f5a4aa63e8cfb54c3488 100644 --- a/lib/matmul/matmul_server.h +++ b/impl/matmul/matmul_sever.h @@ -1,838 +1,859 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file matmul_server.h - * \brief - */ -#ifndef LIB_MATMUL_MATMUL_SERVER_H -#define LIB_MATMUL_MATMUL_SERVER_H - -#include "lib/matmul/matmul.h" -#include "kernel_operator.h" - -namespace matmul { -using namespace AscendC; -template struct IBShareCache { - __aicore__ inline IBShareCache() {}; -}; - -template <> -struct IBShareCache { - __aicore__ inline IBShareCache() {}; - using ShareCache = uint16_t; -}; - -template <> -struct IBShareCache { - __aicore__ inline IBShareCache() {}; - using ShareCache = GlobalCache; -}; -template __aicore__ constexpr bool IsIBShare() -{ - if (A_TYPE::ibShare == true) { - return true; - } - if (B_TYPE::ibShare == true) { - return true; - } - return false; -} - -struct MatmulMsg { - uint32_t setOrgShape : 1; - uint32_t orgM; - uint32_t orgN; - uint32_t orgKa; - uint32_t orgKb; - uint32_t orgKc; -}; - -struct ShareMatmulBase { - __aicore__ inline ShareMatmulBase() {}; -}; - -struct ShareMatmul : ShareMatmulBase { - __aicore__ inline ShareMatmul(){}; - MatmulMsg msg0; - MatmulMsg msg1; -}; - -template -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; -}; - -template <> -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; - using MSG = ShareMatmulBase; -}; - -template <> -struct ShareMatmulAux { - __aicore__ inline ShareMatmulAux(){}; - using MSG = ShareMatmul; -}; - -template > -class MatmulService { - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - -public: - __aicore__ inline MatmulService() {} - __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) - { - ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); - ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); - ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); - ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); - this->instID = instID; - this->kfcCommSrv = kfc; - this->tpipe = tpipe; - this->workspace = workspace; - mul.SetSubBlockIdx(kfcCommSrv->subBlockID); - if constexpr (!MM_CFG.enableInit) { - msgAux.msg0.setOrgShape = false; - msgAux.msg1.setOrgShape = false; - } - this->devEvtID = instID; - if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { - if (kfcCommSrv->subBlockID == 0) { - if (tiling) { - tiling_ = (TCubeTiling *)tiling; - gCache.template Init(tiling_, tpipe); - } - } - } - if (tiling) { - tiling_ = (TCubeTiling *)tiling; - mul.Init(tiling_, tpipe); - } - } - - __aicore__ inline void Init(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - return; - } else { - ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); - ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); - auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); - tiling_ = &tmpTiling_; - auto temp2 = (uint32_t*)tiling_; - - constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; - GlobalTensor tilingGlobal; - for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { - Barrier(); - tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); - DataCacheCleanAndInvalid(tilingGlobal); - } - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.Init(this->tiling_, this->tpipe); - } - } - - __aicore__ inline void 
SetSubBlockIdx(uint8_t idx) - { - mul.SetSubBlockIdx(idx); - } - - __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - if (mul.GetSubBlockIdx() == 0) { - msgAux.msg0.orgM = msg->orgShape.orgM; - msgAux.msg0.orgN = msg->orgShape.orgN; - msgAux.msg0.orgKa = msg->orgShape.orgKa; - msgAux.msg0.orgKb = msg->orgShape.orgKb; - msgAux.msg0.orgKc = msg->orgShape.orgKc; - msgAux.msg0.setOrgShape = true; - } else { - msgAux.msg1.orgM = msg->orgShape.orgM; - msgAux.msg1.orgN = msg->orgShape.orgN; - msgAux.msg1.orgKa = msg->orgShape.orgKa; - msgAux.msg1.orgKb = msg->orgShape.orgKb; - msgAux.msg1.orgKc = msg->orgShape.orgKc; - msgAux.msg1.setOrgShape = true; - } - } else { - mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, - msg->orgShape.orgKc); - } - } - - __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) - { - if (msg->body.setTail) { - mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); - } - } - - __aicore__ inline void SetTail(__gm__ KfcMsg* msg) - { - if (msg->body.setTail) { - mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); - } - } - - __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) - { - mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); - } - - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) - { - if (!msg->body.setTensorA) - return; - if constexpr (A_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorA(scalar); - return; - } - const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); - if constexpr (PhyPosIsL1(A_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); - } else { - GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr), size); - mul.SetTensorA(aGlobal, msg->body.isTransA); - } - } - - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) - { - if (!msg->body.setTensorA) { - return; - } - if constexpr (A_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorA(scalar); - return; - } - if constexpr (PhyPosIsL1(A_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); - } else { - GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr + offset), size); - mul.SetTensorA(aGlobal, msg->body.isTransA); - } - } - - __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) - { - if (!msg->body.setQuant) { - return; - } - int quantMode = msg->body.quantMode; - if (quantMode == 1) { - uint64_t quantScalar = msg->body.quantScalar; - mul.SetQuantScalar(quantScalar); - } else if (quantMode == 2) { - const uint64_t size = static_cast(msg->body.quantSize); - GlobalTensor quantGlobal; - quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); - mul.SetQuantVector(quantGlobal); - } - } - - __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout == 
LayoutMode::NONE) { - return; - } - if (!msg->body.setBatch) { - return; - } - mul.SetBatchNum(msg->body.batchA, msg->body.batchB); - } - - __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) - { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - mul.SetSelfDefineData(msg->body.dataPtr); - if constexpr (!MM_CFG.enableReuse) { - GlobalTensor dataGlobal; - dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); - DataCacheCleanAndInvalid(dataGlobal); - } - } - - __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) - { - mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); - } - - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) - { - if (!msg->body.setTensorB) - return; - if constexpr (B_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorB(scalar); - return; - } - const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); - } else { - GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); - } - } - - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) - { - if (!msg->body.setTensorB) { - return; - } - if constexpr (B_TYPE::format == CubeFormat::SCALAR) { - SrcT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); - auto temp2 = (uint8_t*)&scalar; - - for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { - *temp2 = *temp1; - } - mul.SetTensorB(scalar); - return; - } - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); - } else { - GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr + offset), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); - } - } - - __aicore__ inline void SetBias(__gm__ KfcMsg* msg) - { - if (msg->body.setTensorBias) { - const uint64_t size = (uint64_t)tiling_->singleCoreN; - if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); - mul.SetBias(scmLocal); - } else { - GlobalTensor biasGlobal; - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size); - mul.SetBias(biasGlobal); - } - } else if (msg->body.setClearBias) { - mul.ClearBias(); - } - } - - __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) - { - if (msg->body.setTensorBias) { - const uint64_t size = (uint64_t)tiling_->singleCoreN; - if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); - mul.SetBias(scmLocal); - } else { - GlobalTensor biasGlobal; - biasGlobal.SetGlobalBuffer( - reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); - mul.SetBias(biasGlobal); - } - } else if (msg->body.setClearBias) { - mul.ClearBias(); - } - } - - __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - uint64_t 
size; - if constexpr (MM_CFG.baseMN != 0) { - size = MM_CFG.baseMN; - } else { - size = tiling_->baseM * tiling_->baseN; - } - if constexpr (PhyPosIsL1(C_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); - mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); - } else { - GlobalTensor cGlobal; - - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); - } - // Now release UB - if constexpr (PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - if (msg->body.sync == 1) { // Synchronize - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - return false; - } - - __aicore__ inline uint16_t GetInstID() - { - return instID; - } - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.enableInit) { - if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { - mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, - msgAux.msg0.orgKb, msgAux.msg0.orgKc); - } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { - mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, - msgAux.msg1.orgKb, msgAux.msg1.orgKc); - } - } - if (msg->body.isFirstIter) { - SetTensorA(msg); - SetTensorB(msg); - if constexpr (MM_CFG.enableSetBias) { - SetBias(msg); - } - if constexpr (MM_CFG.enableSetTail) { - SetTail(msg); - } - if constexpr (MM_CFG.enableQuantVector) { - SetQuantVector(msg); - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || - ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { - SetBatchNum(msg); - } - if constexpr (MM_CFG.enableSetDefineData) { - SetSelfDefineData(msg); - } - } - } - - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, - const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) - { - if (msg->body.isFirstIter) { - SetTensorA(msg, batchASize, offsetA); - SetTensorB(msg, batchBSize, offsetB); - SetBias(msg, offsetBias); - SetTail(msg); - SetQuantVector(msg); - SetBatchNum(msg); - } - } - - __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) - { - if constexpr (A_TYPE::layout == LayoutMode::NONE) { - return true; - } - // In the batch scenario, messages occupy 128 bytes. After the update, messages occupy 64 bytes. 
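// Illustrative sketch (hypothetical helper name): completion of a server-side request is signalled
// on a per-instance event ID computed exactly as in GetTensorC above — two IDs per matmul
// instance, one for each vector sub-block:
static inline uint16_t KfcEventIdSketch(uint16_t devEvtID, uint8_t subBlockIdx)
{
    // instance 0 uses events {0, 1}, instance 1 uses {2, 3}, and so on
    return static_cast<uint16_t>(devEvtID * 2 + subBlockIdx);
}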
- GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM - IterateSetMessage(msg); - uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); - - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - if (msg->body.sync || msg->body.waitIterateBatch) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - return true; - } - - __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) - { - if constexpr (!MM_CFG.isNBatch) { - return true; - } - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM - const uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; - const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; - uint64_t batchAOffset = tiling_->ALayoutInfoD * msg->body.batchA; - if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { - batchAOffset = batchAOffset * tiling_->ALayoutInfoS; - } - const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; - uint64_t batchBOffset = tiling_->BLayoutInfoD * msg->body.batchB; - if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { - batchBOffset = batchBOffset * tiling_->BLayoutInfoS; - } - const uint64_t batchCOffset = tiling_->CLayoutInfoS2; - const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; - bool layoutGCondition = tiling_->CLayoutInfoG == 1 && - (tiling_->BLayoutInfoG != 1 || tiling_->ALayoutInfoG != 1); - int32_t layoutG = tiling_->BLayoutInfoG > tiling_->ALayoutInfoG ? 
tiling_->BLayoutInfoG : tiling_->ALayoutInfoG; - int32_t batchOffsetBias = tiling_->CLayoutInfoS2 * batchC; - if (layoutGCondition) { - batchOffsetBias = batchOffsetBias / layoutG; - } - int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); - if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { - batchOffsetC = batchOffsetC * tiling_->CLayoutInfoS1; - } - uint64_t offset = 0; - uint32_t cntIterator = 0; - for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { - const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); - const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); - const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); - IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); - cntIterator++; - if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - offset += batchOffsetC; - } - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - if (msg->body.sync || msg->body.waitIterateBatch) { - NotifyEvent(eventID); - } else if (cntIterator >= INC_PROCESS_CHECK) { - NotifyEvent(eventID); - } - return true; - } - - __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { - if (msg->body.iterateFakeMsg) { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg - uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); - NotifyEvent(eventID); - return true; - } - } - } else { - ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); - } - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && - IsSameType::value) || - (IsSameType::value && (IsSameType::value || - IsSameType::value))) { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - } - IterateSetMessage(msg); - uint64_t size; - if constexpr (MM_CFG.singleCoreMN != 0) { - size = MM_CFG.singleCoreMN; - } else { - size = tiling_->singleCoreM * tiling_->singleCoreN; - } - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); - uint64_t offset = 0; - uint64_t offsetSize = 0; - auto enSequentialWrite = msg->body.enSequentialWrite; - auto enAtomic = msg->body.enAtomic; - auto sync = msg->body.sync; - auto enPartialSum = msg->body.enPartialSum; - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { - ASSERT(msg->body.cAddr != 0); // The output address must be configured. 
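// Illustrative sketch (hypothetical helper): with enSequentialWrite set, each Iterate() call yields
// one baseM x baseN tile, and the iterate loop below advances the output address by offsetSize
// elements per tile, so tile i lands at:
static inline uint64_t SequentialTileOffsetSketch(uint64_t tileIdx, uint64_t baseM, uint64_t baseN)
{
    return tileIdx * baseM * baseN;  // element offset accumulated via `offset += offsetSize`
}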
- if constexpr (MM_CFG.baseMN != 0) { - offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; - } else { - offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; - } - } else { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { - ASSERT(msg->body.cAddr != 0); // The output address must be configured. - if constexpr (MM_CFG.baseMN != 0) { - offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; - } else { - offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; - } - } else if (sync == 0) { - // For asynchronous Iterate, the offset must be used for address calculation and - // the size is baseM x baseN. - if constexpr (MM_CFG.baseMN != 0) { - offsetSize = MM_CFG.baseMN; - } else { - offsetSize = tiling_->baseM * tiling_->baseN; - } - enSequentialWrite = 1; - } - } - uint32_t cntIterator = 0; - TRACE_START(TraceId::MatMul_CALC); - // Asynchronous and configure the workspace - while (mul.Iterate(enPartialSum)) { - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { - if (unlikely(cntIterator == 0)) { - if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { - TRACE_STOP(TraceId::MatMul_CALC); - return false; // The queue is not switched, and no message needs to be returned. - } - } - } - if constexpr (PhyPosIsL1(C_TYPE::pos)) { - mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); - } else { - mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); - } - cntIterator++; - if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { - if (cntIterator < INC_PROCESS_CHECK) { - if (funID == KFC_Enum::MMFUN_ITERATE) { - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - NotifyEvent(eventID); - } - } - } - offset += offsetSize; - } - // Now release UB - if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || - PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { - if (unlikely(msg->ubAddr >= 0)) { - kfcCommSrv->FreeUB(msg->ubAddr); - } - } - - uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); - if (sync || msg->body.waitIterateAll) { - ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); - NotifyEvent(eventID); - } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { - NotifyEvent(eventID); - } - mul.End(); - TRACE_STOP(TraceId::MatMul_CALC); - return true; - } - - __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (A_TYPE::layout != LayoutMode::NONE) { - return true; - } - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && - IsSameType::value) || - (IsSameType::value && (IsSameType::value || - IsSameType::value))) { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - } - IterateSetMessage(msg); - if (mul.GetSubBlockIdx() == 0) { - return true; - } - uint64_t size; - if constexpr (MM_CFG.singleCoreMN != 0) { - size = MM_CFG.singleCoreMN; - } else { - size = tiling_->singleCoreM * tiling_->singleCoreN; - } - - GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, - msg->body.waitIterateAll, msg->body.iterateFakeMsg); - - uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); - uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); - if (msg->body.sync || 
msg->body.waitIterateAll) { - ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); - NotifyEvent(eventID0); - NotifyEvent(eventID1); - } - if (!msg->body.iterateFakeMsg) { - mul.End(); - } - TRACE_STOP(TraceId::MatMul_CALC); - return true; - } - - __aicore__ inline bool IsSharedMatmul() - { - if constexpr (MM_CFG.enableInit) { - return false; - } else { - return true; - } - } - - __aicore__ inline bool ProcessIbShareSync(KFC_Enum funID, bool& freeMsg, - int &lastMsgId, const int subBlockID) - { - if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || MM_CFG.intraBlockPartSum) { - if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { - if (lastMsgId == subBlockID) { - freeMsg = false; - return true; - } - lastMsgId = subBlockID; - return false; - } - return false; - } else { - return false; - } - } - - __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) - { - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || - ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { - if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == - static_cast(KFC_Enum::MMFUN_MASK)) { - if constexpr (MM_CFG.intraBlockPartSum) { - return IterateIntraBlockPartSum(msg, funID); - } else { - return Iterate(msg, funID); - } - } - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0)) { - if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { - return IterateBatch(msg); - } - } - if constexpr (MM_CFG.enableEnd) { - if (funID == KFC_Enum::MMFUN_END) { - mul.End(); - } - } - if constexpr (MM_CFG.enableGetTensorC) { - if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { - return GetTensorC(msg); - } - } - if constexpr (MM_CFG.enableSetOrgShape) { - if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { - SetOrgShape(msg); - return true; - } - } - if constexpr (MM_CFG.enableInit) { - if (funID == KFC_Enum::MMFUN_INIT) { - Init(msg); - return true; - } - } - if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { - if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { - return IterateNBatch(msg); - } - } - if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { - SetUserDefInfo(msg); - return true; - } - if (funID == KFC_Enum::MMFUN_SET_HF32) { - SetHF32(msg); - return true; - } - ASSERT("illegal function ID."); - return true; - } - - template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) - { - LocalTensor scmLocal; - TBuffAddr scmTbuf; - scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); - scmTbuf.dataLen = size * sizeof(DstT); - scmTbuf.bufferAddr = addr; -#if ASCENDC_CPU_DEBUG - scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; -#endif - scmLocal.SetAddr(scmTbuf); - return scmLocal; - } - -private: - MatmulImpl mul; - GM_ADDR workspace; - KfcCommServer* kfcCommSrv; - TPipe* tpipe; - TCubeTiling* tiling_; - TCubeTiling tmpTiling_; // for compatible with init interface - typename IBShareCache()>::ShareCache gCache; - typename ShareMatmulAux::MSG msgAux; - uint16_t instID; - uint16_t devEvtID; -}; -} // namespace matmul +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_server.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_SERVER_H +#define LIB_MATMUL_MATMUL_SERVER_H + +#include "../../lib/matmul/matmul.h" +#include "kernel_operator.h" + +namespace matmul { +using namespace AscendC; +template struct IBShareCache { + __aicore__ inline IBShareCache() {}; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = uint16_t; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = GlobalCache; +}; +template __aicore__ constexpr bool IsIBShare() +{ + if (A_TYPE::ibShare == true) { + return true; + } + if (B_TYPE::ibShare == true) { + return true; + } + return false; +} + +struct MatmulMsg { + uint32_t setOrgShape : 1; + uint32_t orgM; + uint32_t orgN; + uint32_t orgKa; + uint32_t orgKb; + uint32_t orgKc; +}; + +struct ShareMatmulBase { + __aicore__ inline ShareMatmulBase() {}; +}; + +struct ShareMatmul : ShareMatmulBase { + __aicore__ inline ShareMatmul(){}; + MatmulMsg msg0; + MatmulMsg msg1; +}; + +template +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmulBase; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmul; +}; + +__aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) +{ + SetAtomicNone(); +#if __CCE_AICORE__ == 220 + if ASCEND_IS_AIC { + SetMaskNorm(); + SetLoadDataBoundary((uint64_t)0); + SetLoadDataPaddingValue((uint64_t)0); + } else { + AscendCUtils::SetMask((uint64_t)-1, (uint64_t)-1); + SetMaskNorm(); + } +#endif + +#ifdef __DAV_C220_CUBE__ + ClearWorkspaceImpl(workspace); + uint16_t eventID = 3; + NotifyEvent(eventID); +#endif +} + +template > +class MatmulService { + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: + __aicore__ inline MatmulService() {} + __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) + { + ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); + ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); + ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); + ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); + this->instID = instID; + this->kfcCommSrv = kfc; + this->tpipe = tpipe; + this->workspace = workspace; + mul.SetSubBlockIdx(kfcCommSrv->subBlockID); + if constexpr (!MM_CFG.enableInit) { + msgAux.msg0.setOrgShape = false; + msgAux.msg1.setOrgShape = false; + } + this->devEvtID = instID; + if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { + if (kfcCommSrv->subBlockID == 0) { + if (tiling) { + tiling_ = (TCubeTiling *)tiling; + gCache.template Init(tiling_, tpipe); + } + } + } + if (tiling) { + tiling_ = (TCubeTiling *)tiling; + mul.Init(tiling_, tpipe); + } + } + + __aicore__ inline void Init(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + return; + } else { + ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); + 
ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); + auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); + tiling_ = &tmpTiling_; + auto temp2 = (uint32_t*)tiling_; + + constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + GlobalTensor tilingGlobal; + for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { + Barrier(); + tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); + DataCacheCleanAndInvalid(tilingGlobal); + } + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.Init(this->tiling_, this->tpipe); + } + } + + __aicore__ inline void SetSubBlockIdx(uint8_t idx) + { + mul.SetSubBlockIdx(idx); + } + + __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + if (mul.GetSubBlockIdx() == 0) { + msgAux.msg0.orgM = msg->orgShape.orgM; + msgAux.msg0.orgN = msg->orgShape.orgN; + msgAux.msg0.orgKa = msg->orgShape.orgKa; + msgAux.msg0.orgKb = msg->orgShape.orgKb; + msgAux.msg0.orgKc = msg->orgShape.orgKc; + msgAux.msg0.setOrgShape = true; + } else { + msgAux.msg1.orgM = msg->orgShape.orgM; + msgAux.msg1.orgN = msg->orgShape.orgN; + msgAux.msg1.orgKa = msg->orgShape.orgKa; + msgAux.msg1.orgKb = msg->orgShape.orgKb; + msgAux.msg1.orgKc = msg->orgShape.orgKc; + msgAux.msg1.setOrgShape = true; + } + } else { + mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, + msg->orgShape.orgKc); + } + } + + __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetTail(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) + { + mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorA) + return; + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorA) { + return; + } + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + 
aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.aAddr + offset), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) + { + if (!msg->body.setQuant) { + return; + } + int quantMode = msg->body.quantMode; + if (quantMode == 1) { + uint64_t quantScalar = msg->body.quantScalar; + mul.SetQuantScalar(quantScalar); + } else if (quantMode == 2) { + const uint64_t size = static_cast(msg->body.quantSize); + GlobalTensor quantGlobal; + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); + mul.SetQuantVector(quantGlobal); + } + } + + __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return; + } + if (!msg->body.setBatch) { + return; + } + mul.SetBatchNum(msg->body.batchA, msg->body.batchB); + } + + __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) + { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + mul.SetSelfDefineData(msg->body.dataPtr); + if constexpr (!MM_CFG.enableReuse) { + GlobalTensor dataGlobal; + dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); + DataCacheCleanAndInvalid(dataGlobal); + } + } + + __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) + { + mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorB) + return; + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorB) { + return; + } + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcT*>(msg->body.bAddr + offset), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_->singleCoreN; + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size); + mul.SetBias(biasGlobal); + } + } else if 
(msg->body.setClearBias) { + mul.ClearBias(); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_->singleCoreN; + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer( + reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.ClearBias(); + } + } + + __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + uint64_t size; + if constexpr (MM_CFG.baseMN != 0) { + size = MM_CFG.baseMN; + } else { + size = tiling_->baseM * tiling_->baseN; + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } else { + GlobalTensor cGlobal; + + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } + // Now release UB + if constexpr (PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync == 1) { // Synchronize + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return false; + } + + __aicore__ inline uint16_t GetInstID() + { + return instID; + } + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.enableInit) { + if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { + mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, + msgAux.msg0.orgKb, msgAux.msg0.orgKc); + } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { + mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, + msgAux.msg1.orgKb, msgAux.msg1.orgKc); + } + } + if (msg->body.isFirstIter) { + SetTensorA(msg); + SetTensorB(msg); + if constexpr (MM_CFG.enableSetBias) { + SetBias(msg); + } + if constexpr (MM_CFG.enableSetTail) { + SetTail(msg); + } + if constexpr (MM_CFG.enableQuantVector) { + SetQuantVector(msg); + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || + ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + SetBatchNum(msg); + } + if constexpr (MM_CFG.enableSetDefineData) { + SetSelfDefineData(msg); + } + } + } + + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, + const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) + { + if (msg->body.isFirstIter) { + SetTensorA(msg, batchASize, offsetA); + SetTensorB(msg, batchBSize, offsetB); + SetBias(msg, offsetBias); + SetTail(msg); + SetQuantVector(msg); + SetBatchNum(msg); + } + } + + __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return true; + } + // In the batch scenario, messages occupy 128 bytes. After the update, messages occupy 64 bytes. 
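+        // "+ sizeof(int64_t)" below is int64_t pointer arithmetic (8 elements x 8 bytes = 64 bytes),
+        // so the clean/invalidate refreshes the second 64-byte cache line of the message before the
+        // batch fields are read from GM.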
+ GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM + IterateSetMessage(msg); + uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync || msg->body.waitIterateBatch) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) + { + if constexpr (!MM_CFG.isNBatch) { + return true; + } + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in Batch MM + const uint64_t size = tiling_->singleCoreM * tiling_->singleCoreN; + const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; + uint64_t batchAOffset = tiling_->ALayoutInfoD * msg->body.batchA; + if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { + batchAOffset = batchAOffset * tiling_->ALayoutInfoS; + } + const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; + uint64_t batchBOffset = tiling_->BLayoutInfoD * msg->body.batchB; + if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { + batchBOffset = batchBOffset * tiling_->BLayoutInfoS; + } + const uint64_t batchCOffset = tiling_->CLayoutInfoS2; + const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; + bool layoutGCondition = tiling_->CLayoutInfoG == 1 && + (tiling_->BLayoutInfoG != 1 || tiling_->ALayoutInfoG != 1); + int32_t layoutG = tiling_->BLayoutInfoG > tiling_->ALayoutInfoG ? 
tiling_->BLayoutInfoG : tiling_->ALayoutInfoG; + int32_t batchOffsetBias = tiling_->CLayoutInfoS2 * batchC; + if (layoutGCondition) { + batchOffsetBias = batchOffsetBias / layoutG; + } + int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); + if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { + batchOffsetC = batchOffsetC * tiling_->CLayoutInfoS1; + } + uint64_t offset = 0; + uint32_t cntIterator = 0; + for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { + const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); + const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); + const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); + IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + cntIterator++; + if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + offset += batchOffsetC; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (msg->body.sync || msg->body.waitIterateBatch) { + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK) { + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { + if (msg->body.iterateFakeMsg) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg + uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); + NotifyEvent(eventID); + return true; + } + } + } else { + ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); + } + if constexpr ((IsSameType::value && IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + uint64_t size; + if constexpr (MM_CFG.singleCoreMN != 0) { + size = MM_CFG.singleCoreMN; + } else { + size = tiling_->singleCoreM * tiling_->singleCoreN; + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + uint64_t offset = 0; + uint64_t offsetSize = 0; + auto enSequentialWrite = msg->body.enSequentialWrite; + auto enAtomic = msg->body.enAtomic; + auto sync = msg->body.sync; + auto enPartialSum = msg->body.enPartialSum; + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. 
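+            // offsetSize is the per-Iterate stride into the output: baseM * baseN elements when tiles
+            // are written back sequentially, and 0 when GetTensorC is left to derive each tile's
+            // position from the iteration state (enSequentialWrite == false).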
+ if constexpr (MM_CFG.baseMN != 0) { + offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; + } + } else { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (MM_CFG.baseMN != 0) { + offsetSize = enSequentialWrite ? MM_CFG.baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_->baseM * tiling_->baseN) : 0; + } + } else if (sync == 0) { + // For asynchronous Iterate, the offset must be used for address calculation and + // the size is baseM x baseN. + if constexpr (MM_CFG.baseMN != 0) { + offsetSize = MM_CFG.baseMN; + } else { + offsetSize = tiling_->baseM * tiling_->baseN; + } + enSequentialWrite = 1; + } + } + uint32_t cntIterator = 0; + TRACE_START(TraceId::MatMul_CALC); + // Asynchronous and configure the workspace + while (mul.Iterate(enPartialSum)) { + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (unlikely(cntIterator == 0)) { + if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { + TRACE_STOP(TraceId::MatMul_CALC); + return false; // The queue is not switched, and no message needs to be returned. + } + } + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } else { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } + cntIterator++; + if constexpr ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (cntIterator < INC_PROCESS_CHECK) { + if (funID == KFC_Enum::MMFUN_ITERATE) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + } + } + offset += offsetSize; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { + NotifyEvent(eventID); + } + mul.End(); + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((IsSameType::value && IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + if (mul.GetSubBlockIdx() == 0) { + return true; + } + uint64_t size; + if constexpr (MM_CFG.singleCoreMN != 0) { + size = MM_CFG.singleCoreMN; + } else { + size = tiling_->singleCoreM * tiling_->singleCoreN; + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, + msg->body.waitIterateAll, msg->body.iterateFakeMsg); + + uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); + uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); + if (msg->body.sync || 
msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID0); + NotifyEvent(eventID1); + } + if (!msg->body.iterateFakeMsg) { + mul.End(); + } + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IsSharedMatmul() + { + if constexpr (MM_CFG.enableInit) { + return false; + } else { + return true; + } + } + + __aicore__ inline bool ProcessIbShareSync(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || MM_CFG.intraBlockPartSum) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + if (lastMsgId == subBlockID) { + freeMsg = false; + return true; + } + lastMsgId = subBlockID; + return false; + } + return false; + } else { + return false; + } + } + + __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || + ((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { + if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == + static_cast(KFC_Enum::MMFUN_MASK)) { + if constexpr (MM_CFG.intraBlockPartSum) { + return IterateIntraBlockPartSum(msg, funID); + } else { + return Iterate(msg, funID); + } + } + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0)) { + if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { + return IterateBatch(msg); + } + } + if constexpr (MM_CFG.enableEnd) { + if (funID == KFC_Enum::MMFUN_END) { + mul.End(); + } + } + if constexpr (MM_CFG.enableGetTensorC) { + if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { + return GetTensorC(msg); + } + } + if constexpr (MM_CFG.enableSetOrgShape) { + if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { + SetOrgShape(msg); + return true; + } + } + if constexpr (MM_CFG.enableInit) { + if (funID == KFC_Enum::MMFUN_INIT) { + Init(msg); + return true; + } + } + if constexpr (((MM_CFG.iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { + return IterateNBatch(msg); + } + } + if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { + SetUserDefInfo(msg); + return true; + } + if (funID == KFC_Enum::MMFUN_SET_HF32) { + SetHF32(msg); + return true; + } + ASSERT("illegal function ID."); + return true; + } + + template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) + { + LocalTensor scmLocal; + TBuffAddr scmTbuf; + scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); + scmTbuf.dataLen = size * sizeof(DstT); + scmTbuf.bufferAddr = addr; +#if ASCENDC_CPU_DEBUG + scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; +#endif + scmLocal.SetAddr(scmTbuf); + return scmLocal; + } + +private: + MatmulImpl mul; + GM_ADDR workspace; + KfcCommServer* kfcCommSrv; + TPipe* tpipe; + TCubeTiling* tiling_; + TCubeTiling tmpTiling_; // for compatible with init interface + typename IBShareCache()>::ShareCache gCache; + typename ShareMatmulAux::MSG msgAux; + uint16_t instID; + uint16_t devEvtID; +}; +} // namespace matmul #endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 2f9013ed759f9be7695532831390a71a61766a83..53fb12a4e06802738aa95b727568ed3f229beb4f 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1290,11 +1290,6 @@ void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32 // for ascend910b1 bias: gm -> l1 -> bt, need extra l1 
space, 支持bias随路转换 l1Size += tilingIns_->tiling_.get_baseN() * biasTypeSize; } - if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B && - tilingIns_->biasType_.pos != TPosition::VECCALC) { - // case1: in v100/v200 输入bias, 需要放到ub 上面参与运算, 空间大小为baseN * sizeof(biasType) - ubSize += tilingIns_->tiling_.get_baseN() * biasTypeSize; - } } // in v100/v200, nd2nz and nz2nd was simulated with intrins, need extra ub space @@ -1449,10 +1444,14 @@ void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) { biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); } + // quant tensor + if (tilingIns_->aType_.dataType == DataType::DT_INT8) { + int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t); + biasLength = max(quantLength, biasLength); + } } - int reuseCnt = 2; - transLength = max(max(a1Length, b1Length), max(c1Length / reuseCnt, biasLength / reuseCnt)); - return; + + transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); } bool MatmulTilingAlgorithm::CheckBaseMN() const @@ -2345,4 +2344,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling +} // namespace matmul_tiling \ No newline at end of file diff --git a/impl/matmul/matmul_tiling_algorithm.h b/impl/matmul/matmul_tiling_algorithm.h index 61c5d31b510ce35dd77c84249a8b642cfbdcd4fe..4e90f44f98d05f9b5f12d3870b310ea26b6369b3 100644 --- a/impl/matmul/matmul_tiling_algorithm.h +++ b/impl/matmul/matmul_tiling_algorithm.h @@ -395,4 +395,4 @@ private: }; } // namespace matmul_tiling -#endif // IMPL_MATMUL_MATMUL_TILING_ALGORITHM_H +#endif // IMPL_MATMUL_MATMUL_TILING_ALGORITHM_H \ No newline at end of file diff --git a/impl/matmul/matmul_utils.h b/impl/matmul/matmul_utils.h index 88ffc1c71e298e60bfd0973d73aba273dbaea85b..814890a2dbca48091fd3d6dc2f78d130256836cb 100644 --- a/impl/matmul/matmul_utils.h +++ b/impl/matmul/matmul_utils.h @@ -245,6 +245,11 @@ __aicore__ constexpr bool DoMatmulSpecialMDL(MatmulConfig mmCFG) return mmCFG.doSpecialMDL; } +__aicore__ constexpr bool IsSharedMatmul(MatmulConfig mmCFG) +{ + return !mmCFG.enableInit; +} + __aicore__ constexpr MatmulVersion GetMatmulVersion(MatmulConfig mmCFG) { if (DoMatmulNorm(mmCFG)) { diff --git a/lib/matmul/bmm_tiling.h b/lib/matmul/bmm_tiling.h index 83e27f02ad3ce26fb3c020a13066710c4cd8b284..9e044368de8ed40ea2e0f27955c6f682cfa4a1fe 100644 --- a/lib/matmul/bmm_tiling.h +++ b/lib/matmul/bmm_tiling.h @@ -79,4 +79,4 @@ int32_t MultiCoreMatmulGetTmpBufSize(optiling::TCubeTiling &tiling, matmul_tilin int32_t BatchMatmulGetTmpBufSize(optiling::TCubeTiling &tiling, matmul_tiling::SysTilingTempBufSize &bufSize); }; -#endif // LIB_MATMUL_BMM_TILING_H +#endif // LIB_MATMUL_BMM_TILING_H \ No newline at end of file diff --git a/lib/matmul/kernel_kfc.h b/lib/matmul/kernel_kfc.h index 451d2a3e225d7ba05a71b2a651f0cfb4c6152395..d850786c651dc0504ad83dfeffc7695d819efd64 100644 --- a/lib/matmul/kernel_kfc.h +++ b/lib/matmul/kernel_kfc.h @@ -1,407 +1,407 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. 
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file kernel_kfc.h - * \brief - */ -#ifndef LIB_MATMUL_KERNEL_KFC_H -#define LIB_MATMUL_KERNEL_KFC_H - -#if ASCENDC_CPU_DEBUG -#include -#include -#endif - -#include "kernel_operator.h" -#include "lib/matmul/matmul_client.h" -#include "lib/matmul/matmul_server.h" -namespace AscendC { -class KfcServer { // AIC side -public: - __aicore__ inline void Init(GM_ADDR workspaceGM) - { - ASSERT(workspaceGM != nullptr && "workspaceGM cannot be nullptr when init kfc server"); - - workspace = workspaceGM; - quitSize = 0; - for (int32_t i = 0; i < MIX_NUM; i++) { - kfcCommSrv[i].Init(workspace, i); // Initialize the message queue on the server. - } - } - - __aicore__ inline bool isRun() - { - // The function exits when all AIVs exit. The client sends a Quit message when the destructor ends. - return quitSize < MIX_NUM; - } - - template __aicore__ inline void Run(T& a, Args&&... b) - { - TRACE_START(TraceId::KFC_SERVER_RUN); - auto ptr = kfcCommSrv; - __gm__ KfcMsg* msg; - bool ret = true; - for (int i = 0; i < MIX_NUM;) { // Get messages of each AIV core in polling mode. - TRACE_START(TraceId::KFC_SERVER_REV_MSG); - msg = ptr->RcvMessage(); - TRACE_STOP(TraceId::KFC_SERVER_REV_MSG); - if (msg) { - // The check message is public - TRACE_START(TraceId::KFC_SERVER_PROCESS_MSG); - auto funID = KfcMsgGetFunID(msg->head); - auto srvID = static_cast(static_cast(funID) & - static_cast(KFC_Enum::SERVICE_ID_MASK)); - bool freeMsg = true; - if (srvID == KFC_Enum::SERVICE_ID_MATMUL) { - ret = RunAux(i, msg, funID, freeMsg, a, b...); - } else if (srvID == KFC_Enum::SERVICE_ID_SCM) { - if (funID == KFC_Enum::SCMFUN_GM2L1) { - ScmDataCopy(&msg->buffer); - } else if (funID == KFC_Enum::SCMFUN_GM2L1ND2NZ) { - ScmDataCopyND2NZ(&msg->buffer); - } - if (unlikely(msg->ubAddr >= 0)) { - ptr->FreeUB(msg->ubAddr); - } - } else if (funID == KFC_Enum::SERVICE_QUIT) { - quitSize++; - } else { - ASSERT("unsupported service id !"); - } - if (freeMsg) { - ptr->FreeMessage(msg); // Move the message backward by one after the message processed. - TRACE_STOP(TraceId::KFC_SERVER_PROCESS_MSG); - } else { - ptr->RollBackMsg(); - i++; - ptr++; - continue; - } - } - if (ret) { // =false, lock a queue and must wait for release. - i++; - ptr++; - } - } - TRACE_STOP(TraceId::KFC_SERVER_RUN); - } - - template __aicore__ inline void InitObj(TPipe* tpipe, T& a, Args&&... b) - { - if constexpr (sizeof(T) == sizeof(void*)) { // Skip previous invalid pointer for compatibility - InitObj(b...); - } else { - ASSERT(kfcCommSrv != nullptr && "kfc comm server cannot be nullptr when init obj"); - auto ptr = kfcCommSrv; - for (int i = 0; i < MIX_NUM; i++, ptr++) { - InitObjAux(tpipe, ptr, i, 0, a, b...); - } - } - } - - __aicore__ inline void Quit() - {} - - template __aicore__ static inline constexpr bool isTiling() - { - return sizeof(T) == sizeof(void*); - } - - template __aicore__ static T* GetTiling(T* t, Args&&... b) - { - return t; - } - -private: - template - __aicore__ inline bool RunAuxSkip(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, - T& a, Args&&... 
b) - { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - template - __aicore__ inline bool RunAux(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, T& a, Args&&... b) - { - ASSERT(msg != nullptr && "msg cannot be nullptr when kfc server run aux"); - ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); - if (a.mm.mm[0].IsSharedMatmul()) { - if (a.mm.mm[0].GetInstID() == KfcMsgGetInstID(msg->head)) { - if (a.mm.mm[0].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { - return true; - } - freeMsg = true; - a.mm.mm[0].SetSubBlockIdx(static_cast(subBlockID)); - return a.mm.mm[0].Process(msg, funID); - } else if constexpr (sizeof...(b) == 0) { - ASSERT(0); - return true; - } else if constexpr (isTiling()) { - if constexpr (sizeof...(b) > 1) { - return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); - } - } else if constexpr (sizeof...(b) >= 1) { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - return true; - } else { - if (a.mm.mm[subBlockID].GetInstID() == KfcMsgGetInstID(msg->head)) { - if (a.mm.mm[subBlockID].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { - return true; - } - freeMsg = true; - a.mm.mm[subBlockID].SetSubBlockIdx(static_cast(subBlockID)); - return a.mm.mm[subBlockID].Process(msg, funID); - } else if constexpr (sizeof...(b) == 0) { - ASSERT(0); - return true; - } else if constexpr (isTiling()) { - if constexpr (sizeof...(b) > 1) { - return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); - } - } else if constexpr (sizeof...(b) >= 1) { - return RunAux(subBlockID, msg, funID, freeMsg, b...); - } - return true; - } - } - - template - __aicore__ inline void InitObjAuxSkip(TPipe* tpipe, KfcCommServer* kfc, int subBlockID, int instID, T* a, - Args&&... b) - { - InitObjAux(tpipe, kfc, subBlockID, instID, b...); - } - - template - __aicore__ inline void InitObjAux(TPipe *tpipe, KfcCommServer *kfc, int subBlockID, int instID, T &a, Args &&...b) - { - ASSERT(kfc != nullptr && "kfc cannot be nullptr when kfc server init obj aux"); - ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); - ASSERT(tpipe != nullptr); - ASSERT(instID >= 0 && instID < MAX_MATMUL_OBJ && "matmul instID id be [0, MAX_MATMUL_OBJ)"); - - if constexpr (sizeof...(b) == 0) { - if (a.mm.mm[0].IsSharedMatmul()) { - if (subBlockID == 0) { - a.mm.mm[0].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - } - } else if constexpr (isTiling()) { - auto tiling = GetTiling(b...); - if (a.mm.mm[0].IsSharedMatmul()) { - if (subBlockID == 0) { - a.mm.mm[0].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } else { - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); - if constexpr (sizeof...(b) > 1) { - InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } else { - a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); - if constexpr (sizeof...(b) >= 1) { - InitObjAux(tpipe, kfc, subBlockID, instID + 1, b...); - } - } - } - - // Apply for two servers on the server. 
aic<->aiv 1:1 - KfcCommServer kfcCommSrv[MIX_NUM]; - GM_ADDR workspace; - uint8_t quitSize; - int lastMsgId = 1; -}; - -template -constexpr bool IsSharedMatmul() -{ - return !MM_CFG.enableInit; -} -template > -struct MatmulInstBase { - __aicore__ inline MatmulInstBase(){}; -}; -template -struct MatmulInstShared : MatmulInstBase { - __aicore__ inline MatmulInstShared(){}; - matmul::MatmulService mm[1]; -}; -template -struct MatmulInst : MatmulInstBase { - __aicore__ inline MatmulInst(){}; - matmul::MatmulService mm[MIX_NUM]; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; - using MATMUL = MatmulInstShared; -}; - -template -struct MatmulInstAux { - __aicore__ inline MatmulInstAux(){}; - using MATMUL = MatmulInst; -}; - -template > -class MatmulServiceAux { - using SrcT = typename A_TYPE::T; - using SrcAT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - using handle = __gm__ MsgGroupSyncAux*; - -public: - __aicore__ inline MatmulServiceAux() {} - typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; - - // stub functions for MatmulImpl - __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; - - __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; - __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; - __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; - __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; - - __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTranspose = false){}; - - __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, - bool isTranspose = false){}; - __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTranspose = false){}; - - __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, - bool isTranspose = false){}; - __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; - __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTranspose = false){}; - __aicore__ inline void SetTensorB(const LocalTensor& righMatrix, bool isTranspose = false){}; - __aicore__ inline void SetBias(const LocalTensor& inputBias){}; - __aicore__ inline void SetTensorA(SrcAT aScalar){}; - __aicore__ inline void SetTensorB(SrcBT bScalar){}; - __aicore__ inline void ClearBias(){}; - __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} - __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} - __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} - __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} - template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; - template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; - __aicore__ inline void End(){}; - __aicore__ inline void SetHF32(bool enHF32 = false, int32_t transMode = 0){}; - - template __aicore__ inline bool Iterate(bool enPartialSum = false) - { - return false; - }; - template - __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, - bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; - template - __aicore__ inline void IterateAll(const LocalTensor& cMatrix, uint8_t enAtomic = 0){}; - __aicore__ 
inline void WaitIterateAll() {}; - template - __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, - bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, - uint32_t dstGap = 0) {}; - template - __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, - bool enSequentialWrite = false){}; - template - __aicore__ inline void GetTensorC(const GlobalTensor &c, const LocalTensor &cLocal, - uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; - template - __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) - { - GlobalTensor global; - return global; - }; - template - __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, - const uint32_t matrixStrideC = 0) {}; - template - __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; - template - __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, - bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, - uint32_t dstGap = 0) {}; - __aicore__ inline void WaitIterateBatch() {}; - __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; - __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; - __aicore__ inline void WaitGetTensorC(){}; - template - __aicore__ inline MatrixOffset GetOffsetC() - { - if constexpr (isTurnOnDebug) { - static_assert(!isTurnOnDebug, "unsupported!"); - } - } -}; - -template -__aicore__ inline void SetMatrixKfcSkip(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, - T& mm, Args&&... b) -{ - SetMatrixKfc(pipe, kfcClient, instID, workspace, b...); -} - -template -__aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, - T& mm, Args&&... b) -{ - ASSERT((pipe != nullptr) && "pipe should not be nullptr."); - ASSERT((kfcClient != nullptr) && "kfcClient should not be nullptr."); - ASSERT((workspace != nullptr) && "workspace should not be nullptr."); - - if constexpr (sizeof...(b) == 0) { - InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); - } else if constexpr (KfcServer::isTiling()) { - auto tiling = KfcServer::GetTiling(b...); - InitKfcClient(mm, tiling, pipe, kfcClient, instID, workspace); - if constexpr (sizeof...(b) > 1) { - SetMatrixKfcSkip(pipe, kfcClient, instID + 1, workspace, b...); - } - } else { - InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); - if constexpr (sizeof...(b) >= 1) { - SetMatrixKfc(pipe, kfcClient, instID + 1, workspace, b...); - } - } -} -}; // namespace AscendC - -#endif +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file kernel_kfc.h + * \brief + */ +#ifndef LIB_MATMUL_KERNEL_KFC_H +#define LIB_MATMUL_KERNEL_KFC_H + +#if ASCENDC_CPU_DEBUG +#include +#include +#endif + +#include "kernel_operator.h" +#include "lib/matmul/matmul_client.h" +#include "lib/matmul/matmul_server.h" +namespace AscendC { +class KfcServer { // AIC side +public: + __aicore__ inline void Init(GM_ADDR workspaceGM) + { + ASSERT(workspaceGM != nullptr && "workspaceGM cannot be nullptr when init kfc server"); + + workspace = workspaceGM; + quitSize = 0; + for (int32_t i = 0; i < MIX_NUM; i++) { + kfcCommSrv[i].Init(workspace, i); // Initialize the message queue on the server. + } + } + + __aicore__ inline bool isRun() + { + // The function exits when all AIVs exit. The client sends a Quit message when the destructor ends. + return quitSize < MIX_NUM; + } + + template __aicore__ inline void Run(T& a, Args&&... b) + { + TRACE_START(TraceId::KFC_SERVER_RUN); + auto ptr = kfcCommSrv; + __gm__ KfcMsg* msg; + bool ret = true; + for (int i = 0; i < MIX_NUM;) { // Get messages of each AIV core in polling mode. + TRACE_START(TraceId::KFC_SERVER_REV_MSG); + msg = ptr->RcvMessage(); + TRACE_STOP(TraceId::KFC_SERVER_REV_MSG); + if (msg) { + // The check message is public + TRACE_START(TraceId::KFC_SERVER_PROCESS_MSG); + auto funID = KfcMsgGetFunID(msg->head); + auto srvID = static_cast(static_cast(funID) & + static_cast(KFC_Enum::SERVICE_ID_MASK)); + bool freeMsg = true; + if (srvID == KFC_Enum::SERVICE_ID_MATMUL) { + ret = RunAux(i, msg, funID, freeMsg, a, b...); + } else if (srvID == KFC_Enum::SERVICE_ID_SCM) { + if (funID == KFC_Enum::SCMFUN_GM2L1) { + ScmDataCopy(&msg->buffer); + } else if (funID == KFC_Enum::SCMFUN_GM2L1ND2NZ) { + ScmDataCopyND2NZ(&msg->buffer); + } + if (unlikely(msg->ubAddr >= 0)) { + ptr->FreeUB(msg->ubAddr); + } + } else if (funID == KFC_Enum::SERVICE_QUIT) { + quitSize++; + } else { + ASSERT("unsupported service id !"); + } + if (freeMsg) { + ptr->FreeMessage(msg); // Move the message backward by one after the message processed. + TRACE_STOP(TraceId::KFC_SERVER_PROCESS_MSG); + } else { + ptr->RollBackMsg(); + i++; + ptr++; + continue; + } + } + if (ret) { // =false, lock a queue and must wait for release. + i++; + ptr++; + } + } + TRACE_STOP(TraceId::KFC_SERVER_RUN); + } + + template __aicore__ inline void InitObj(TPipe* tpipe, T& a, Args&&... b) + { + if constexpr (sizeof(T) == sizeof(void*)) { // Skip previous invalid pointer for compatibility + InitObj(b...); + } else { + ASSERT(kfcCommSrv != nullptr && "kfc comm server cannot be nullptr when init obj"); + auto ptr = kfcCommSrv; + for (int i = 0; i < MIX_NUM; i++, ptr++) { + InitObjAux(tpipe, ptr, i, 0, a, b...); + } + } + } + + __aicore__ inline void Quit() + {} + + template __aicore__ static inline constexpr bool isTiling() + { + return sizeof(T) == sizeof(void*); + } + + template __aicore__ static T* GetTiling(T* t, Args&&... b) + { + return t; + } + +private: + template + __aicore__ inline bool RunAuxSkip(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, + T& a, Args&&... 
b) + { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + template + __aicore__ inline bool RunAux(int subBlockID, __gm__ KfcMsg* msg, KFC_Enum funID, bool& freeMsg, T& a, Args&&... b) + { + ASSERT(msg != nullptr && "msg cannot be nullptr when kfc server run aux"); + ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); + if (a.mm.mm[0].IsSharedMatmul()) { + if (a.mm.mm[0].GetInstID() == KfcMsgGetInstID(msg->head)) { + if (a.mm.mm[0].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { + return true; + } + freeMsg = true; + a.mm.mm[0].SetSubBlockIdx(static_cast(subBlockID)); + return a.mm.mm[0].Process(msg, funID); + } else if constexpr (sizeof...(b) == 0) { + ASSERT(0); + return true; + } else if constexpr (isTiling()) { + if constexpr (sizeof...(b) > 1) { + return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); + } + } else if constexpr (sizeof...(b) >= 1) { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + return true; + } else { + if (a.mm.mm[subBlockID].GetInstID() == KfcMsgGetInstID(msg->head)) { + if (a.mm.mm[subBlockID].ProcessIbShareSync(funID, freeMsg, lastMsgId, subBlockID)) { + return true; + } + freeMsg = true; + a.mm.mm[subBlockID].SetSubBlockIdx(static_cast(subBlockID)); + return a.mm.mm[subBlockID].Process(msg, funID); + } else if constexpr (sizeof...(b) == 0) { + ASSERT(0); + return true; + } else if constexpr (isTiling()) { + if constexpr (sizeof...(b) > 1) { + return RunAuxSkip(subBlockID, msg, funID, freeMsg, b...); + } + } else if constexpr (sizeof...(b) >= 1) { + return RunAux(subBlockID, msg, funID, freeMsg, b...); + } + return true; + } + } + + template + __aicore__ inline void InitObjAuxSkip(TPipe* tpipe, KfcCommServer* kfc, int subBlockID, int instID, T* a, + Args&&... b) + { + InitObjAux(tpipe, kfc, subBlockID, instID, b...); + } + + template + __aicore__ inline void InitObjAux(TPipe *tpipe, KfcCommServer *kfc, int subBlockID, int instID, T &a, Args &&...b) + { + ASSERT(kfc != nullptr && "kfc cannot be nullptr when kfc server init obj aux"); + ASSERT(subBlockID >= 0 && subBlockID < MIX_NUM && "sub block id should be [0, MIX_NUM)"); + ASSERT(tpipe != nullptr); + ASSERT(instID >= 0 && instID < MAX_MATMUL_OBJ && "matmul instID id be [0, MAX_MATMUL_OBJ)"); + + if constexpr (sizeof...(b) == 0) { + if (a.mm.mm[0].IsSharedMatmul()) { + if (subBlockID == 0) { + a.mm.mm[0].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + } + } else if constexpr (isTiling()) { + auto tiling = GetTiling(b...); + if (a.mm.mm[0].IsSharedMatmul()) { + if (subBlockID == 0) { + a.mm.mm[0].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } else { + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)tiling, kfc, instID, workspace); + if constexpr (sizeof...(b) > 1) { + InitObjAuxSkip(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } else { + a.mm.mm[subBlockID].InitKfc(tpipe, (void *)nullptr, kfc, instID, workspace); + if constexpr (sizeof...(b) >= 1) { + InitObjAux(tpipe, kfc, subBlockID, instID + 1, b...); + } + } + } + + // Apply for two servers on the server. 
aic<->aiv 1:1 + KfcCommServer kfcCommSrv[MIX_NUM]; + GM_ADDR workspace; + uint8_t quitSize; + int lastMsgId = 1; +}; + +template +constexpr bool IsSharedMatmul() +{ + return !MM_CFG.enableInit; +} +template > +struct MatmulInstBase { + __aicore__ inline MatmulInstBase(){}; +}; +template +struct MatmulInstShared : MatmulInstBase { + __aicore__ inline MatmulInstShared(){}; + matmul::MatmulService mm[1]; +}; +template +struct MatmulInst : MatmulInstBase { + __aicore__ inline MatmulInst(){}; + matmul::MatmulService mm[MIX_NUM]; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInstShared; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInst; +}; + +template > +class MatmulServiceAux { + using SrcT = typename A_TYPE::T; + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + using handle = __gm__ MsgGroupSyncAux*; + +public: + __aicore__ inline MatmulServiceAux() {} + typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; + + // stub functions for MatmulImpl + __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; + __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; + __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; + + __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTranspose = false){}; + + __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, + bool isTranspose = false){}; + __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTranspose = false){}; + + __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& righMatrix, + bool isTranspose = false){}; + __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTranspose = false){}; + __aicore__ inline void SetTensorB(const LocalTensor& righMatrix, bool isTranspose = false){}; + __aicore__ inline void SetBias(const LocalTensor& inputBias){}; + __aicore__ inline void SetTensorA(SrcAT aScalar){}; + __aicore__ inline void SetTensorB(SrcBT bScalar){}; + __aicore__ inline void ClearBias(){}; + __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} + __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; + template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; + __aicore__ inline void End(){}; + __aicore__ inline void SetHF32(bool enHF32 = false, int32_t transMode = 0){}; + + template __aicore__ inline bool Iterate(bool enPartialSum = false) + { + return false; + }; + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; + template + __aicore__ inline void IterateAll(const LocalTensor& cMatrix, uint8_t enAtomic = 0){}; + __aicore__ 
inline void WaitIterateAll() {}; + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false){}; + template + __aicore__ inline void GetTensorC(const GlobalTensor &c, const LocalTensor &cLocal, + uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + GlobalTensor global; + return global; + }; + template + __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + __aicore__ inline void WaitIterateBatch() {}; + __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; + __aicore__ inline void WaitGetTensorC(){}; + template + __aicore__ inline MatrixOffset GetOffsetC() + { + if constexpr (isTurnOnDebug) { + static_assert(!isTurnOnDebug, "unsupported!"); + } + } +}; + +template +__aicore__ inline void SetMatrixKfcSkip(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, + T& mm, Args&&... b) +{ + SetMatrixKfc(pipe, kfcClient, instID, workspace, b...); +} + +template +__aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const int32_t instID, GM_ADDR workspace, + T& mm, Args&&... 
b) +{ + ASSERT((pipe != nullptr) && "pipe should not be nullptr."); + ASSERT((kfcClient != nullptr) && "kfcClient should not be nullptr."); + ASSERT((workspace != nullptr) && "workspace should not be nullptr."); + + if constexpr (sizeof...(b) == 0) { + InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); + } else if constexpr (KfcServer::isTiling()) { + auto tiling = KfcServer::GetTiling(b...); + InitKfcClient(mm, tiling, pipe, kfcClient, instID, workspace); + if constexpr (sizeof...(b) > 1) { + SetMatrixKfcSkip(pipe, kfcClient, instID + 1, workspace, b...); + } + } else { + InitKfcClient(mm, (void*)nullptr, pipe, kfcClient, instID, workspace); + if constexpr (sizeof...(b) >= 1) { + SetMatrixKfc(pipe, kfcClient, instID + 1, workspace, b...); + } + } +} +}; // namespace AscendC + +#endif \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index fce0ccda66827b5902be9c216cff095f593a135c..9377f7b7e336ac3a85f071371fe1f2bc0e5e4017 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -21,7 +21,7 @@ #include "../../impl//matmul/matmul_macro_v220_basic_impl.h" #include "../../impl//matmul/matmul_macro_v200_impl.h" #include "../../impl/matmul/matmul_utils.h" -#include "lib/matmul/matmul_call_back.h" +#include "../../impl/matmul/matmul_call_back.h" namespace matmul { using namespace AscendC; @@ -99,8 +99,6 @@ struct MatmulParamsNorm : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -219,8 +217,6 @@ struct MatmulParamsMDL : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -359,8 +355,6 @@ struct MatmulParamsIBShareNorm : public MatmulParamsBase calcBuf_; - TPipe* tpipe_; const TCubeTiling* __restrict tiling_; __gm__ uint8_t* cacheWorkspaceAddr; @@ -554,43 +548,6 @@ struct MatmulMacroImpl; }; -template -struct IntraBlockBase { - __aicore__ inline IntraBlockBase() {}; -}; - -template -struct IntraBlock { - using SrcT = typename A_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - __aicore__ inline IntraBlock(){}; - __gm__ SrcT* aGlobal; - __gm__ SrcT* bGlobal; - __gm__ BiasT* biasGlobal; - int M; - int N; - int Ka; - int Kb; - int Kc; - int singleCoreM; - int singleCoreN; - int singleCoreK; - int mIter; - int nIter; - int kIter; - int baseUseM; - int baseUseN; - // measured in cube block - int blockUseM; - int blockUseN; - int tailM, tailK, tailN; - int cacheProcA = 0; - bool enableBias = false; - bool isTransposeA; - bool isTransposeB; - bool fakeMsg = false; -}; - template > class MatmulImpl { @@ -946,8 +903,41 @@ private: int Kc_; int32_t batchA_ = 1, batchB_ = 1; int32_t batchOuter_ = 1; - using INTRABLOCK = - typename Conditional, IntraBlockBase>::type; + + struct IntraBlockBase { + __aicore__ inline IntraBlockBase() {}; + }; + + struct IntraBlock { + __aicore__ inline IntraBlock(){}; + __gm__ SrcT* aGlobal; + __gm__ SrcT* bGlobal; + __gm__ BiasT* biasGlobal; + int M; + int N; + int Ka; + int Kb; + int Kc; + int singleCoreM; + int singleCoreN; + int singleCoreK; + int mIter; + int nIter; + int kIter; + int baseUseM; + int baseUseN; + // measured in cube block + int blockUseM; + int blockUseN; + int tailM, tailK, tailN; + int cacheProcA = 0; + bool enableBias = false; + bool isTransposeA; + bool isTransposeB; + bool fakeMsg = false; + }; + + using INTRABLOCK = typename Conditional::type; INTRABLOCK intraBlockMatmul; }; diff --git a/lib/matmul/matmul_client.h 
b/lib/matmul/matmul_client.h index 869ceadfc976461f84f46addc81efccf70b031b5..000f897b4560367c4acd9cef9498321822d5a2ed 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -16,21 +16,16 @@ #define LIB_MATMUL_MATMUL_CLIENT_H #include "lib/matmul/tiling.h" -#include "lib/matmul/matmul_call_back.h" +#include "../../impl/matmul/matmul_call_back.h" #include "../../impl/matmul/matmul_utils.h" #include "kernel_operator.h" #if ASCENDC_CPU_DEBUG -#include "lib/matmul/matmul_server.h" +#include "../../impl/matmul/matmul_server.h" #endif namespace matmul { using namespace AscendC; #if ASCENDC_CPU_DEBUG -template -constexpr bool IsSharedMatmul() -{ - return !MM_CFG.enableInit; -} template > struct MatmulInstBase { @@ -764,7 +759,13 @@ public: #if ASCENDC_CPU_DEBUG public: // this is useless code just for cpu debug - typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB>::MATMUL mm; + typename MatmulInstAux::MATMUL mm; #endif @@ -943,4 +944,4 @@ private: } }; } // namespace matmul -#endif +#endif \ No newline at end of file diff --git a/lib/matmul/matmul_intf.h b/lib/matmul/matmul_intf.h index 6fccd093d4ccd0ca32833cb14cf1cb1ca29776c0..cc1bbc38a2b91c985f1e63a949d7e1e14789cff7 100644 --- a/lib/matmul/matmul_intf.h +++ b/lib/matmul/matmul_intf.h @@ -15,7 +15,7 @@ #ifndef LIB_MATMUL_MATMUL_INTF_H #define LIB_MATMUL_MATMUL_INTF_H #if __CCE_AICORE__ == 220 -#include "lib/matmul/kernel_kfc.h" +#include "../impl/matmul/kernel_kfc.h" #else #include "lib/matmul/matmul.h" #endif @@ -124,27 +124,6 @@ using Matmul = matmul::MatmulImpl((uint64_t)-1, (uint64_t)-1); - SetMaskNorm(); - } -#endif - -#ifdef __DAV_C220_CUBE__ - ClearWorkspaceImpl(workspace); - uint16_t eventID = 3; - NotifyEvent(eventID); -#endif -} - #ifdef __DAV_C220_CUBE__ #ifdef ASCENDC_CUBE_ONLY template __aicore__ static T* GetCurTiling(T* t, Args&&... 
b) diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index 4cdef51fed7b9f28cbb39df0d959ca5f16b180f6..b03687525aae27d1d87ad531f2b5d132bd93c6ee 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -317,4 +317,4 @@ protected: }; } // namespace matmul_tiling -#endif // LIB_MATMUL_MATMUL_TILING_BASE_H +#endif // LIB_MATMUL_MATMUL_TILING_BASE_H \ No newline at end of file diff --git a/lib/matmul/matmul_tilingdata.h b/lib/matmul/matmul_tilingdata.h index e1886971ec3a460f09cd5db7e1cc77eeb2d09128..a2b3dca04db59acffbc8ed9a47f6f185b6b3fb99 100644 --- a/lib/matmul/matmul_tilingdata.h +++ b/lib/matmul/matmul_tilingdata.h @@ -71,4 +71,4 @@ TILING_DATA_FIELD_DEF(int32_t, BatchNum); TILING_DATA_FIELD_DEF(int32_t, reserved); END_TILING_DATA_DEF; } -#endif // LIB_MATMUL_MATMUL_TILINGDATA_H +#endif // LIB_MATMUL_MATMUL_TILINGDATA_H \ No newline at end of file diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 38ad988e411df5357235bb0789b730dc63d148b3..609f1753793726a0edbc84467685abecf121f9ba 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -55,10 +55,10 @@ enum class IterateOrder { ORDER_N, UNDEF, }; - -enum class ScheduleMode { - NONE = 0, - L0_MN_DB = 1, // NORM template, L0 m/n db + +enum class ScheduleType { + INNER_PRODUCT = 0, // k loop, default type + OUTER_PRODUCT, // m/n loop, depends on IterateOrder }; enum class MatmulVersion { @@ -124,14 +124,14 @@ struct MatmulConfig { bool intraBlockPartSum = false; // MDL support M/N db IterateOrder iterateOrder; - ScheduleMode scheduleMode; + ScheduleType scheduleType; bool enableDoubleCache; }; __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, - const ScheduleMode scheduleMode = ScheduleMode::NONE, const bool enUnitFlag = true) + const ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, const bool enUnitFlag = true) { return { .doNorm = true, @@ -172,7 +172,7 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = iterateOrder, - .scheduleMode = scheduleMode, + .scheduleType = scheduleType, .enableDoubleCache = false }; } @@ -221,7 +221,7 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals .enableL1CacheUB = enableL1CacheUB, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -269,7 +269,7 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -317,7 +317,7 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -366,7 +366,7 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - 
.scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = false }; } @@ -414,7 +414,7 @@ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimi .enableL1CacheUB = false, .intraBlockPartSum = false, .iterateOrder = IterateOrder::UNDEF, - .scheduleMode = ScheduleMode::NONE, + .scheduleType = ScheduleType::INNER_PRODUCT, .enableDoubleCache = isDoubleCache }; } @@ -431,4 +431,4 @@ struct MatrixOffset { }; extern int blockidx_; -#endif // LIB_MATMUL_TILING_H +#endif // LIB_MATMUL_TILING_H \ No newline at end of file