From 411ebd4150c7545bf3be17dfd504b7927db5ebe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=85=83=E6=9D=B0?= Date: Tue, 12 Aug 2025 12:00:25 +0000 Subject: [PATCH] =?UTF-8?q?=E8=A1=A5=E5=85=85=E5=AF=B9=E5=A4=96=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- impl/matmul/param/matmul_shape_tiling.h | 36 +-- impl/matmul/tiling/matmul_tiling_base.cpp | 1 - lib/matmul/matmul.h | 270 ++++++++++++++++++++-- lib/matmul/matmul_tiling.h | 1 - lib/matmul/tiling.h | 194 ++++++++++++---- 5 files changed, 419 insertions(+), 83 deletions(-) diff --git a/impl/matmul/param/matmul_shape_tiling.h b/impl/matmul/param/matmul_shape_tiling.h index 0c595399..c7244e8d 100644 --- a/impl/matmul/param/matmul_shape_tiling.h +++ b/impl/matmul/param/matmul_shape_tiling.h @@ -215,6 +215,23 @@ private: enable_if_t, bool> = false> __aicore__ inline void ConfigSpecificCheck() { + if constexpr (DoMatmulNorm(MM_CFG) && IMPL::AType::layout != LayoutMode::NONE) { + if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1 && + !ToMatmulConfig(MM_CFG).isBiasBatch) { + ASCENDC_ASSERT(false, { + KERNEL_LOG(KERNEL_ERROR, "Bias reuse does not support BatchMode::SINGLE_LARGE_THAN_L1"); + }); + } + +#if __CCE_AICORE__ == 220 + if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { + ASCENDC_ASSERT(tiling_.GetSingleCoreK() <= tiling_.GetBaseK(), { + KERNEL_LOG(KERNEL_ERROR, "When singleCoreK is larger than baseK, the parameter scheduleType of " + "MM_CFG should not be OUTER_PRODUCT"); + }); + } +#endif + } #if __CCE_AICORE__ < 220 // when output is int8 and ND format, do not support on the fly trans nd2nz if constexpr (IMPL::CType::format == CubeFormat::ND && !ToMatmulConfig(MM_CFG).enVecND2NZ && @@ -302,24 +319,7 @@ private: bool> = false> __aicore__ inline void ConfigSpecificCheck() { - if constexpr (IMPL::AType::layout != LayoutMode::NONE) { - ASCENDC_ASSERT(!DoMatmulMDL(MM_CFG), { KERNEL_LOG(KERNEL_ERROR, "BatchMatmul unsupport MDL."); }); - if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1 && - !ToMatmulConfig(MM_CFG).isBiasBatch) { - ASCENDC_ASSERT(false, { - KERNEL_LOG(KERNEL_ERROR, "Bias reuse does not supported BatchMode::SINGLE_LARGE_THAN_L1"); - }); - } - -#if __CCE_AICORE__ == 220 - if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - ASCENDC_ASSERT(tiling_.GetSingleCoreK() <= tiling_.GetBaseK(), { - KERNEL_LOG(KERNEL_ERROR, "When singleCoreK is larger than baseK, the parameter scheduleType of " - "MM_CFG should not be OUTER_PRODUCT"); - }); - } -#endif - } + ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul config."); }); } __aicore__ inline void ConfigCommonCheck() diff --git a/impl/matmul/tiling/matmul_tiling_base.cpp b/impl/matmul/tiling/matmul_tiling_base.cpp index 07a79d6d..4efb3b2e 100644 --- a/impl/matmul/tiling/matmul_tiling_base.cpp +++ b/impl/matmul/tiling/matmul_tiling_base.cpp @@ -15,7 +15,6 @@ #include "lib/matmul/matmul_tiling_base.h" #include -#include #include #include "impl/host_log.h" diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 7562bcca..4f085048 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -24,109 +24,349 @@ namespace AscendC { +/** + * @struct MatmulApiConfig + * @brief Matmul external configuration + */ template struct MatmulApiConfig { - using AType = A_TYPE; - using BType = B_TYPE; - using CType = C_TYPE; - using BiasType = 
BIAS_TYPE; - constexpr static MatmulConfig Config = ToMatmulConfig(MM_CFG); + using AType = A_TYPE; ///< MatmulType of A matrix + using BType = B_TYPE; ///< MatmulType of B matrix + using CType = C_TYPE; ///< MatmulType of C matrix + using BiasType = BIAS_TYPE; ///< MatmulType of Bias + constexpr static MatmulConfig Config = ToMatmulConfig(MM_CFG); ///< MatmulConfig }; +/** + * @class MatmulImpl + * @brief Matmul implementation of user-defined matmul object + */ template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy), typename = void> class MatmulImpl { public: - using AType = A_TYPE; - using BType = B_TYPE; - using CType = C_TYPE; - using BiasType = BIAS_TYPE; + using AType = A_TYPE; ///< MatmulType of A matrix + using BType = B_TYPE; ///< MatmulType of B matrix + using CType = C_TYPE; ///< MatmulType of C matrix + using BiasType = BIAS_TYPE; ///< MatmulType of Bias private: - using L0cT = typename GetMmDstType::Type; - using SrcT = typename A_TYPE::T; - using SrcAT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; + using L0cT = typename GetMmDstType::Type; ///< data type of L0c + using SrcT = typename A_TYPE::T; ///< data type of input data + using SrcAT = typename A_TYPE::T; ///< data type of A matrix + using SrcBT = typename B_TYPE::T; ///< data type of B matrix + using DstT = typename C_TYPE::T; ///< data type of output data + using BiasT = typename BIAS_TYPE::T; ///< data type of bias public: __aicore__ inline MatmulImpl() {} + /** + * @brief Initialize tiling data in the Matmul object and allocate resources according to tiling parameters + * @param [in] cubeTiling: matmul tiling + * @param [in] tpipe: TPipe object + */ __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr) {} + /** + * @brief Set the full original shape M N K, in number of elements + * @param [in] orgM: size of original A matrix M-axis shape + * @param [in] orgN: size of original B matrix N-axis shape + * @param [in] orgK: size of original A/B matrix K-axis shape, only when Ka is equal to Kb + */ __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK) {} + /** + * @brief Set the full original shape M N K, in number of elements + * @param [in] orgM: size of original A matrix M-axis shape + * @param [in] orgN: size of original B matrix N-axis shape + * @param [in] orgKa: size of original A matrix K-axis shape + * @param [in] orgKb: size of original B matrix K-axis shape + * @param [in] orgKc: size of C matrix N-axis shape, only when B matrix's N and C matrix's N are different + */ __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0) {} + /** + * @brief Set the single-core shape M N K, in number of elements + * @param [in] singleM: size of M-axis shape within a single core + * @param [in] singleN: size of N-axis shape within a single core + * @param [in] singleK: size of K-axis shape within a single core + */ __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK) {} + /** + * @brief Without changing tiling, reconfigure singleCoreM, singleCoreN, singleCoreK for this computation + * @param [in] tailM: size of M-axis shape within a single core + * @param [in] tailN: size of N-axis shape within a single core + * @param [in] tailK: size of K-axis shape within a single core + */ __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1) {} + /** + * @brief Set A matrix + * @param [in] gm: A matrix in GlobalTensor + * @param [in] 
isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false) {} + /** + * @brief Set B matrix + * @param [in] gm: B matrix in GlobalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false) {} + /** + * @brief Set bias matrix + * @param [in] biasGlobal: bias matrix in GlobalTensor + */ __aicore__ inline void SetBias(const GlobalTensor& biasGlobal) {} + /** + * @brief When using MatmulCallBackFunc, set the required computation data or the storage address of data on GM + * @tparam [in] T: dataPtr data type, default is uint64_t + * @param [in] dataPtr: the required computation data or the storage address of data on GM + * @note must be called before SetTensorA and SetTensorB + */ template __aicore__ inline void SetSelfDefineData(T dataPtr) {} + /** + * @brief When using MatmulCallBackFunc, set the tiling address used by the callback function + * @param [in] tilingPtr: the tiling address + * @note only needs to be called once + */ __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + /** + * @brief Set the index matrix generated during the densification process of a sparse matrix + * @param [in] indexGlobal: the first address of the index matrix in Global Memory + */ __aicore__ inline void SetSparseIndex(const GlobalTensor& indexGlobal); + /** + * @brief Set the quantization scale for anti-quantization when A matrix's data type is half and B matrix's data type is int8 + * @param [in] offsetScalar: quantization scale for addition + * @param [in] scaleScalar: quantization scale for multiplication + */ __aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar) {} + /** + * @brief Set the quantization vector for anti-quantization when A matrix's data type is half and B matrix's data type is int8 + * @param [in] offsetTensor: quantization vector for addition + * @param [in] scaleTensor: quantization vector for multiplication + */ __aicore__ inline void SetAntiQuantVector(const LocalTensor &offsetTensor, const LocalTensor &scaleTensor) {} + /** + * @brief Set the quantization scale + * @param [in] quantScalar: quantization scale + */ __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + /** + * @brief Set the quantization vector + * @param [in] quantTensor: quantization vector + */ __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + /** + * @brief Set A matrix + * @param [in] leftMatrix: A matrix in LocalTensor + * @param [in] isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false) {} + /** + * @brief Copy data from UB to GM and then input as A matrix + * @param [in] gm: A matrix in GlobalTensor + * @param [in] leftMatrix: A matrix in LocalTensor + * @param [in] isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, bool isTransposeA = false) {} + /** + * @brief Set B matrix + * @param [in] rightMatrix: B matrix in LocalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false) {} + /** + * @brief Set A matrix + * @param [in] aScalar: values set in A matrix + * @note scalar data will be expanded into a tensor of shape [1, K] + */ 
__aicore__ inline void SetTensorA(SrcAT aScalar) {} + /** + * @brief Set B matrix + * @param [in] bScalar: values set in B matrix + * @note scalar data will be expanded into a tensor of shape [1, K] + */ __aicore__ inline void SetTensorB(SrcBT bScalar) {} + /** + * @brief Copy data from UB to GM and then input as B matrix + * @param [in] gm: B matrix in GlobalTensor + * @param [in] rightMatrix: B matrix in LocalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& rightMatrix, bool isTransposeB = false) {} + /** + * @brief Set bias matrix + * @param [in] inputBias: bias matrix in LocalTensor + */ __aicore__ inline void SetBias(const LocalTensor& inputBias) {} + /** + * @brief Reset the batch number for Batch Matmul without changing tiling + * @param [in] batchA: batch number of A matrix + * @param [in] batchB: batch number of B matrix + */ __aicore__ inline void SetBatchNum(int32_t batchA, int32_t batchB) {} + /** + * @brief Clear bias flag, bias will not be involved in the computation + */ __aicore__ inline void DisableBias() {} + /** + * @brief Clear bias flag, bias will not be involved in the computation + * @note it is recommended to use DisableBias + */ __aicore__ inline void ClearBias() {} + /** + * @brief Calculate a C matrix of size baseM * baseN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + */ template __aicore__ inline bool Iterate(bool enPartialSum = false) { return false; } + /** + * @brief Calculate a C matrix of size baseM * baseN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] localCmatrix: the LocalTensor memory on CO1 applied for by the user, used to store the results of Iterate + */ template __aicore__ inline bool Iterate(bool enPartialSum, const LocalTensor& localCmatrix) { return false; } + /** + * @brief Calculate a C matrix of size singleCoreM * singleCoreN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: C matrix in GlobalTensor + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] waitIterateAll: whether to wait for IterateAll to complete by WaitIterateAll when in asynchronous mode + * @param [in] fakeMsg: whether to enable fake message when in IBShare or IntraBlockPartSum mode + */ template __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) {} + /** + * @brief Calculate a C matrix of size singleCoreM * singleCoreN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] ubCmatrix: C matrix in LocalTensor + * @param [in] enAtomic: whether to enable atomic operations + */ template __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0) {} + /** + * @brief Calculate multiple C matrices of size singleCoreM * singleCoreN + * @param [in] gm: C matrix in GlobalTensor + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] matrixStrideA: offset between the starting addresses of 
adjacent ND matrices in A matrix, in terms of elements + * @param [in] matrixStrideB: offset between the starting addresses of adjacent ND matrices in B matrix, in terms of elements + * @param [in] matrixStrideC: reserved parameter + */ __aicore__ inline void IterateBatch(const GlobalTensor& gm, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) {} + /** + * @brief Calculate multiple C matrices of size singleCoreM * singleCoreN + * @param [in] ubCmatrix: C matrix in LocalTensor + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] matrixStrideA: offset between the starting addresses of adjacent ND matrices in A matrix, in terms of elements + * @param [in] matrixStrideB: offset between the starting addresses of adjacent ND matrices in B matrix, in terms of elements + * @param [in] matrixStrideC: reserved parameter + */ __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] co2Local: get C matrix to VECIN, data format only supports NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: get C matrix to GM, data format supports ND or NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: get C matrix to GM, data format only supports NZ + * @param [in] co2Local: get C matrix to VECIN, data format only supports NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const GlobalTensor &gm, const LocalTensor &co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief Get the position of the current fragment in the entire C matrix + * @tparam [in] isTurnOnDebug: reserved parameter + * @note reserved function + */ template __aicore__ inline MatrixOffset GetOffsetC() { return {}; } + /** + * @brief Release Matmul computation resources to prevent conflicts among multiple Matmul objects + * @note call End once when switching computations between multiple Matmul objects + */ __aicore__ inline void End() {} + /** + * @brief Set whether to enable HF32 mode + * @param [in] enableHF32: whether to enable HF32 mode + * @param [in] transMode: when HF32 mode is enabled, set the ROUND mode used when converting float to hf32 + * @note enabling HF32 mode can 
improve performance, but it may also result in a loss of precision + */ __aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0) {} + /** + * @brief Set sub-block index + * @param [in] subBlockIdx: sub-block index + */ __aicore__ inline void SetSubBlockIdx(uint8_t subBlockIdx) {} + /** + * @brief Get sub-block index + */ __aicore__ inline uint8_t GetSubBlockIdx() { return 0; } + /** + * @brief Allocate a temporary buffer for caching computation results + * @param [in] addr: workspace on GM provided by user, GM address type + * @param [in] size: number of elements + */ template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) {} + /** + * @brief Allocate a temporary buffer for caching computation results + * @param [in] addr: workspace on GM provided by user, GlobalTensor type + * @note it is recommended to use this function + */ template __aicore__ inline void SetWorkspace(GlobalTensor& addr) {} + /** + * @brief Set the starting physical address of additional VECCALC space + * @param [in] tmpBuffer: temporary space + * @note when Matmul requires additional VECCALC space and the user wants to reuse this additional space, + * the space must be pre-reserved and a LocalTensor must be allocated + */ __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {} using CallBack = MM_CB; }; diff --git a/lib/matmul/matmul_tiling.h b/lib/matmul/matmul_tiling.h index fdae6489..37a492c1 100644 --- a/lib/matmul/matmul_tiling.h +++ b/lib/matmul/matmul_tiling.h @@ -18,7 +18,6 @@ #include #include -#include #include "matmul_tiling_base.h" #include "matmul_tilingdata.h" #include "kernel_tiling/kernel_tiling.h" diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 76ae6ec9..1629f4d7 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -16,6 +16,22 @@ #define LIB_MATMUL_TILING_H #include "../../impl/matmul/utils/matmul_config_impl.h" +/** + * @brief Get normal config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @param [in] isMsgReuse: whether the dataPtr in the callback function set by SetSelfDefineData directly passes a value + * @param [in] iterateOrder: iterate order for matmul operation + * @param [in] scheduleType: set matmul data transfer mode + * @param [in] enUnitFlag: whether to enable UnitFlag + * @param [in] enableMixDualMaster: whether to enable MixDualMaster + * @param [in] isNBatchOut: set multi-batch output mode + * @return MatmulConfig with normal setting + */ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, @@ -76,6 +92,23 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f }; } +/** + * @brief Get MDL config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer 
+ * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] doMTE2Preload: enable M/N direction preload + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] isPerTensor: whether to enable B matrix per tensor quantization when the input is A16W8 + * @param [in] hasAntiQuantOffset: whether to enable B matrix quantization by using offset coefficient + * when the input is A16W8 + * @param [in] enUnitFlag: whether to enable UnitFlag + * @param [in] isMsgReuse: whether the dataPtr in the callback function set by SetSelfDefineData directly passes a value + * @param [in] enableUBReuse: whether to reuse Unified Buffer space + * @param [in] enableL1CacheUB: whether to enable L1 Buffer caching for Unified Buffer compute blocks + * @param [in] enableMixDualMaster: whether to enable MixDualMaster + * @return MatmulConfig with MDL setting + */ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false, const bool enUnitFlag = false, const bool isMsgReuse = true, @@ -135,6 +168,18 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals }; } +/** + * @brief Get special MDL config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] doMTE2Preload: enable M/N direction preload + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] isPerTensor: whether to enable B matrix per tensor quantization when the input is A16W8 + * @param [in] hasAntiQuantOffset: whether to enable B matrix quantization by using offset coefficient + * when the input is A16W8 + * @return MatmulConfig with special MDL setting + */ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false) @@ -193,6 +238,18 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit }; } +/** + * @brief Get basic config with custom configuration + * @param [in] basicM: size of M-axis shape involved in one mmad instruction + * @param [in] basicN: size of N-axis shape involved in one mmad instruction + * @param [in] basicK: size of K-axis shape involved in one mmad instruction + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @return MatmulConfig with basic setting + */ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const uint32_t basicN, 
const uint32_t basicK, const bool intrinsicsLimit = false, const bool batchLoop = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) @@ -251,6 +308,23 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui }; } +/** + * @brief Get special basic config with custom configuration + * @param [in] basicM: size of M-axis shape involved in one mmad instruction + * @param [in] basicN: size of N-axis shape involved in one mmad instruction + * @param [in] basicK: size of K-axis shape involved in one mmad instruction + * @param [in] singleCoreM: size of M-axis shape within a single core + * @param [in] singleCoreN: size of N-axis shape within a single core + * @param [in] singleCoreK: size of K-axis shape within a single core + * @param [in] stepM: multiple of baseM for A matrix in M-direction of A1 + * @param [in] stepN: multiple of baseN for B matrix in N-direction of B1 + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @return MatmulConfig with special basic setting + */ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, const uint32_t basicN, const uint32_t basicK, const uint32_t singleCoreM, const uint32_t singleCoreN, const uint32_t singleCoreK, const uint32_t stepM, const uint32_t stepN, const bool intrinsicsLimit = false, const bool batchLoop = false, @@ -310,6 +384,18 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c }; } +/** + * @brief Get IBShare normal config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @param [in] isDoubleCache: whether L1 Buffer caches two blocks of data at the same time after using IBShare + * @param [in] enUnitFlag: whether to enable UnitFlag + * @return MatmulConfig with IBShare normal setting + */ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isDoubleCache = false, const bool enUnitFlag = true) @@ -373,6 +459,14 @@ constexpr MatmulConfig CFG_MDL = GetMDLConfig(); constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128); constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig(); +/** + * @brief Get matmul config with custom params + * @tparam [in] configMode: MatmulConfig template + * @tparam [in] ArgTypes: variable template params + * @param [in] args: variable params, one or more of + * MatmulShapeParams/MatmulQuantParams/MatmulBatchParams/MatmulFuncParams in any order + * @return MatmulConfig with custom setting + */ template 
__aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { MatmulConfig mmConfig = CFG_NORM; @@ -387,54 +481,58 @@ __aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { return mmConfig; } +/** + * @struct MatmulApiStaticTiling + * @brief Constant tiling struct, includes a set of constant tiling parameters and a MatmulConfig + */ struct MatmulApiStaticTiling { - int32_t usedCoreNum = -1; - int32_t M = -1; - int32_t N = -1; - int32_t Ka = -1; - int32_t Kb = -1; - int32_t singleCoreM = -1; - int32_t singleCoreN = -1; - int32_t singleCoreK = -1; - int32_t baseM = -1; - int32_t baseN = -1; - int32_t baseK = -1; - int32_t depthA1 = -1; - int32_t depthB1 = -1; - int32_t stepM = -1; - int32_t stepN = -1; - int32_t isBias = -1; - int32_t transLength = -1; - int32_t iterateOrder = -1; - int32_t shareMode = -1; - int32_t shareL1Size = -1; - int32_t shareL0CSize = -1; - int32_t shareUbSize = -1; - int32_t stepKa = -1; - int32_t stepKb = -1; - int32_t depthAL1CacheUB = -1; - int32_t depthBL1CacheUB = -1; - int32_t dbL0A = -1; - int32_t dbL0B = -1; - int32_t dbL0C = -1; - int32_t ALayoutInfoB = -1; - int32_t ALayoutInfoS = -1; - int32_t ALayoutInfoN = -1; - int32_t ALayoutInfoG = -1; - int32_t ALayoutInfoD = -1; - int32_t BLayoutInfoB = -1; - int32_t BLayoutInfoS = -1; - int32_t BLayoutInfoN = -1; - int32_t BLayoutInfoG = -1; - int32_t BLayoutInfoD = -1; - int32_t CLayoutInfoB = -1; - int32_t CLayoutInfoS1 = -1; - int32_t CLayoutInfoN = -1; - int32_t CLayoutInfoG = -1; - int32_t CLayoutInfoS2 = -1; - int32_t BatchNum = -1; - int32_t mxTypePara = -1; - MatmulConfig cfg = CFG_NORM; + int32_t usedCoreNum = -1; ///< number of AICores used + int32_t M = -1; ///< size of original A matrix M-axis shape + int32_t N = -1; ///< size of original B matrix N-axis shape + int32_t Ka = -1; ///< size of original A matrix K-axis shape + int32_t Kb = -1; ///< size of original B matrix K-axis shape + int32_t singleCoreM = -1; ///< size of M-axis shape within a single core + int32_t singleCoreN = -1; ///< size of N-axis shape within a single core + int32_t singleCoreK = -1; ///< size of K-axis shape within a single core + int32_t baseM = -1; ///< size of M-axis shape involved in one mmad instruction + int32_t baseN = -1; ///< size of N-axis shape involved in one mmad instruction + int32_t baseK = -1; ///< size of K-axis shape involved in one mmad instruction + int32_t depthA1 = -1; ///< number of baseM * baseK fully loaded in A1 + int32_t depthB1 = -1; ///< number of baseK * baseN fully loaded in B1 + int32_t stepM = -1; ///< multiple of baseM for A matrix in M-direction of A1 + int32_t stepN = -1; ///< multiple of baseN for B matrix in N-direction of B1 + int32_t isBias = -1; ///< whether to enable bias + int32_t transLength = -1; ///< size of UB temporary space during the calculation + int32_t iterateOrder = -1; ///< order of each Iterate + int32_t shareMode = -1; ///< reserved parameter + int32_t shareL1Size = -1; ///< reserved parameter + int32_t shareL0CSize = -1; ///< reserved parameter + int32_t shareUbSize = -1; ///< reserved parameter + int32_t stepKa = -1; ///< multiple of baseK for A matrix in K-direction of A1 + int32_t stepKb = -1; ///< multiple of baseK for B matrix in K-direction of B1 + int32_t depthAL1CacheUB = -1; ///< number of A matrix L1 Buffer caching for Unified Buffer compute blocks + int32_t depthBL1CacheUB = -1; ///< number of B matrix L1 Buffer caching for Unified Buffer compute blocks + int32_t dbL0A = -1; ///< whether A matrix MTE1 is using double buffer + 
int32_t dbL0B = -1; ///< whether B matrix MTE1 is using double buffer + int32_t dbL0C = -1; ///< whether MMAD is using double buffer + int32_t ALayoutInfoB = -1; ///< B-axis information of A matrix layout + int32_t ALayoutInfoS = -1; ///< S-axis information of A matrix layout + int32_t ALayoutInfoN = -1; ///< N-axis information of A matrix layout + int32_t ALayoutInfoG = -1; ///< G-axis information of A matrix layout + int32_t ALayoutInfoD = -1; ///< D-axis information of A matrix layout + int32_t BLayoutInfoB = -1; ///< B-axis information of B matrix layout + int32_t BLayoutInfoS = -1; ///< S-axis information of B matrix layout + int32_t BLayoutInfoN = -1; ///< N-axis information of B matrix layout + int32_t BLayoutInfoG = -1; ///< G-axis information of B matrix layout + int32_t BLayoutInfoD = -1; ///< D-axis information of B matrix layout + int32_t CLayoutInfoB = -1; ///< B-axis information of C matrix layout + int32_t CLayoutInfoS1 = -1; ///< S1-axis information of C matrix layout + int32_t CLayoutInfoN = -1; ///< N-axis information of C matrix layout + int32_t CLayoutInfoG = -1; ///< G-axis information of C matrix layout + int32_t CLayoutInfoS2 = -1; ///< S2-axis information of C matrix layout + int32_t BatchNum = -1; ///< batch number of Batch Matmul + int32_t mxTypePara = -1; ///< multiple of scaleA/scaleB load size in L1 compared to the load size of A/B matrix in L1 + MatmulConfig cfg = CFG_NORM; ///< MatmulConfig parameter }; -#endif // LIB_MATMUL_TILING_H \ No newline at end of file +#endif ///< LIB_MATMUL_TILING_H \ No newline at end of file -- Gitee
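Usage note (illustrative, not part of the patch): a minimal sketch of how the interfaces documented in lib/matmul/matmul.h above fit together in a kernel. It assumes the usual AscendC MatmulType helper, the public "lib/matmul_intf.h" entry header, and a TCubeTiling prepared by the host-side tiling API; the tensor names aGM/bGM/biasGM/cGM are hypothetical.

// Minimal usage sketch of the documented MatmulImpl flow (assumptions noted above).
#include "lib/matmul_intf.h"   // assumed public entry header for the matmul API

using namespace AscendC;

// MatmulType aliases (assumed library helper): ND-format half inputs, float output/bias.
using aType    = MatmulType<TPosition::GM, CubeFormat::ND, half>;
using bType    = MatmulType<TPosition::GM, CubeFormat::ND, half>;
using cType    = MatmulType<TPosition::GM, CubeFormat::ND, float>;
using biasType = MatmulType<TPosition::GM, CubeFormat::ND, float>;

__aicore__ inline void MatmulUsageSketch(const TCubeTiling& tiling, TPipe& pipe,
    const GlobalTensor<half>& aGM, const GlobalTensor<half>& bGM,
    const GlobalTensor<float>& biasGM, const GlobalTensor<float>& cGM)
{
    MatmulImpl<aType, bType, cType, biasType, CFG_NORM> mm;
    mm.Init(&tiling, &pipe);   // bind tiling data and allocate resources
    mm.SetTensorA(aGM);        // A matrix from Global Memory, not transposed
    mm.SetTensorB(bGM);        // B matrix from Global Memory, not transposed
    mm.SetBias(biasGM);        // optional bias
    mm.IterateAll(cGM);        // compute the whole singleCoreM x singleCoreN result
    mm.End();                  // release matmul resources
}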
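For the finer-grained path documented above, Iterate computes one baseM * baseN block per call and GetTensorC moves that block out; the usual pattern is a loop. This sketch reuses the hypothetical mm object and cGM tensor from the previous example.

// Block-by-block variant (illustrative): one baseM x baseN block of C per Iterate().
while (mm.Iterate()) {         // returns false once the single-core shape is fully covered
    mm.GetTensorC(cGM);        // write the current block from CO1 to Global Memory
}
mm.End();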
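The tiling.h helpers documented above are constexpr, so a customized MatmulConfig can be built at compile time and passed as the MM_CFG template argument. A hedged sketch, reusing the hypothetical MatmulType aliases from the first example; the chosen flag values are purely illustrative.

// Compile-time configuration sketch: customize the MDL template, then instantiate with it.
// Argument order follows the GetMDLConfig documentation above.
constexpr MatmulConfig kMdlCfg = GetMDLConfig(/*intrinsicsLimit=*/false,
                                              /*batchLoop=*/false,
                                              /*doMTE2Preload=*/0,
                                              /*isVecND2NZ=*/true,
                                              /*isPerTensor=*/false,
                                              /*hasAntiQuantOffset=*/false,
                                              /*enUnitFlag=*/true);

MatmulImpl<aType, bType, cType, biasType, kMdlCfg> mmMdl;   // reuses the aliases above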