From 411ebd4150c7545bf3be17dfd504b7927db5ebe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=85=83=E6=9D=B0?= Date: Tue, 12 Aug 2025 12:00:25 +0000 Subject: [PATCH] =?UTF-8?q?=E8=A1=A5=E5=85=85=E5=AF=B9=E5=A4=96=E6=8E=A5?= =?UTF-8?q?=E5=8F=A3=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- impl/matmul/param/matmul_shape_tiling.h | 36 +-- impl/matmul/tiling/matmul_tiling_base.cpp | 1 - lib/matmul/matmul.h | 270 ++++++++++++++++++++-- lib/matmul/matmul_tiling.h | 1 - lib/matmul/tiling.h | 194 ++++++++++++---- 5 files changed, 419 insertions(+), 83 deletions(-) diff --git a/impl/matmul/param/matmul_shape_tiling.h b/impl/matmul/param/matmul_shape_tiling.h index 0c595399..c7244e8d 100644 --- a/impl/matmul/param/matmul_shape_tiling.h +++ b/impl/matmul/param/matmul_shape_tiling.h @@ -215,6 +215,23 @@ private: enable_if_t, bool> = false> __aicore__ inline void ConfigSpecificCheck() { + if constexpr (DoMatmulNorm(MM_CFG) && IMPL::AType::layout != LayoutMode::NONE) { + if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1 && + !ToMatmulConfig(MM_CFG).isBiasBatch) { + ASCENDC_ASSERT(false, { + KERNEL_LOG(KERNEL_ERROR, "Bias reuse does not support BatchMode::SINGLE_LARGE_THAN_L1"); + }); + } + +#if __CCE_AICORE__ == 220 + if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { + ASCENDC_ASSERT(tiling_.GetSingleCoreK() <= tiling_.GetBaseK(), { + KERNEL_LOG(KERNEL_ERROR, "When singleCoreK is larger than baseK, the parameter scheduleType of " + "MM_CFG should not be OUTER_PRODUCT"); + }); + } +#endif + } #if __CCE_AICORE__ < 220 // when output is int8 and ND format, do not support on the fly trans nd2nz if constexpr (IMPL::CType::format == CubeFormat::ND && !ToMatmulConfig(MM_CFG).enVecND2NZ && @@ -302,24 +319,7 @@ private: bool> = false> __aicore__ inline void ConfigSpecificCheck() { - if constexpr (IMPL::AType::layout != LayoutMode::NONE) { - ASCENDC_ASSERT(!DoMatmulMDL(MM_CFG), { KERNEL_LOG(KERNEL_ERROR, "BatchMatmul unsupport MDL."); }); - if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1 && - !ToMatmulConfig(MM_CFG).isBiasBatch) { - ASCENDC_ASSERT(false, { - KERNEL_LOG(KERNEL_ERROR, "Bias reuse does not supported BatchMode::SINGLE_LARGE_THAN_L1"); - }); - } - -#if __CCE_AICORE__ == 220 - if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - ASCENDC_ASSERT(tiling_.GetSingleCoreK() <= tiling_.GetBaseK(), { - KERNEL_LOG(KERNEL_ERROR, "When singleCoreK is larger than baseK, the parameter scheduleType of " - "MM_CFG should not be OUTER_PRODUCT"); - }); - } -#endif - } + ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Unsupported matmul config."); }); } __aicore__ inline void ConfigCommonCheck() diff --git a/impl/matmul/tiling/matmul_tiling_base.cpp b/impl/matmul/tiling/matmul_tiling_base.cpp index 07a79d6d..4efb3b2e 100644 --- a/impl/matmul/tiling/matmul_tiling_base.cpp +++ b/impl/matmul/tiling/matmul_tiling_base.cpp @@ -15,7 +15,6 @@ #include "lib/matmul/matmul_tiling_base.h" #include -#include #include #include "impl/host_log.h" diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 7562bcca..4f085048 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -24,109 +24,349 @@ namespace AscendC { +/** + * @struct MatmulApiConfig + * @brief Matmul external configuration + */ template struct MatmulApiConfig { - using AType = A_TYPE; - using BType = B_TYPE; - using CType = C_TYPE; - using BiasType = 
BIAS_TYPE; - constexpr static MatmulConfig Config = ToMatmulConfig(MM_CFG); + using AType = A_TYPE; ///< MatmulType of A matrix + using BType = B_TYPE; ///< MatmulType of B matrix + using CType = C_TYPE; ///< MatmulType of C matrix + using BiasType = BIAS_TYPE; ///< MatmulType of Bias + constexpr static MatmulConfig Config = ToMatmulConfig(MM_CFG); ///< MatmulConfig }; +/** + * @class MatmulImpl + * @brief Matmul implementation of user-defined matmul object + */ template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy), typename = void> class MatmulImpl { public: - using AType = A_TYPE; - using BType = B_TYPE; - using CType = C_TYPE; - using BiasType = BIAS_TYPE; + using AType = A_TYPE; ///< MatmulType of A matrix + using BType = B_TYPE; ///< MatmulType of B matrix + using CType = C_TYPE; ///< MatmulType of C matrix + using BiasType = BIAS_TYPE; ///< MatmulType of Bias private: - using L0cT = typename GetMmDstType::Type; - using SrcT = typename A_TYPE::T; - using SrcAT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; + using L0cT = typename GetMmDstType::Type; ///< data type of L0c + using SrcT = typename A_TYPE::T; ///< data type of input data + using SrcAT = typename A_TYPE::T; ///< data type of A matrix + using SrcBT = typename B_TYPE::T; ///< data type of B matrix + using DstT = typename C_TYPE::T; ///< data type of output data + using BiasT = typename BIAS_TYPE::T; ///< data type of bias public: __aicore__ inline MatmulImpl() {} + /** + * @brief Initialize tiling data in the Matmul object and allocate resources according to tiling parameters + * @param [in] cubeTiling: matmul tiling + * @param [in] tpipe: TPipe object + */ __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr) {} + /** + * @brief Set the full original shape M N K, in number of elements + * @param [in] orgM: size of original A matrix M-axis shape + * @param [in] orgN: size of original B matrix N-axis shape + * @param [in] orgK: size of original A/B matrix K-axis shape, only when Ka is equal to Kb + */ __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK) {} + /** + * @brief Set the full original shape M N K, in number of elements + * @param [in] orgM: size of original A matrix M-axis shape + * @param [in] orgN: size of original B matrix N-axis shape + * @param [in] orgKa: size of original A matrix K-axis shape + * @param [in] orgKb: size of original B matrix K-axis shape + * @param [in] orgKc: size of C matrix N-axis shape, only when B matrix's N and C matrix's N are different + */ __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0) {} + /** + * @brief Set the single-core shape M N K, in number of elements + * @param [in] singleM: size of M-axis shape within a single core + * @param [in] singleN: size of N-axis shape within a single core + * @param [in] singleK: size of K-axis shape within a single core + */ __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK) {} + /** + * @brief Without changing tiling, reconfigure singleCoreM, singleCoreN, singleCoreK for this computation + * @param [in] tailM: size of M-axis shape within a single core + * @param [in] tailN: size of N-axis shape within a single core + * @param [in] tailK: size of K-axis shape within a single core + */ __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1) {} + /** + * @brief Set A matrix + * @param [in] gm: A matrix in GlobalTensor + * @param [in] 
isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false) {} + /** + * @brief Set B matrix + * @param [in] gm: B matrix in GlobalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false) {} + /** + * @brief Set bias matrix + * @param [in] biasGlobal: bias matrix in GlobalTensor + */ __aicore__ inline void SetBias(const GlobalTensor& biasGlobal) {} + /** + * @brief When using MatmulCallBackFunc, set the required computation data or the storage address of data on GM + * @tparam [in] T: dataPtr data type, default is uint64_t + * @param [in] dataPtr: the required computation data or the storage address of data on GM + * @note must be called before SetTensorA and SetTensorB + */ template __aicore__ inline void SetSelfDefineData(T dataPtr) {} + /** + * @brief When using MatmulCallBackFunc, set the tiling address used by the callback function + * @param [in] tilingPtr: the tiling address + * @note only needs to be called once + */ __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + /** + * @brief Set the index matrix generated during the densification process of a sparse matrix + * @param [in] indexGlobal: the first address of the index matrix in Global Memory + */ __aicore__ inline void SetSparseIndex(const GlobalTensor& indexGlobal); + /** + * @brief Set the quantization scale for anti-quantization when A matrix's data type is half and B matrix's data type is int8 + * @param [in] offsetScalar: quantization scale for addition + * @param [in] scaleScalar: quantization scale for multiplication + */ __aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar) {} + /** + * @brief Set the quantization vector for anti-quantization when A matrix's data type is half and B matrix's data type is int8 + * @param [in] offsetTensor: quantization vector for addition + * @param [in] scaleTensor: quantization vector for multiplication + */ __aicore__ inline void SetAntiQuantVector(const LocalTensor &offsetTensor, const LocalTensor &scaleTensor) {} + /** + * @brief Set the quantization scale + * @param [in] quantScalar: quantization scale + */ __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + /** + * @brief Set the quantization vector + * @param [in] quantTensor: quantization vector + */ __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + /** + * @brief Set A matrix + * @param [in] leftMatrix: A matrix in LocalTensor + * @param [in] isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false) {} + /** + * @brief Copy data from UB to GM and then input as A matrix + * @param [in] gm: A matrix in GlobalTensor + * @param [in] leftMatrix: A matrix in LocalTensor + * @param [in] isTransposeA: whether A matrix needs to be transposed + */ __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, bool isTransposeA = false) {} + /** + * @brief Set B matrix + * @param [in] rightMatrix: B matrix in LocalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false) {} + /** + * @brief Set A matrix + * @param [in] aScalar: values set in A matrix + * @note scalar data will be expanded into a tensor of shape [1, K] + */ 
__aicore__ inline void SetTensorA(SrcAT aScalar) {} + /** + * @brief Set B matrix + * @param [in] bScalar: values set in B matrix + * @note scalar data will be expanded into a tensor of shape [1, K] + */ __aicore__ inline void SetTensorB(SrcBT bScalar) {} + /** + * @brief Copy data from UB to GM and then input as B matrix + * @param [in] gm: B matrix in GlobalTensor + * @param [in] rightMatrix: B matrix in LocalTensor + * @param [in] isTransposeB: whether B matrix needs to be transposed + */ __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& rightMatrix, bool isTransposeB = false) {} + /** + * @brief Set bias matrix + * @param [in] inputBias: bias matrix in LocalTensor + */ __aicore__ inline void SetBias(const LocalTensor& inputBias) {} + /** + * @brief Reset the batch number for Batch Matmul without changing tiling + * @param [in] batchA: batch number of A matrix + * @param [in] batchB: batch number of B matrix + */ __aicore__ inline void SetBatchNum(int32_t batchA, int32_t batchB) {} + /** + * @brief Clear bias flag, bias will not be involved in the computation + */ __aicore__ inline void DisableBias() {} + /** + * @brief Clear bias flag, bias will not be involved in the computation + * @note it is recommended to use DisableBias + */ __aicore__ inline void ClearBias() {} + /** + * @brief Calculate a C matrix of size baseM * baseN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + */ template __aicore__ inline bool Iterate(bool enPartialSum = false) { return false; } + /** + * @brief Calculate a C matrix of size baseM * baseN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] localCmatrix: the LocalTensor memory on CO1 applied for by the user, used to store the results of Iterate + */ template __aicore__ inline bool Iterate(bool enPartialSum, const LocalTensor& localCmatrix) { return false; } + /** + * @brief Calculate a C matrix of size singleCoreM * singleCoreN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: C matrix in GlobalTensor + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] waitIterateAll: whether to wait for IterateAll to complete by WaitIterateAll when in asynchronous mode + * @param [in] fakeMsg: whether to enable fake message when in IBShare or IntraBlockPartSum mode + */ template __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) {} + /** + * @brief Calculate a C matrix of size singleCoreM * singleCoreN + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] ubCmatrix: C matrix in LocalTensor + * @param [in] enAtomic: whether to enable atomic operations + */ template __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0) {} + /** + * @brief Calculate multiple C matrices of size singleCoreM * singleCoreN + * @param [in] gm: C matrix in GlobalTensor + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] matrixStrideA: offset between the starting addresses of 
adjacent ND matrices in A matrix, in terms of elements + * @param [in] matrixStrideB: offset between the starting addresses of adjacent ND matrices in B matrix, in terms of elements + * @param [in] matrixStrideC: reserved parameter + */ __aicore__ inline void IterateBatch(const GlobalTensor& gm, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) {} + /** + * @brief Calculate multiple C matrices of size singleCoreM * singleCoreN + * @param [in] ubCmatrix: C matrix in LocalTensor + * @param [in] enPartialSum: whether to accumulate the result of Iterate into CO1 data + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + * @param [in] matrixStrideA: offset between the starting addresses of adjacent ND matrices in A matrix, in terms of elements + * @param [in] matrixStrideB: offset between the starting addresses of adjacent ND matrices in B matrix, in terms of elements + * @param [in] matrixStrideC: reserved parameter + */ __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] co2Local: get C matrix to VECIN, data format only supports NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: get C matrix to GM, data format supports ND or NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief After Iterate, get one or two C matrix slices + * @tparam [in] sync: set to synchronous or asynchronous mode + * @param [in] gm: get C matrix to GM, data format only supports NZ + * @param [in] co2Local: get C matrix to VECIN, data format only supports NZ + * @param [in] enAtomic: whether to enable atomic operations + * @param [in] enSequentialWrite: whether to enable sequential write mode + */ template __aicore__ inline void GetTensorC(const GlobalTensor &gm, const LocalTensor &co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false) {} + /** + * @brief Get the position of the current fragment in the entire C matrix + * @tparam [in] isTurnOnDebug: reserved parameter + * @note reserved function + */ template __aicore__ inline MatrixOffset GetOffsetC() { return {}; } + /** + * @brief Release Matmul computation resources to prevent conflicts among multiple Matmul objects + * @note call End once when switching computations between multiple Matmul objects + */ __aicore__ inline void End() {} + /** + * @brief Set whether to enable HF32 mode + * @param [in] enableHF32: whether to enable HF32 mode + * @param [in] transMode: when HF32 mode is enabled, set the ROUND mode used when converting float to hf32 + * @note enabling HF32 mode can 
improve performance, but it may also result in a loss of precision + */ __aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0) {} + /** + * @brief Set sub-block index + * @param [in] subBlockIdx: sub-block index + */ __aicore__ inline void SetSubBlockIdx(uint8_t subBlockIdx) {} + /** + * @brief Get sub-block index + */ __aicore__ inline uint8_t GetSubBlockIdx() { return 0; } + /** + * @brief Allocate a temporary buffer for caching computation results + * @param [in] addr: workspace on GM provided by user, GM address type + * @param [in] size: number of elements + */ template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) {} + /** + * @brief Allocate a temporary buffer for caching computation results + * @param [in] addr: workspace on GM provided by user, GlobalTensor type + * @note it is recommended to use this function + */ template __aicore__ inline void SetWorkspace(GlobalTensor& addr) {} + /** + * @brief Set the starting physical address of additional VECCALC space + * @param [in] tmpBuffer: temporary space + * @note when Matmul requires additional VECCALC space and the user wants to reuse this additional space, + * the space must be pre-reserved and a LocalTensor must be allocated + */ __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {} using CallBack = MM_CB; }; diff --git a/lib/matmul/matmul_tiling.h b/lib/matmul/matmul_tiling.h index fdae6489..37a492c1 100644 --- a/lib/matmul/matmul_tiling.h +++ b/lib/matmul/matmul_tiling.h @@ -18,7 +18,6 @@ #include #include -#include #include "matmul_tiling_base.h" #include "matmul_tilingdata.h" #include "kernel_tiling/kernel_tiling.h" diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 76ae6ec9..1629f4d7 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -16,6 +16,22 @@ #define LIB_MATMUL_TILING_H #include "../../impl/matmul/utils/matmul_config_impl.h" +/** + * @brief Get normal config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @param [in] isMsgReuse: whether the dataPtr in the callback function set by SetSelfDefineData directly passes a value + * @param [in] iterateOrder: iterate order for matmul operation + * @param [in] scheduleType: set matmul data transfer mode + * @param [in] enUnitFlag: whether to enable UnitFlag + * @param [in] enableMixDualMaster: whether to enable MixDualMaster + * @param [in] isNBatchOut: set multi-batch output mode + * @return MatmulConfig with normal setting + */ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, @@ -76,6 +92,23 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f }; } +/** + * @brief Get MDL config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer 
+ * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] doMTE2Preload: enable M/N direction preload + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] isPerTensor: whether to enable B matrix per tensor quantization when the input is A16W8 + * @param [in] hasAntiQuantOffset: whether to enable B matrix quantization by using offset coefficient + * when the input is A16W8 + * @param [in] enUnitFlag: whether to enable UnitFlag + * @param [in] isMsgReuse: whether the dataPtr in the callback function set by SetSelfDefineData directly passes a value + * @param [in] enableUBReuse: whether to reuse Unified Buffer space + * @param [in] enableL1CacheUB: whether to enable L1 Buffer caching for Unified Buffer compute blocks + * @param [in] enableMixDualMaster: whether to enable MixDualMaster + * @return MatmulConfig with MDL setting + */ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false, const bool enUnitFlag = false, const bool isMsgReuse = true, @@ -135,6 +168,18 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals }; } +/** + * @brief Get special MDL config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] doMTE2Preload: enable M/N direction preload + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] isPerTensor: whether to enable B matrix per tensor quantization when the input is A16W8 + * @param [in] hasAntiQuantOffset: whether to enable B matrix quantization by using offset coefficient + * when the input is A16W8 + * @return MatmulConfig with special MDL setting + */ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const uint32_t doMTE2Preload = 0, const bool isVecND2NZ = false, bool isPerTensor = false, bool hasAntiQuantOffset = false) @@ -193,6 +238,18 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit }; } +/** + * @brief Get basic config with custom configuration + * @param [in] basicM: size of M-axis shape involved in one mmad instruction + * @param [in] basicN: size of N-axis shape involved in one mmad instruction + * @param [in] basicK: size of K-axis shape involved in one mmad instruction + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @return MatmulConfig with basic setting + */ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const uint32_t basicN, 
const uint32_t basicK, const bool intrinsicsLimit = false, const bool batchLoop = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1) @@ -251,6 +308,23 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui }; } +/** + * @brief Get special basic config with custom configuration + * @param [in] basicM: size of M-axis shape involved in one mmad instruction + * @param [in] basicN: size of N-axis shape involved in one mmad instruction + * @param [in] basicK: size of K-axis shape involved in one mmad instruction + * @param [in] singleCoreM: size of M-axis shape within a single core + * @param [in] singleCoreN: size of N-axis shape within a single core + * @param [in] singleCoreK: size of K-axis shape within a single core + * @param [in] stepM: multiple of baseM for A matrix in M-direction of A1 + * @param [in] stepN: multiple of baseN for B matrix in N-direction of B1 + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @return MatmulConfig with special basic setting + */ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, const uint32_t basicN, const uint32_t basicK, const uint32_t singleCoreM, const uint32_t singleCoreN, const uint32_t singleCoreK, const uint32_t stepM, const uint32_t stepN, const bool intrinsicsLimit = false, const bool batchLoop = false, @@ -310,6 +384,18 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c }; } +/** + * @brief Get IBShare normal config with custom configuration + * @param [in] intrinsicsLimit: whether to enable looping data transfer from Global Memory to L1 Buffer + * when the inner axis of input matrix on a single core is greater than or equal to 65535 + * @param [in] batchLoop: whether multi-batch input and multi-batch output are enabled, only for BatchMatmul + * @param [in] isVecND2NZ: whether to enable the conversion from ND to NZ format using vector instructions + * @param [in] bmmMode: set the relationship between multi-batch data in A/B matrix and the size of L1 Buffer + * when BatchMatmul Layout is NORMAL + * @param [in] isDoubleCache: whether L1 Buffer caches two blocks of data at the same time after using IBShare + * @param [in] enUnitFlag: whether to enable UnitFlag + * @return MatmulConfig with IBShare normal setting + */ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isDoubleCache = false, const bool enUnitFlag = true) @@ -373,6 +459,14 @@ constexpr MatmulConfig CFG_MDL = GetMDLConfig(); constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128); constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig(); +/** + * @brief Get matmul config with custom params + * @tparam [in] configMode: MatmulConfig template + * @tparam [in] ArgTypes: variable template params + * @param [in] args: variable params, one or more of + * MatmulShapeParams/MatmulQuantParams/MatmulBatchParams/MatmulFuncParams in any order + * @return MatmulConfig with custom setting + */ template 
__aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { MatmulConfig mmConfig = CFG_NORM; @@ -387,54 +481,58 @@ __aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { return mmConfig; } +/** + * @struct MatmulApiStaticTiling + * @brief Constant tiling struct, includes a set of constant tiling parameters and a MatmulConfig + */ struct MatmulApiStaticTiling { - int32_t usedCoreNum = -1; - int32_t M = -1; - int32_t N = -1; - int32_t Ka = -1; - int32_t Kb = -1; - int32_t singleCoreM = -1; - int32_t singleCoreN = -1; - int32_t singleCoreK = -1; - int32_t baseM = -1; - int32_t baseN = -1; - int32_t baseK = -1; - int32_t depthA1 = -1; - int32_t depthB1 = -1; - int32_t stepM = -1; - int32_t stepN = -1; - int32_t isBias = -1; - int32_t transLength = -1; - int32_t iterateOrder = -1; - int32_t shareMode = -1; - int32_t shareL1Size = -1; - int32_t shareL0CSize = -1; - int32_t shareUbSize = -1; - int32_t stepKa = -1; - int32_t stepKb = -1; - int32_t depthAL1CacheUB = -1; - int32_t depthBL1CacheUB = -1; - int32_t dbL0A = -1; - int32_t dbL0B = -1; - int32_t dbL0C = -1; - int32_t ALayoutInfoB = -1; - int32_t ALayoutInfoS = -1; - int32_t ALayoutInfoN = -1; - int32_t ALayoutInfoG = -1; - int32_t ALayoutInfoD = -1; - int32_t BLayoutInfoB = -1; - int32_t BLayoutInfoS = -1; - int32_t BLayoutInfoN = -1; - int32_t BLayoutInfoG = -1; - int32_t BLayoutInfoD = -1; - int32_t CLayoutInfoB = -1; - int32_t CLayoutInfoS1 = -1; - int32_t CLayoutInfoN = -1; - int32_t CLayoutInfoG = -1; - int32_t CLayoutInfoS2 = -1; - int32_t BatchNum = -1; - int32_t mxTypePara = -1; - MatmulConfig cfg = CFG_NORM; + int32_t usedCoreNum = -1; ///< number of AICores used + int32_t M = -1; ///< size of original A matrix M-axis shape + int32_t N = -1; ///< size of original B matrix N-axis shape + int32_t Ka = -1; ///< size of original A matrix K-axis shape + int32_t Kb = -1; ///< size of original B matrix K-axis shape + int32_t singleCoreM = -1; ///< size of M-axis shape within a single core + int32_t singleCoreN = -1; ///< size of N-axis shape within a single core + int32_t singleCoreK = -1; ///< size of K-axis shape within a single core + int32_t baseM = -1; ///< size of M-axis shape involved in one mmad instruction + int32_t baseN = -1; ///< size of N-axis shape involved in one mmad instruction + int32_t baseK = -1; ///< size of K-axis shape involved in one mmad instruction + int32_t depthA1 = -1; ///< number of baseM * baseK fully loaded in A1 + int32_t depthB1 = -1; ///< number of baseK * baseN fully loaded in B1 + int32_t stepM = -1; ///< multiple of baseM for A matrix in M-direction of A1 + int32_t stepN = -1; ///< multiple of baseN for B matrix in N-direction of B1 + int32_t isBias = -1; ///< whether to enable bias + int32_t transLength = -1; ///< size of UB temporary space during the calculation + int32_t iterateOrder = -1; ///< order of each Iterate + int32_t shareMode = -1; ///< reserved parameter + int32_t shareL1Size = -1; ///< reserved parameter + int32_t shareL0CSize = -1; ///< reserved parameter + int32_t shareUbSize = -1; ///< reserved parameter + int32_t stepKa = -1; ///< multiple of baseK for A matrix in K-direction of A1 + int32_t stepKb = -1; ///< multiple of baseK for B matrix in K-direction of B1 + int32_t depthAL1CacheUB = -1; ///< number of A matrix L1 Buffer caching for Unified Buffer compute blocks + int32_t depthBL1CacheUB = -1; ///< number of B matrix L1 Buffer caching for Unified Buffer compute blocks + int32_t dbL0A = -1; ///< whether A matrix MTE1 is using double buffer + 
int32_t dbL0B = -1; ///< whether B matrix MTE1 is using double buffer + int32_t dbL0C = -1; ///< whether MMAD is using double buffer + int32_t ALayoutInfoB = -1; ///< B-axis information of A matrix layout + int32_t ALayoutInfoS = -1; ///< S-axis information of A matrix layout + int32_t ALayoutInfoN = -1; ///< N-axis information of A matrix layout + int32_t ALayoutInfoG = -1; ///< G-axis information of A matrix layout + int32_t ALayoutInfoD = -1; ///< D-axis information of A matrix layout + int32_t BLayoutInfoB = -1; ///< B-axis information of B matrix layout + int32_t BLayoutInfoS = -1; ///< S-axis information of B matrix layout + int32_t BLayoutInfoN = -1; ///< N-axis information of B matrix layout + int32_t BLayoutInfoG = -1; ///< G-axis information of B matrix layout + int32_t BLayoutInfoD = -1; ///< D-axis information of B matrix layout + int32_t CLayoutInfoB = -1; ///< B-axis information of C matrix layout + int32_t CLayoutInfoS1 = -1; ///< S1-axis information of C matrix layout + int32_t CLayoutInfoN = -1; ///< N-axis information of C matrix layout + int32_t CLayoutInfoG = -1; ///< G-axis information of C matrix layout + int32_t CLayoutInfoS2 = -1; ///< S2-axis information of C matrix layout + int32_t BatchNum = -1; ///< batch number of Batch Matmul + int32_t mxTypePara = -1; ///< multiple of scaleA/scaleB load size in L1 compared to the load size of A/B matrix in L1 + MatmulConfig cfg = CFG_NORM; ///< MatmulConfig parameter }; -#endif // LIB_MATMUL_TILING_H \ No newline at end of file +#endif ///< LIB_MATMUL_TILING_H \ No newline at end of file -- Gitee
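Usage note (illustrative, not part of the patch): a minimal sketch of how the interfaces documented in lib/matmul/matmul.h above fit together in a kernel. It assumes the usual AscendC MatmulType helper, the public "lib/matmul_intf.h" entry header, and a TCubeTiling prepared by the host-side tiling API; the tensor names aGM/bGM/biasGM/cGM are hypothetical.

// Minimal usage sketch of the documented MatmulImpl flow (assumptions noted above).
#include "lib/matmul_intf.h"   // assumed public entry header for the matmul API

using namespace AscendC;

// MatmulType aliases (assumed library helper): ND-format half inputs, float output/bias.
using aType    = MatmulType<TPosition::GM, CubeFormat::ND, half>;
using bType    = MatmulType<TPosition::GM, CubeFormat::ND, half>;
using cType    = MatmulType<TPosition::GM, CubeFormat::ND, float>;
using biasType = MatmulType<TPosition::GM, CubeFormat::ND, float>;

__aicore__ inline void MatmulUsageSketch(const TCubeTiling& tiling, TPipe& pipe,
    const GlobalTensor<half>& aGM, const GlobalTensor<half>& bGM,
    const GlobalTensor<float>& biasGM, const GlobalTensor<float>& cGM)
{
    MatmulImpl<aType, bType, cType, biasType, CFG_NORM> mm;
    mm.Init(&tiling, &pipe);   // bind tiling data and allocate resources
    mm.SetTensorA(aGM);        // A matrix from Global Memory, not transposed
    mm.SetTensorB(bGM);        // B matrix from Global Memory, not transposed
    mm.SetBias(biasGM);        // optional bias
    mm.IterateAll(cGM);        // compute the whole singleCoreM x singleCoreN result
    mm.End();                  // release matmul resources
}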
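For the finer-grained path documented above, Iterate computes one baseM * baseN block per call and GetTensorC moves that block out; the usual pattern is a loop. This sketch reuses the hypothetical mm object and cGM tensor from the previous example.

// Block-by-block variant (illustrative): one baseM x baseN block of C per Iterate().
while (mm.Iterate()) {         // returns false once the single-core shape is fully covered
    mm.GetTensorC(cGM);        // write the current block from CO1 to Global Memory
}
mm.End();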
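The tiling.h helpers documented above are constexpr, so a customized MatmulConfig can be built at compile time and passed as the MM_CFG template argument. A hedged sketch, reusing the hypothetical MatmulType aliases from the first example; the chosen flag values are purely illustrative.

// Compile-time configuration sketch: customize the MDL template, then instantiate with it.
// Argument order follows the GetMDLConfig documentation above.
constexpr MatmulConfig kMdlCfg = GetMDLConfig(/*intrinsicsLimit=*/false,
                                              /*batchLoop=*/false,
                                              /*doMTE2Preload=*/0,
                                              /*isVecND2NZ=*/true,
                                              /*isPerTensor=*/false,
                                              /*hasAntiQuantOffset=*/false,
                                              /*enUnitFlag=*/true);

MatmulImpl<aType, bType, cType, biasType, kMdlCfg> mmMdl;   // reuses the aliases above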