From d9b9627203d85028c77e9a0082dbd4682e9cd684 Mon Sep 17 00:00:00 2001
From: "@mr-zql"
Date: Mon, 25 Aug 2025 11:50:21 +0800
Subject: [PATCH 1/2] add a comment

---
 lib/matmul/matmul_config.h | 211 ++++++++++++++++++++++---------------
 1 file changed, 126 insertions(+), 85 deletions(-)

diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h
index ab7711fd..d923857e 100644
--- a/lib/matmul/matmul_config.h
+++ b/lib/matmul/matmul_config.h
@@ -27,127 +27,168 @@ constexpr uint32_t SHARED_CO1_BUFFER_SIZE = 64 * 1024;
 
+/**
+ * @enum class CubeFormat
+ * @brief Physical data layout formats
+ */
 enum class CubeFormat {
-    ND = 0,
-    NZ,
-    ZN,
-    ZZ,
-    NN,
-    ND_ALIGN,
-    SCALAR,
-    VECTOR,
-    ROW_MAJOR = ND, // ND
-    COLUMN_MAJOR = 8, // DN
+    ND = 0, // Standard format, row-major
+    NZ, // Column-major between blocks, row-major within blocks
+    ZN, // Row-major between blocks, column-major within blocks
+    ZZ, // Row-major between blocks, row-major within blocks
+    NN, // Column-major between blocks, column-major within blocks
+    ND_ALIGN, // Aligned ND format; the output is aligned to 32-byte boundaries along the N dimension
+    SCALAR, // A single numerical value with no dimensions
+    VECTOR, // 1D array format for vector data
+    ROW_MAJOR = ND, // Row-major format (equivalent to ND format)
+    COLUMN_MAJOR = 8, // Column-major format (DN format)
 };
 
+/**
+ * @enum class LayoutMode
+ * @brief Matrix layout modes
+ */
 enum class LayoutMode {
-    NONE = 0,
-    BSNGD,
-    SBNGD,
-    BNGS1S2,
-    NORMAL
+    NONE = 0, // Default value, indicating that BatchMatmul is not used
+    BSNGD, // Reshaped format from the original BSH shape
+    SBNGD, // Reshaped format from the original SBH shape
+    BNGS1S2, // Typically the output of matrix multiplication for the first two layouts
+    NORMAL // General data format with BMNK layout
 };
 
+/**
+ * @enum class BatchMode
+ * @brief Memory scheduling modes for batch processing
+ *
+ * When the Layout type is NORMAL in a BatchMatmul scenario, this describes the relationship between the total
+ * size of the multi-batch data of the BatchMatmul input matrices A/B and the L1 buffer size.
+ */
 enum class BatchMode {
-    NONE = 0,
-    BATCH_LESS_THAN_L1,
-    BATCH_LARGE_THAN_L1,
-    SINGLE_LARGE_THAN_L1
+    NONE = 0, // No specific batch processing mode
+    BATCH_LESS_THAN_L1, // Total size of multi-batch data < L1 buffer size
+    BATCH_LARGE_THAN_L1, // Total size of multi-batch data > L1 buffer size
+    SINGLE_LARGE_THAN_L1 // Size of single-batch data > L1 buffer size
 };
 
+/**
+ * @enum class IterateOrder
+ * @brief The loop iteration order for Matmul matrix operations
+ */
 enum class IterateOrder {
-    ORDER_M = 0,
-    ORDER_N,
-    UNDEF,
+    ORDER_M = 0, // Offset along the M-axis first, then along the N-axis
+    ORDER_N, // Offset along the N-axis first, then along the M-axis
+    UNDEF, // Currently invalid
 };
 
+/**
+ * @enum class ScheduleType
+ * @brief Configures the Matmul data transfer mode
+ */
 enum class ScheduleType {
     INNER_PRODUCT = 0, // k loop, default type
     OUTER_PRODUCT, // m/n loop, depends on IterateOrder
 };
 
+/**
+ * @enum class MatmulVersion
+ * @brief Different implementation versions of the Matmul operator
+ */
 enum class MatmulVersion {
-    NORMAL = 0,
-    MULTI_DATA_LOAD,
-    BASIC_BLOCK,
-    IBSHARE_NORM,
+    NORMAL = 0, // Basic version
+    MULTI_DATA_LOAD, // Multiple data preloading
+    BASIC_BLOCK, // Block computation using basic blocks
+    IBSHARE_NORM, // Inner block sharing and normalization fusion optimization
 };
 
+/**
+ * @enum IterateMode
+ * @brief Iterate interface usage modes, used to reduce Matmul computation overhead
+ */
 enum IterateMode : uint8_t {
-    ITERATE_MODE_NORMAL = 0b00000001,
-    ITERATE_MODE_ALL = 0b00000010,
-    ITERATE_MODE_BATCH = 0b00000100,
-    ITERATE_MODE_N_BATCH = 0b00001000,
-    ITERATE_MODE_DEFAULT = 0b11111111,
+    ITERATE_MODE_NORMAL = 0b00000001, // Only the Iterate interface is called in Matmul computation
+    ITERATE_MODE_ALL = 0b00000010, // Only the IterateAll interface is called in Matmul computation
+    ITERATE_MODE_BATCH = 0b00000100, // Only the IterateBatch interface is called in Matmul computation
+    ITERATE_MODE_N_BATCH = 0b00001000, // Only the IterateNBatch interface is called in Matmul computation
+    ITERATE_MODE_DEFAULT = 0b11111111, // Default mode with all modes enabled
 };
 
+/**
+ * @struct MatmulConfig
+ * @brief Core configuration for the Matmul operator
+ *
+ * Integrates all control parameters for matrix multiplication computation.
+ */
 struct MatmulConfig {
-    bool doNorm;
-    bool doBasicBlock;
-    bool doMultiDataLoad;
+    bool doNorm; // true (default): enables the Norm template; false: disables it
+    bool doBasicBlock; // true: enables the BasicBlock template; false: disables it
+    bool doMultiDataLoad; // true: enables the MultiDataLoad (MDL) template; false: disables it
     // basic MNK could only be valid in basic block mode
-    uint32_t basicM;
-    uint32_t basicN;
-    uint32_t basicK;
-    bool intrinsicsCheck;
-    bool isNBatch;
-    bool enVecND2NZ;
+    uint32_t basicM; // M dimension size of the basic block
+    uint32_t basicN; // N dimension size of the basic block
+    uint32_t basicK; // K dimension size of the basic block
+    bool intrinsicsCheck; // true: enables loop-based data loading; false: disables it
+    bool isNBatch; // true: enables multi-batch; false (default): disables multi-batch
+    bool enVecND2NZ; // true: enables ND2NZ with vector instructions; false: disables it
     // only be valid in special basic block mode
-    bool doSpecialBasicBlock;
-    uint32_t doMTE2Preload;
-    uint32_t singleCoreM;
-    uint32_t singleCoreN;
-    uint32_t singleCoreK;
-    uint32_t stepM;
-    uint32_t stepN;
-    uint32_t baseMN;
-    uint32_t singleCoreMN;
-    bool enUnitFlag = true;
+    bool doSpecialBasicBlock; // true: enables the SpecialBasicBlock template; false: disables it
+    uint32_t doMTE2Preload; // 0: disabled (default), 1: enable M-dimension preload, 2: enable N-dimension preload
+    uint32_t singleCoreM; // Size of the M-axis shape within a single core, in units of elements
+    uint32_t singleCoreN; // Size of the N-axis shape within a single core, in units of elements
+    uint32_t singleCoreK; // Size of the K-axis shape within a single core, in units of elements
+    uint32_t stepM; // The multiple of baseM in the bufferM direction for the left matrix cached in A1
+    uint32_t stepN; // The multiple of baseN in the bufferN direction for the right matrix cached in B1
+    uint32_t baseMN; // Size of baseM * baseN
+    uint32_t singleCoreMN; // Size of singleCoreM * singleCoreN
+    bool enUnitFlag = true; // true: enables UnitFlag; false: disables UnitFlag
     // AntiQuant Param
-    bool isPerTensor;
-    bool hasAntiQuantOffset;
-    bool doIBShareNorm;
+    bool isPerTensor; // B matrix quantization is per-tensor when enabled (for A(half) and B(int8_t) input)
+    bool hasAntiQuantOffset; // Whether to use an offset coefficient for B matrix quantization (A: half, B: int8_t)
+    bool doIBShareNorm; // true: enables IBShare; false: disables IBShare
     // MDL support stepN == 2
-    bool doSpecialMDL;
-    bool enableInit = true;
-    BatchMode batchMode;
+    bool doSpecialMDL; // true: enables the SpecialMDL template; false: disables it
+    bool enableInit = true; // true: enables the Init function
+    BatchMode batchMode; // For BatchMatmul with NORMAL layout: relationship between the total size of multi-batch A/B data and the L1 buffer size
     // Add for process performance
-    bool enableEnd = true;
-    bool enableGetTensorC = true;
-    bool enableSetOrgShape = true;
-    bool enableSetBias = true;
-    bool enableSetTail = true;
-    bool enableQuantVector = true;
-    bool enableSetDefineData = true;
-    uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT;
-    bool enableReuse = true;
+    bool enableEnd = true; // true: the End function needs to be called during Matmul computation
+    bool enableGetTensorC = true; // true: the GetTensorC function needs to be called during Matmul computation
+    bool enableSetOrgShape = true; // true: the SetOrgShape function needs to be called during Matmul computation
+    bool enableSetBias = true; // true (default): enables bias computation
+    bool enableSetTail = true; // true (default): SetTail is called during Matmul computation
+    bool enableQuantVector = true; // true (default): SetQuantVector and SetQuantScalar are called during Matmul computation
+    bool enableSetDefineData = true; // Whether to set defined data
+    uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT; // No limit on calls to the Iterate interfaces (default)
+    bool enableReuse = true; // Whether to enable data reuse
     // enable UB reuse(ND2NZ & ND2NZ) for V200
-    bool enableUBReuse;
-    bool enableL1CacheUB;
+    bool enableUBReuse; // Whether to enable Unified Buffer reuse
+    bool enableL1CacheUB; // Whether to enable L1 Buffer caching for Unified Buffer compute blocks
     // for intra-block l0c add
-    bool intraBlockPartSum = false;
+    bool intraBlockPartSum = false; // false (default): disables accumulation of two AIV cores' results in the L0C Buffer
     // MDL support M/N db
-    IterateOrder iterateOrder { IterateOrder::UNDEF };
+    IterateOrder iterateOrder { IterateOrder::UNDEF }; // Loop iteration order; UNDEF (default) is currently invalid
-    ScheduleType scheduleType;
-    bool enableDoubleCache;
-    bool isBiasBatch = true;
-    bool enableStaticPadZeros = false;
-    bool isPartialOutput = false;
-    bool enableMixDualMaster = false;
-    bool isA2B2Shared = false;
-    bool isEnableChannelSplit = false;
-    bool enableKdimReorderLoad = false;
-    bool isCO1Shared = false;
-    uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE;
-    bool isNBatchOut = false;
+    ScheduleType scheduleType; // Configures the data transfer mode for Matmul
+    bool enableDoubleCache; // When IBShare is enabled, whether to cache two data blocks in the L1 Buffer simultaneously
+    bool isBiasBatch = true; // true (default): bias includes the Batch dimension
+    bool enableStaticPadZeros = false; // false (default): no automatic zero-fill during transfer; the user must pad manually
+    bool isPartialOutput = false; // false: disables the PartialOutput feature
+    bool enableMixDualMaster = false; // Whether to enable MixDualMaster (default: false)
+    bool isA2B2Shared = false; // Whether to enable global management for A2 and B2; false: disabled (default)
+    bool isEnableChannelSplit = false; // Whether to enable the channel_split functionality; false: disabled (default)
+    bool enableKdimReorderLoad = false; // false (default): disables staggered K-axis data loading
+    bool isCO1Shared = false; // Whether to enable CO1 memory sharing; false: disabled (default)
+    uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE; // Specifies the size of the shared CO1 buffer
+    bool isNBatchOut = false; // Whether the Nth batch has been output
 };
 
+/**
+ * @enum class MatmulConfigMode
+ * @brief Specifies the MatmulConfig template to be retrieved and modified
+ */
 enum class MatmulConfigMode {
-    CONFIG_NORM,
-    CONFIG_MDL,
-    CONFIG_SPECIALMDL,
-    CONFIG_IBSHARE
+    CONFIG_NORM, // Set the MatmulConfig default values to the Norm template
+    CONFIG_MDL, // Set the MatmulConfig default values to the MDL template
+    CONFIG_SPECIALMDL, // Set the MatmulConfig default values to the SpecialMDL template
+    CONFIG_IBSHARE // Set the MatmulConfig default values to the IBShare template
 };
 
 struct MatmulShapeParams {
-- 
Gitee

From 8281f975d2a868f7c66539df3fa0fdd2ea902e3e Mon Sep 17 00:00:00 2001
From: "@mr-zql"
Date: Mon, 8 Sep 2025 16:21:16 +0800
Subject: [PATCH 2/2] fix a word spelling error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 impl/matmul/tiling/matmul_constant_tiling_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impl/matmul/tiling/matmul_constant_tiling_utils.h b/impl/matmul/tiling/matmul_constant_tiling_utils.h
index a1f4e64f..d1a46b1f 100644
--- a/impl/matmul/tiling/matmul_constant_tiling_utils.h
+++ b/impl/matmul/tiling/matmul_constant_tiling_utils.h
@@ -39,7 +39,7 @@ constexpr int32_t L1_SIZE = 512 * 1024;
 constexpr int32_t SCALE_K_SIZE = 32;
 constexpr int32_t MIN_MX_PARAM = 257;// scaleFactorA = 1, scaleFactorB = 1
 constexpr int32_t MX_L1_BUFFER_NUM = 4;// A/B/scaleA/scaleB buffer
-constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127;// scaleFactorA/scaleFactorB is 7 bit, max vaule is 127
+constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127;// scaleFactorA/scaleFactorB is 7 bit, max value is 127
 }
 
 enum class L1TilingType : uint8_t {
-- 
Gitee
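
A minimal host-side sketch (not part of either patch) of how the MatmulConfig fields documented in PATCH 1/2 might be populated. It assumes lib/matmul/matmul_config.h is on the include path, compiles standalone from host code, and declares these types at global scope; the values chosen are purely illustrative, not a recommended tiling configuration.

// Illustrative sketch only: exercises the fields and enums documented in this patch.
// Assumption: "lib/matmul/matmul_config.h" is reachable from the include path.
#include <cstdio>

#include "lib/matmul/matmul_config.h"

int main()
{
    MatmulConfig cfg{};                               // value-initialize; in-class defaults (e.g. enUnitFlag) are kept
    cfg.doNorm = true;                                // use the Norm template
    cfg.doBasicBlock = false;                         // basic MNK below is only meaningful in basic-block mode
    cfg.basicM = 128;
    cfg.basicN = 128;
    cfg.basicK = 64;
    cfg.iterateOrder = IterateOrder::ORDER_M;         // offset along the M-axis first, then the N-axis
    cfg.scheduleType = ScheduleType::INNER_PRODUCT;   // k-loop data transfer mode (default type)
    cfg.batchMode = BatchMode::NONE;                  // not a BatchMatmul scenario
    cfg.iterateMode = IterateMode::ITERATE_MODE_ALL;  // only the IterateAll interface will be called

    std::printf("basicM=%u basicN=%u basicK=%u enUnitFlag=%d sharedCO1BufferSize=%u\n",
                cfg.basicM, cfg.basicN, cfg.basicK,
                static_cast<int>(cfg.enUnitFlag), cfg.sharedCO1BufferSize);
    return 0;
}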