From d9b9627203d85028c77e9a0082dbd4682e9cd684 Mon Sep 17 00:00:00 2001
From: "@mr-zql"
Date: Mon, 25 Aug 2025 11:50:21 +0800
Subject: [PATCH 1/2] add a comment

---
 lib/matmul/matmul_config.h | 211 ++++++++++++++++++++++---------------
 1 file changed, 126 insertions(+), 85 deletions(-)

diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h
index ab7711fd..d923857e 100644
--- a/lib/matmul/matmul_config.h
+++ b/lib/matmul/matmul_config.h
@@ -27,127 +27,168 @@ constexpr uint32_t SHARED_CO1_BUFFER_SIZE = 64 * 1024;
 
+/**
+ * @enum class CubeFormat
+ * @brief Physical data layout formats
+ */
 enum class CubeFormat {
-    ND = 0,
-    NZ,
-    ZN,
-    ZZ,
-    NN,
-    ND_ALIGN,
-    SCALAR,
-    VECTOR,
-    ROW_MAJOR = ND, // ND
-    COLUMN_MAJOR = 8, // DN
+    ND = 0, // Standard format, row-major
+    NZ, // Column-major between blocks, row-major within blocks
+    ZN, // Row-major between blocks, column-major within blocks
+    ZZ, // Row-major between blocks, row-major within blocks
+    NN, // Column-major between blocks, column-major within blocks
+    ND_ALIGN, // Aligned ND format; the output is aligned to 32-byte boundaries along the N dimension
+    SCALAR, // A single numerical value with no dimensions
+    VECTOR, // 1D array format for vector data
+    ROW_MAJOR = ND, // Row-major format (equivalent to ND format)
+    COLUMN_MAJOR = 8, // Column-major format (DN format)
 };
 
+/**
+ * @enum class LayoutMode
+ * @brief Matrix layout modes
+ */
 enum class LayoutMode {
-    NONE = 0,
-    BSNGD,
-    SBNGD,
-    BNGS1S2,
-    NORMAL
+    NONE = 0, // Default value, indicating that BatchMatmul is not used
+    BSNGD, // Reshaped format from the original BSH shape
+    SBNGD, // Reshaped format from the original SBH shape
+    BNGS1S2, // Typically the output of matrix multiplication for the first two layouts
+    NORMAL // General data format with BMNK layout
 };
 
+/**
+ * @enum class BatchMode
+ * @brief Memory scheduling modes for batch processing
+ *
+ * When the Layout type is NORMAL in a BatchMatmul scenario, this describes the relationship between the total
+ * size of the multi-batch data of the BatchMatmul input matrices A/B and the L1 buffer size.
+ */
 enum class BatchMode {
-    NONE = 0,
-    BATCH_LESS_THAN_L1,
-    BATCH_LARGE_THAN_L1,
-    SINGLE_LARGE_THAN_L1
+    NONE = 0, // No specific batch processing mode
+    BATCH_LESS_THAN_L1, // Total size of multi-batch data < L1 buffer size
+    BATCH_LARGE_THAN_L1, // Total size of multi-batch data > L1 buffer size
+    SINGLE_LARGE_THAN_L1 // Size of single-batch data > L1 buffer size
 };
 
+/**
+ * @enum class IterateOrder
+ * @brief The loop iteration order for Matmul matrix operations
+ */
 enum class IterateOrder {
-    ORDER_M = 0,
-    ORDER_N,
-    UNDEF,
+    ORDER_M = 0, // Offset along the M-axis first, then along the N-axis
+    ORDER_N, // Offset along the N-axis first, then along the M-axis
+    UNDEF, // Currently invalid
 };
 
+/**
+ * @enum class ScheduleType
+ * @brief Configures the Matmul data transfer mode
+ */
 enum class ScheduleType {
     INNER_PRODUCT = 0, // k loop, default type
     OUTER_PRODUCT, // m/n loop, depends on IterateOrder
 };
 
+/**
+ * @enum class MatmulVersion
+ * @brief Different implementation versions of the Matmul operator
+ */
 enum class MatmulVersion {
-    NORMAL = 0,
-    MULTI_DATA_LOAD,
-    BASIC_BLOCK,
-    IBSHARE_NORM,
+    NORMAL = 0, // Basic version
+    MULTI_DATA_LOAD, // Multiple data preloading
+    BASIC_BLOCK, // Block computation using basic blocks
+    IBSHARE_NORM, // Inner block sharing and normalization fusion optimization
 };
 
+/**
+ * @enum IterateMode
+ * @brief Iterate interface usage modes, used to reduce Matmul computation overhead
+ */
 enum IterateMode : uint8_t {
-    ITERATE_MODE_NORMAL = 0b00000001,
-    ITERATE_MODE_ALL = 0b00000010,
-    ITERATE_MODE_BATCH = 0b00000100,
-    ITERATE_MODE_N_BATCH = 0b00001000,
-    ITERATE_MODE_DEFAULT = 0b11111111,
+    ITERATE_MODE_NORMAL = 0b00000001, // Only the Iterate interface is called in Matmul computation
+    ITERATE_MODE_ALL = 0b00000010, // Only the IterateAll interface is called in Matmul computation
+    ITERATE_MODE_BATCH = 0b00000100, // Only the IterateBatch interface is called in Matmul computation
+    ITERATE_MODE_N_BATCH = 0b00001000, // Only the IterateNBatch interface is called in Matmul computation
+    ITERATE_MODE_DEFAULT = 0b11111111, // Default mode with all modes enabled
 };
 
+/**
+ * @struct MatmulConfig
+ * @brief Core configuration for the Matmul operator
+ *
+ * Integrates all control parameters for matrix multiplication computation.
+ */
 struct MatmulConfig {
-    bool doNorm;
-    bool doBasicBlock;
-    bool doMultiDataLoad;
+    bool doNorm; // true (default): enables the Norm template; false: disables it
+    bool doBasicBlock; // true: enables the BasicBlock template; false: disables it
+    bool doMultiDataLoad; // true: enables the MultiDataLoad (MDL) template; false: disables it
     // basic MNK could only be valid in basic block mode
-    uint32_t basicM;
-    uint32_t basicN;
-    uint32_t basicK;
-    bool intrinsicsCheck;
-    bool isNBatch;
-    bool enVecND2NZ;
+    uint32_t basicM; // M dimension size of the basic block
+    uint32_t basicN; // N dimension size of the basic block
+    uint32_t basicK; // K dimension size of the basic block
+    bool intrinsicsCheck; // true: enables loop-based data loading; false: disables it
+    bool isNBatch; // true: enables multi-batch; false (default): disables multi-batch
+    bool enVecND2NZ; // true: enables ND2NZ with vector instructions; false: disables it
     // only be valid in special basic block mode
-    bool doSpecialBasicBlock;
-    uint32_t doMTE2Preload;
-    uint32_t singleCoreM;
-    uint32_t singleCoreN;
-    uint32_t singleCoreK;
-    uint32_t stepM;
-    uint32_t stepN;
-    uint32_t baseMN;
-    uint32_t singleCoreMN;
-    bool enUnitFlag = true;
+    bool doSpecialBasicBlock; // true: enables the SpecialBasicBlock template; false: disables it
+    uint32_t doMTE2Preload; // 0: disabled (default), 1: enable M-dimension preload, 2: enable N-dimension preload
+    uint32_t singleCoreM; // Size of the M-axis shape within a single core, in units of elements
+    uint32_t singleCoreN; // Size of the N-axis shape within a single core, in units of elements
+    uint32_t singleCoreK; // Size of the K-axis shape within a single core, in units of elements
+    uint32_t stepM; // The multiple of baseM in the bufferM direction for the left matrix cached in A1
+    uint32_t stepN; // The multiple of baseN in the bufferN direction for the right matrix cached in B1
+    uint32_t baseMN; // Size of baseM * baseN
+    uint32_t singleCoreMN; // Size of singleCoreM * singleCoreN
+    bool enUnitFlag = true; // true: enables UnitFlag; false: disables UnitFlag
     // AntiQuant Param
-    bool isPerTensor;
-    bool hasAntiQuantOffset;
-    bool doIBShareNorm;
+    bool isPerTensor; // B matrix quantization is per-tensor when enabled (for A(half) and B(int8_t) input)
+    bool hasAntiQuantOffset; // Whether to use an offset coefficient for B matrix quantization (A: half, B: int8_t)
+    bool doIBShareNorm; // true: enables IBShare; false: disables IBShare
     // MDL support stepN == 2
-    bool doSpecialMDL;
-    bool enableInit = true;
-    BatchMode batchMode;
+    bool doSpecialMDL; // true: enables the SpecialMDL template; false: disables it
+    bool enableInit = true; // true: enables the Init function
+    BatchMode batchMode; // For BatchMatmul with NORMAL layout: relationship between the total size of multi-batch A/B data and the L1 buffer size
     // Add for process performance
-    bool enableEnd = true;
-    bool enableGetTensorC = true;
-    bool enableSetOrgShape = true;
-    bool enableSetBias = true;
-    bool enableSetTail = true;
-    bool enableQuantVector = true;
-    bool enableSetDefineData = true;
-    uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT;
-    bool enableReuse = true;
+    bool enableEnd = true; // true: the End function needs to be called during Matmul computation
+    bool enableGetTensorC = true; // true: the GetTensorC function needs to be called during Matmul computation
+    bool enableSetOrgShape = true; // true: the SetOrgShape function needs to be called during Matmul computation
+    bool enableSetBias = true; // true (default): enables bias computation
+    bool enableSetTail = true; // true (default): SetTail is called during Matmul computation
+    bool enableQuantVector = true; // true (default): SetQuantVector and SetQuantScalar are called during Matmul computation
+    bool enableSetDefineData = true; // Whether to set defined data
+    uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT; // No limit on calls to the Iterate interfaces (default)
+    bool enableReuse = true; // Whether to enable data reuse
     // enable UB reuse(ND2NZ & ND2NZ) for V200
-    bool enableUBReuse;
-    bool enableL1CacheUB;
+    bool enableUBReuse; // Whether to enable Unified Buffer reuse
+    bool enableL1CacheUB; // Whether to enable L1 Buffer caching for Unified Buffer compute blocks
     // for intra-block l0c add
-    bool intraBlockPartSum = false;
+    bool intraBlockPartSum = false; // false (default): disables accumulation of two AIV cores' results in the L0C Buffer
     // MDL support M/N db
-    IterateOrder iterateOrder { IterateOrder::UNDEF };
+    IterateOrder iterateOrder { IterateOrder::UNDEF }; // Loop iteration order; UNDEF (default) is currently invalid
-    ScheduleType scheduleType;
-    bool enableDoubleCache;
-    bool isBiasBatch = true;
-    bool enableStaticPadZeros = false;
-    bool isPartialOutput = false;
-    bool enableMixDualMaster = false;
-    bool isA2B2Shared = false;
-    bool isEnableChannelSplit = false;
-    bool enableKdimReorderLoad = false;
-    bool isCO1Shared = false;
-    uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE;
-    bool isNBatchOut = false;
+    ScheduleType scheduleType; // Configures the data transfer mode for Matmul
+    bool enableDoubleCache; // When IBShare is enabled, whether to cache two data blocks in the L1 Buffer simultaneously
+    bool isBiasBatch = true; // true (default): bias includes the Batch dimension
+    bool enableStaticPadZeros = false; // false (default): no automatic zero-fill during transfer; the user must pad manually
+    bool isPartialOutput = false; // false: disables the PartialOutput feature
+    bool enableMixDualMaster = false; // Whether to enable MixDualMaster (default: false)
+    bool isA2B2Shared = false; // Whether to enable global management for A2 and B2; false: disabled (default)
+    bool isEnableChannelSplit = false; // Whether to enable the channel_split functionality; false: disabled (default)
+    bool enableKdimReorderLoad = false; // false (default): disables staggered K-axis data loading
+    bool isCO1Shared = false; // Whether to enable CO1 memory sharing; false: disabled (default)
+    uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE; // Specifies the size of the shared CO1 buffer
+    bool isNBatchOut = false; // Whether the Nth batch has been output
 };
 
+/**
+ * @enum class MatmulConfigMode
+ * @brief Specifies the MatmulConfig template to be retrieved and modified
+ */
 enum class MatmulConfigMode {
-    CONFIG_NORM,
-    CONFIG_MDL,
-    CONFIG_SPECIALMDL,
-    CONFIG_IBSHARE
+    CONFIG_NORM, // Set the MatmulConfig default values to the Norm template
+    CONFIG_MDL, // Set the MatmulConfig default values to the MDL template
+    CONFIG_SPECIALMDL, // Set the MatmulConfig default values to the SpecialMDL template
+    CONFIG_IBSHARE // Set the MatmulConfig default values to the IBShare template
 };
 
 struct MatmulShapeParams {
-- 
Gitee

From 8281f975d2a868f7c66539df3fa0fdd2ea902e3e Mon Sep 17 00:00:00 2001
From: "@mr-zql"
Date: Mon, 8 Sep 2025 16:21:16 +0800
Subject: [PATCH 2/2] fix a word spelling error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 impl/matmul/tiling/matmul_constant_tiling_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/impl/matmul/tiling/matmul_constant_tiling_utils.h b/impl/matmul/tiling/matmul_constant_tiling_utils.h
index a1f4e64f..d1a46b1f 100644
--- a/impl/matmul/tiling/matmul_constant_tiling_utils.h
+++ b/impl/matmul/tiling/matmul_constant_tiling_utils.h
@@ -39,7 +39,7 @@ constexpr int32_t L1_SIZE = 512 * 1024;
 constexpr int32_t SCALE_K_SIZE = 32;
 constexpr int32_t MIN_MX_PARAM = 257;// scaleFactorA = 1, scaleFactorB = 1
 constexpr int32_t MX_L1_BUFFER_NUM = 4;// A/B/scaleA/scaleB buffer
-constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127;// scaleFactorA/scaleFactorB is 7 bit, max vaule is 127
+constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127;// scaleFactorA/scaleFactorB is 7 bit, max value is 127
 }
 
 enum class L1TilingType : uint8_t {
-- 
Gitee
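
A minimal host-side sketch (not part of either patch) of how the MatmulConfig fields documented in PATCH 1/2 might be populated. It assumes lib/matmul/matmul_config.h is on the include path, compiles standalone from host code, and declares these types at global scope; the values chosen are purely illustrative, not a recommended tiling configuration.

// Illustrative sketch only: exercises the fields and enums documented in this patch.
// Assumption: "lib/matmul/matmul_config.h" is reachable from the include path.
#include <cstdio>

#include "lib/matmul/matmul_config.h"

int main()
{
    MatmulConfig cfg{};                               // value-initialize; in-class defaults (e.g. enUnitFlag) are kept
    cfg.doNorm = true;                                // use the Norm template
    cfg.doBasicBlock = false;                         // basic MNK below is only meaningful in basic-block mode
    cfg.basicM = 128;
    cfg.basicN = 128;
    cfg.basicK = 64;
    cfg.iterateOrder = IterateOrder::ORDER_M;         // offset along the M-axis first, then the N-axis
    cfg.scheduleType = ScheduleType::INNER_PRODUCT;   // k-loop data transfer mode (default type)
    cfg.batchMode = BatchMode::NONE;                  // not a BatchMatmul scenario
    cfg.iterateMode = IterateMode::ITERATE_MODE_ALL;  // only the IterateAll interface will be called

    std::printf("basicM=%u basicN=%u basicK=%u enUnitFlag=%d sharedCO1BufferSize=%u\n",
                cfg.basicM, cfg.basicN, cfg.basicK,
                static_cast<int>(cfg.enUnitFlag), cfg.sharedCO1BufferSize);
    return 0;
}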