diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h index ab7711fde95ff1cfb5d09f5b5a5e04c5684113ba..cdaba7dd7efa102b1ec15f9ec30502ffef51b679 100644 --- a/lib/matmul/matmul_config.h +++ b/lib/matmul/matmul_config.h @@ -150,45 +150,75 @@ enum class MatmulConfigMode { CONFIG_IBSHARE }; +/** + * @struct MatmulShapeParams + * @brief tiling shape information + * + * Used for constantization of tiling shape information + */ struct MatmulShapeParams { - uint32_t singleCoreM; - uint32_t singleCoreN; - uint32_t singleCoreK; - uint32_t basicM; - uint32_t basicN; - uint32_t basicK; + uint32_t singleCoreM; // size of M axis shape within a single core, in unit of element + uint32_t singleCoreN; // size of N axis shape within a single core, in unit of element + uint32_t singleCoreK; // size of K axis shape within a single core, in unit of element + uint32_t basicM; // size of M axis shape for Matmul calculation, in unit of element + uint32_t basicN; // size of N axis shape for Matmul calculation, in unit of element + uint32_t basicK; // size of K axis shape for Matmul calculation, in unit of element }; +/** + * @struct MatmulQuantParams + * @brief quant config + * + * Scenario of quant: A is float16_t and B is int8_t + */ struct MatmulQuantParams { - bool isPerTensor; - bool hasAntiQuantOffset; + bool isPerTensor; // whether B quant is per tensor + bool hasAntiQuantOffset; // whether B uses offset coefficients }; +/** + * @struct MatmulBatchParams + * @brief batch matmul config + * + * Enable when batchMode is not BatchMode::None + */ struct MatmulBatchParams { - bool isNBatch; - BatchMode batchMode; - bool isBiasBatch = true; - bool isNBatchOut = false; + bool isNBatch; // whether to invoke IterNBatch to achieve multiple batch inputs and outputs + BatchMode batchMode; // relationship between the total size of A/B and the size of L1 Buffer + bool isBiasBatch = true; // whether the size of the bias includes the batch axis + bool isNBatchOut = false; // whether to cache multiple
batch outputs to copy out together }; +/** + * @struct MatmulFuncParams + * @brief matmul function config + * + * Matmul common feature config + */ struct MatmulFuncParams { - bool intrinsicsCheck; - bool enVecND2NZ; - bool enableDoubleCache; - bool enableL1CacheUB; - uint32_t doMTE2Preload; - IterateOrder iterateOrder; - ScheduleType scheduleType; - bool enableReuse = true; - bool enableUBReuse; - bool isPartialOutput = false; - bool isA2B2Shared = false; - bool isEnableChannelSplit = false; - bool enableKdimReorderLoad = false; + bool intrinsicsCheck; // enable cyclic DataCopy from GM to L1 when the element num of A/B's inner axis >= 65535 + bool enVecND2NZ; // enable using the Vector instruction to transform ND to NZ + bool enableDoubleCache; // enable double data cached in L1 for IBShare + bool enableL1CacheUB; // enable using L1 to cache data for the UB buffer + uint32_t doMTE2Preload; // enable the preload function for M/N direction to reduce the MTE2 gap + IterateOrder iterateOrder; // the loop iteration order of M or N direction + ScheduleType scheduleType; // the type of Matmul data copy + bool enableReuse = true; // enable directly passing the calculated data to the dataPtr in the callback function + // set by the SetSelfDefineData function + bool enableUBReuse; // enable reuse of UB buffer to cache double data for two iterations + bool isPartialOutput = false; // when enabled, the K axis does not use atomic add + bool isA2B2Shared = false; // enable all Matmul instances to share the double buffer feature of A2 and B2 + bool isEnableChannelSplit = false; // enable C matrix split from [x]*[m*n] to [2x]*[m*(n/2)] + // when C's DataType is float, Position is GM and CubeFormat is NZ + bool enableKdimReorderLoad = false; // enable loading K axis data by peak-shifting }; +/** + * @struct MatmulBiasParams + * @brief matmul bias config + */ struct MatmulBiasParams { - bool enableSetBias = true; + bool enableSetBias = true; // enable SetBias function }; struct MatrixOffset { diff --git
a/lib/matmul/matmul_tiling.h b/lib/matmul/matmul_tiling.h index fdae64893eeffd5a5479ac7ab05e6291b792b2e2..b6de3af98b256c1ea51009728ff0fcadafda3dee 100644 --- a/lib/matmul/matmul_tiling.h +++ b/lib/matmul/matmul_tiling.h @@ -25,7 +25,10 @@ #include "tiling/platform/platform_ascendc.h" namespace matmul_tiling { -// single core matmul tiling +/** + * @class MatmulApiTiling + * @brief single core matmul tiling + */ class MatmulApiTiling : public MatmulApiTilingBase { public: MatmulApiTiling() {}; @@ -33,7 +36,19 @@ public: : MatmulApiTilingBase(ascendcPlatform){}; explicit MatmulApiTiling(const PlatformInfo& platform) : MatmulApiTilingBase(platform) {}; ~MatmulApiTiling() override = default; + + /** + * @brief Get calculated tiling information + * @param [out] tiling the structure to store the tiling result defined on the Host side + * @return return 0 if success, else return -1 + */ int64_t GetTiling(optiling::TCubeTiling &tiling) override; + + /** + * @brief Get calculated tiling information + * @param [out] tiling the structure to store the tiling result defined on the Kernel side + * @return return 0 if success, else return -1 + */ int64_t GetTiling(TCubeTiling &tiling) override; protected: @@ -42,7 +57,19 @@ protected: } // namespace matmul_tiling extern "C" { +/** + * @brief After invoking the GetTiling function, obtain the used size of L1/UB/L0C buffer based on TCubeTiling + * @param [in] tiling tiling information defined on the Host side + * @param [out] bufSize the structure to store the used size of L1/UB/L0C buffer + * @return return 0 if success, else return -1 + */ int32_t MatmulGetTmpBufSize(optiling::TCubeTiling &tiling, matmul_tiling::SysTilingTempBufSize &bufSize); +/** + * @brief After invoking the GetTiling function, obtain the used size of L1/UB/L0C buffer based on TCubeTiling + * @param [in] tiling tiling information defined on the Kernel side + * @param [out] bufSize the structure to store the used size of L1/UB/L0C buffer + * @return return 0 if success, else
return -1 + */ int32_t MatmulGetTmpBufSizeV2(TCubeTiling &tiling, matmul_tiling::SysTilingTempBufSize &bufSize); };