diff --git a/impl/reduce/mean/mean_tiling.cpp b/impl/reduce/mean/mean_tiling.cpp index 6838fb099b180eeb5f515a295fef925b59c40232..d57f0a30328efe31d95753e722f956f735b66488 100644 --- a/impl/reduce/mean/mean_tiling.cpp +++ b/impl/reduce/mean/mean_tiling.cpp @@ -25,7 +25,7 @@ inline void CheckMeanHostParams(const uint32_t n, const uint32_t srcTypeSize, ASCENDC_HOST_ASSERT( ((srcTypeSize == 2U && accTypeSize == 2U) || (srcTypeSize == 4U && accTypeSize == 4U) || - (srcTypeSize == 2U && srcTypeSize == 4U)), + (srcTypeSize == 2U && accTypeSize == 4U)), return, "[Mean][GetMeanMaxMinTmpSize] The parameter (srcTypeSize, accTypeSize) is (%u, %u), expected is (2, 2)/(4, 4)/(2, 4).", srcTypeSize, accTypeSize diff --git a/lib/matmul/bmm_tiling.h b/lib/matmul/bmm_tiling.h index 20ab4b01089ea99d705a9b8fdf0d416e198a4566..ab544668f1182fc9be33551bd873f95029baff5f 100644 --- a/lib/matmul/bmm_tiling.h +++ b/lib/matmul/bmm_tiling.h @@ -21,19 +21,90 @@ #include "tiling/platform/platform_ascendc.h" namespace matmul_tiling { -// matmu tiling for multi core +/** + * @class MultiCoreMatmulTiling + * @brief matmul tiling for multi core + * + * Provide a set of Matmul Tiling APIs for multi-core computing scenarios, + * enabling users to easily obtain the Tiling parameters required for Matmul kernel calculations. + * Users only need to pass information such as the Position, Format, and Dtype of matrices A/B/C, + * and by calling the API interface, they can get the relevant parameters from the + * TCubeTiling structure in the Matmul Init API. +*/ class MultiCoreMatmulTiling : public MatmulApiTilingBase { public: + /** + * @brief no-parameter constructor. + */ MultiCoreMatmulTiling() {}; + /** + * @brief Use the PlatformAscendC class to pass information. + * @param ascendcPlatform : hardware platform information. 
+ */ explicit MultiCoreMatmulTiling(const platform_ascendc::PlatformAscendC &ascendcPlatform) : MatmulApiTilingBase(ascendcPlatform){}; + /** + * @brief Use the PlatformInfo class to pass information. + * Input hardware version and memory size provided by each hardware unit in the AI Core. + * @param platform : platform information. + */ explicit MultiCoreMatmulTiling(const PlatformInfo &platform) : MatmulApiTilingBase(platform) {}; - int32_t SetDim(int32_t dim); // Sets the allowed block dim. - int32_t SetShape(int32_t m, int32_t n, int32_t k) override; // Sets the size of the original input + /** + * @brief Set the allowed block dim. + * @param dim : core number. + */ + int32_t SetDim(int32_t dim); + /** + * @brief Set the dimensions m, n, and k for the Matmul computation, + which can represent either the original full matrix or a submatrix, measured in elements. + The matrix multiplication of these dimensions can be performed using a single core or multiple cores. + * @param m : Set the M-direction size calculated by Matmul. The unit is element. + * @param n : Set the N-direction size calculated by Matmul. The unit is element. + * @param k : Set the K-direction size calculated by Matmul. The unit is element. + */ + int32_t SetShape(int32_t m, int32_t n, int32_t k) override; // Set the size of the original input + /** + * @brief Set the shape for single-core computation of Matmul as singleMIn, singleNIn, and singleKIn, + with units in elements. + * @param singleMIn : The size of singleMIn is set in units of elements, with a default value of -1. + A value of -1 indicates that no specific singleMIn is set, and this value is calculated by the tiling + function itself. + * @param singleNIn : The size of singleNIn is set in units of elements, with a default value of -1. + A value of -1 indicates that no specific singleNIn is set, and this value is calculated by the tiling + function itself. 
+ * @param singleKIn : The size of singleKIn is set in units of elements, with a default value of -1. + A value of -1 indicates that no specific singleKIn is set, and this value is calculated by the tiling + function itself. + */ virtual int32_t SetSingleShape(int32_t singleMIn = -1, int32_t singleNIn = -1, int32_t singleKIn = -1); + /** + * @brief Get Tiling parameters. + * @param tiling : The Tiling structure stores the final tiling results. The TCubeTiling structure with the + optiling namespace, which defines the Matmul TilingData on the Host side. + */ int64_t GetTiling(optiling::TCubeTiling &tiling) override; + /** + * @brief Get Tiling parameters. + * @param tiling : The Tiling structure stores the final tiling results. The TCubeTiling structure without the + optiling namespace, i.e. the Matmul TilingData defined on the Kernel side. + */ int64_t GetTiling(TCubeTiling &tiling) override; + /** + * @brief Set the maximum and minimum values for singleCoreM/singleCoreN/singleCoreK. + * @param maxM : Set the maximum value of singleCoreM to -1, which indicates that no specific maximum value + for singleCoreM is set. This value is calculated by the Tiling function itself. + * @param maxN : Set the maximum value of singleCoreN to -1, which indicates that no specific maximum value + for singleCoreN is set. This value is calculated by the Tiling function itself. + * @param maxK : Set the maximum value of singleCoreK to -1, which indicates that no specific maximum value + for singleCoreK is set. This value is calculated by the Tiling function itself. + * @param minM : Set the minimum value of singleCoreM to -1, which indicates that no specific minimum value + for singleCoreM is set. This value is calculated by the Tiling function itself. + * @param minN : Set the minimum value of singleCoreN to -1, which indicates that no specific minimum value + for singleCoreN is set. This value is calculated by the Tiling function itself. 
+ * @param minK : Set the minimum value of singleCoreK to -1, which indicates that no specific minimum value + for singleCoreK is set. This value is calculated by the Tiling function itself. + */ virtual int32_t SetSingleRange(int32_t maxM = -1, int32_t maxN = -1, int32_t maxK = -1, int32_t minM = -1, int32_t minN = -1, int32_t minK = -1) { @@ -45,12 +116,39 @@ public: this->minSingleK = minK; return 0; } + /** + * @brief When performing multi-core splitting, set the alignment value for singleCoreM/singleCoreN/singleCoreK. + For example, setting the alignment value for singleCoreM to 64 (in units of elements) ensures that the split + singleCoreM is a multiple of 64. + * @param alignM : The alignment value for singleCoreM. If -1 or 0 is passed, it indicates that the specified + alignment value for singleCoreM is not set, and this value will be calculated by the Tiling function itself. + * @param alignN : The alignment value for singleCoreN. If -1 or 0 is passed, it indicates that the specified + alignment value for singleCoreN is not set, and this value will be calculated by the Tiling function itself. + * @param alignK : The alignment value for singleCoreK. If -1 or 0 is passed, it indicates that the specified + alignment value for singleCoreK is not set, and this value will be calculated by the Tiling function itself. + */ int32_t SetAlignSplit(int32_t alignM, int32_t alignN, int32_t alignK); - // Get the amount of data processed at a time. + /** + * @brief Get the calculated singleCoreM/singleCoreN/singleCoreK. + * @param shapeM : Get the singleCoreM value calculated from multi-core tiling. + * @param shapeN : Get the singleCoreN value calculated from multi-core tiling. + * @param shapeK : Get the singleCoreK value calculated from multi-core tiling. + */ int32_t GetSingleShape(int32_t &shapeM, int32_t &shapeN, int32_t &shapeK); - // Get the BlockDim used after multi core tiling. - // It is carried by users to the kernel to control the service logic in the kernel. 
+ /** + * @brief Get the BlockDim used after multi core tiling. It is carried by users to the kernel to control + the service logic in the kernel. + * @param dim : Get the number of cores required for computation, dim = mDim * nDim. + * @param mDim : Determine the number of cores required for the M direction during computation. + * @param nDim : Determine the number of cores required for the N direction during computation. + */ int32_t GetCoreNum(int32_t &dim, int32_t &mDim, int32_t &nDim); + /** + * @brief In a multi-core scenario, this interface enables the split of the K-axis. If this interface + is not called, the K-axis will not be split by default. It should be used before calling the + GetTiling interface. + * @param flag : whether to enable the K-axis cutting. + */ void EnableMultiCoreSplitK(bool flag) { enableSplitK_ = flag;