From c45272326e22bc2dac691becf7e798345871f938 Mon Sep 17 00:00:00 2001 From: mingling Date: Sat, 21 Jun 2025 16:49:29 +0800 Subject: [PATCH] add tiling pointer --- .../MatmulCustomMultiCoreNewTiling/README.md | 14 ++ .../op_host/matmul_custom.cpp | 108 ++++++++++++ .../op_host/matmul_custom_tiling.h | 25 +++ .../op_kernel/matmul_custom.cpp | 162 ++++++++++++++++++ .../op_kernel/matmul_custom_tiling.h | 21 +++ 5 files changed, 330 insertions(+) create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md new file mode 100644 index 000000000..75c0b140e --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md @@ -0,0 +1,14 @@ +## 算子原型 + + + + + + + + + + + + +
<table>
<tr><th align="center">算子类型(OpType)</th><th colspan="4" align="center">MatmulCustom</th></tr>
<tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
<tr><td align="center">a</td><td align="center">1024 * 256</td><td align="center">float16</td><td align="center">ND</td></tr>
<tr><td align="center">b</td><td align="center">256 * 640</td><td align="center">float16</td><td align="center">ND</td></tr>
<tr><td align="center">bias</td><td align="center">640</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">算子输出</td><td align="center">c</td><td align="center">1024 * 640</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">核函数名</td><td colspan="4" align="center">matmul_custom</td></tr>
</table>
\ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp new file mode 100644 index 000000000..eea77fe5b --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp @@ -0,0 +1,108 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "../op_kernel/matmul_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" +using namespace matmul_tiling; + +namespace optiling { +/** + * @brief Generate matmul tiling. + * @param context: Tiling kernel context. + * @retval Status of GetTiling (GRAPH_SUCCESS or GRAPH_FAILED). + */ +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + auto shape_a = context->GetInputTensor(0)->GetOriginShape(); + auto shape_b = context->GetInputTensor(1)->GetOriginShape(); + int32_t M = shape_a.GetDim(0); + int32_t N = shape_b.GetDim(1); + int32_t K = shape_a.GetDim(1); + int32_t baseM = 128; + int32_t baseN = 128; + int32_t singleCoreM = 512; + int32_t singleCoreN = 640; + MultiCoreMatmulTiling cubeTiling(ascendcPlatform); + cubeTiling.SetDim(ascendcPlatform.GetCoreNumAiv()); // Set the number of cores that participate in multi-core computaion is 48. 
+ cubeTiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetShape(M, N, K); + cubeTiling.SetOrgShape(M, N, K); + if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) { + cubeTiling.SetSingleShape(singleCoreM, singleCoreN, -1); // Set the fixed singleCoreM=512, singleCoreN=640. + cubeTiling.SetFixSplit(baseM, baseN, -1); // Set the fixed baseM=128, baseN=128. + } + cubeTiling.SetBias(true); + cubeTiling.SetBufferSpace(-1, -1, -1); + MatmulCustomTilingData *tiling = context->GetTilingData(); + if (cubeTiling.GetTiling(tiling->cubeTilingData) == -1) { + return ge::GRAPH_FAILED; + } + + uint64_t localMemSize; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, localMemSize); + tiling->set_localMemSize(localMemSize); + + if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) { + context->SetBlockDim(2); + context->SetTilingKey(2); + } else { + /* SetBlockDim here refers to the number of cube cores, so for separated arch(AIC:AIV=1:2), + vector cores number is set 48 by SetDim, cube core number need to be set 24 here.*/ + context->SetBlockDim(24); + context->SetTilingKey(1); + } + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; + + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ops { +class MatmulCustom : 
public OpDef { +public: + explicit MatmulCustom(const char *name) : OpDef(name) + { + this->Input("a") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("b") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("bias") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}); + this->Output("c") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}); + + this->AICore() + .SetTiling(optiling::TilingFunc) + .AddConfig("ascend310p") + .AddConfig("ascend910b"); + } +}; + +OP_ADD(MatmulCustom); +} // namespace ops diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h new file mode 100644 index 000000000..fd898cba9 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h @@ -0,0 +1,25 @@ +/** + * @file matmul_custom_tiling.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef MATMUL_TILING_H +#define MATMUL_TILING_H + +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(MatmulCustomTilingData) +TILING_DATA_FIELD_DEF(uint64_t, localMemSize); +TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulCustomTilingData) +} // namespace optiling + +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp new file mode 100644 index 000000000..d0d86f000 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp @@ -0,0 +1,162 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "matmul_custom_tiling.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +template class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + uint64_t memSize, const TCubeTiling &tiling); + template __aicore__ inline void Process(AscendC::TPipe *pipe); + + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, + int32_t &offsetC, int32_t &offsetBias); + + Matmul, MatmulType, + MatmulType, MatmulType> + matmulObj; + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + uint64_t localMemSize = 0; + int32_t mIdx = 0; + int32_t nIdx = 0; +}; + +/** + * @brief Set matmul input and output gm addr of current core. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tiling: matmul tiling data. 
+ * @retval None + */ +template +__aicore__ inline void MatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, + GM_ADDR workspace, uint64_t memSize, const TCubeTiling &tiling) +{ + this->tiling = tiling; + this->localMemSize = memSize; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); // Calculate the gm offset based on the blockidx. + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka, tiling.Kb); + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +/** + * @brief Main process of matmul calculation + * @param pipe: Global memory and sync management TPipe object. + * @retval None + */ +template +template +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe *pipe) +{ + if (GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + // Set temp UB space if the setTmpSpace is true. + if constexpr (setTmpSpace) { + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; + pipe->InitBuffer(tmpMMFormatUb, localMemSize); + mmformatUb = tmpMMFormatUb.Get(localMemSize); + matmulObj.SetLocalWorkspace(mmformatUb); + } + auto tailM = tiling.M - mIdx * tiling.singleCoreM; + auto tailN = tiling.N - nIdx * tiling.singleCoreN; + auto mUse = tailM > tiling.singleCoreM ? tiling.singleCoreM : (tailM > 0 ? tailM : tiling.M); + auto nUse = tailN > tiling.singleCoreN ? tiling.singleCoreN : (tailN > 0 ? 
tailN : tiling.N); + matmulObj.SetTail(mUse, nUse, -1); + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + matmulObj.SetBias(biasGlobal); + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +/** + * @brief Calculate the gm offset based on the blockidx. + * @param blockIdx: Current Core blockidx. + * @param tiling: Matmul tiling data. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. + * @retval None + */ +template +__aicore__ inline void +MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, + int32_t &offsetB, int32_t &offsetC, int32_t &offsetBias) +{ + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + mIdx = blockIdx % mSingleBlocks; + nIdx = blockIdx / mSingleBlocks; + + offsetA = mIdx * tiling.Ka * tiling.singleCoreM; + offsetB = nIdx * tiling.singleCoreN; + offsetC = mIdx * tiling.N * tiling.singleCoreM + nIdx * tiling.singleCoreN; + offsetBias = nIdx * tiling.singleCoreN; +} + +/** + * @brief matmul kernel function entry + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tiling: Tiling data addr. + * @retval None + */ +extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(MatmulCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tilingData.cubeTilingData); // Initialize the matmul object. 
+ matmulKernel.Init(a, b, bias, c, workspace, tilingData.localMemSize, tilingData.cubeTilingData); + if (TILING_KEY_IS(1)) { + matmulKernel.Process(&pipe); + } else if (TILING_KEY_IS(2)) { + matmulKernel.Process(&pipe); + } +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h new file mode 100644 index 000000000..8f32f3418 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h @@ -0,0 +1,21 @@ +/** + * @file matmul_custom_tiling.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef MATMUL_TILING_H +#define MATMUL_TILING_H + +#include +#include "kernel_tiling/kernel_tiling.h" + +struct MatmulCustomTilingData { + uint64_t localMemSize; + TCubeTiling cubeTilingData; +}; + +#endif // MATMUL_TILING_H \ No newline at end of file -- Gitee