From c45272326e22bc2dac691becf7e798345871f938 Mon Sep 17 00:00:00 2001 From: mingling Date: Sat, 21 Jun 2025 16:49:29 +0800 Subject: [PATCH] add tiling pointer --- .../MatmulCustomMultiCoreNewTiling/README.md | 14 ++ .../op_host/matmul_custom.cpp | 108 ++++++++++++ .../op_host/matmul_custom_tiling.h | 25 +++ .../op_kernel/matmul_custom.cpp | 162 ++++++++++++++++++ .../op_kernel/matmul_custom_tiling.h | 21 +++ 5 files changed, 330 insertions(+) create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp create mode 100644 operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md new file mode 100644 index 000000000..75c0b140e --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md @@ -0,0 +1,14 @@ +## 算子原型 + + + + + + + + + + + + +
<table>
<tr><th align="center">算子类型(OpType)</th><th colspan="4" align="center">MatmulCustom</th></tr>
<tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
<tr><td align="center">a</td><td align="center">1024 * 256</td><td align="center">float16</td><td align="center">ND</td></tr>
<tr><td align="center">b</td><td align="center">256 * 640</td><td align="center">float16</td><td align="center">ND</td></tr>
<tr><td align="center">bias</td><td align="center">640</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">算子输出</td><td align="center">c</td><td align="center">1024 * 640</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">核函数名</td><td colspan="4" align="center">matmul_custom</td></tr>
</table>
\ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp new file mode 100644 index 000000000..eea77fe5b --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp @@ -0,0 +1,108 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "../op_kernel/matmul_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" +using namespace matmul_tiling; + +namespace optiling { +/** + * @brief Generate matmul tiling. + * @param context: Tiling kernel context. + * @retval Status of GetTiling (GRAPH_SUCCESS or GRAPH_FAILED). + */ +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + auto shape_a = context->GetInputTensor(0)->GetOriginShape(); + auto shape_b = context->GetInputTensor(1)->GetOriginShape(); + int32_t M = shape_a.GetDim(0); + int32_t N = shape_b.GetDim(1); + int32_t K = shape_a.GetDim(1); + int32_t baseM = 128; + int32_t baseN = 128; + int32_t singleCoreM = 512; + int32_t singleCoreN = 640; + MultiCoreMatmulTiling cubeTiling(ascendcPlatform); + cubeTiling.SetDim(ascendcPlatform.GetCoreNumAiv()); // Set the number of cores that participate in multi-core computaion is 48. 
+ cubeTiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetShape(M, N, K); + cubeTiling.SetOrgShape(M, N, K); + if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) { + cubeTiling.SetSingleShape(singleCoreM, singleCoreN, -1); // Set the fixed singleCoreM=512, singleCoreN=640. + cubeTiling.SetFixSplit(baseM, baseN, -1); // Set the fixed baseM=128, baseN=128. + } + cubeTiling.SetBias(true); + cubeTiling.SetBufferSpace(-1, -1, -1); + MatmulCustomTilingData *tiling = context->GetTilingData(); + if (cubeTiling.GetTiling(tiling->cubeTilingData) == -1) { + return ge::GRAPH_FAILED; + } + + uint64_t localMemSize; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, localMemSize); + tiling->set_localMemSize(localMemSize); + + if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) { + context->SetBlockDim(2); + context->SetTilingKey(2); + } else { + /* SetBlockDim here refers to the number of cube cores, so for separated arch(AIC:AIV=1:2), + vector cores number is set 48 by SetDim, cube core number need to be set 24 here.*/ + context->SetBlockDim(24); + context->SetTilingKey(1); + } + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; + + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ops { +class MatmulCustom : 
public OpDef { +public: + explicit MatmulCustom(const char *name) : OpDef(name) + { + this->Input("a") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("b") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("bias") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}); + this->Output("c") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}); + + this->AICore() + .SetTiling(optiling::TilingFunc) + .AddConfig("ascend310p") + .AddConfig("ascend910b"); + } +}; + +OP_ADD(MatmulCustom); +} // namespace ops diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h new file mode 100644 index 000000000..fd898cba9 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h @@ -0,0 +1,25 @@ +/** + * @file matmul_custom_tiling.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef MATMUL_TILING_H +#define MATMUL_TILING_H + +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(MatmulCustomTilingData) +TILING_DATA_FIELD_DEF(uint64_t, localMemSize); +TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulCustomTilingData) +} // namespace optiling + +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp new file mode 100644 index 000000000..d0d86f000 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp @@ -0,0 +1,162 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "matmul_custom_tiling.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +template class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + uint64_t memSize, const TCubeTiling &tiling); + template __aicore__ inline void Process(AscendC::TPipe *pipe); + + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, + int32_t &offsetC, int32_t &offsetBias); + + Matmul, MatmulType, + MatmulType, MatmulType> + matmulObj; + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + uint64_t localMemSize = 0; + int32_t mIdx = 0; + int32_t nIdx = 0; +}; + +/** + * @brief Set matmul input and output gm addr of current core. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tiling: matmul tiling data. 
+ * @retval None + */ +template +__aicore__ inline void MatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, + GM_ADDR workspace, uint64_t memSize, const TCubeTiling &tiling) +{ + this->tiling = tiling; + this->localMemSize = memSize; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); // Calculate the gm offset based on the blockidx. + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka, tiling.Kb); + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +/** + * @brief Main process of matmul calculation + * @param pipe: Global memory and sync management TPipe object. + * @retval None + */ +template +template +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe *pipe) +{ + if (GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + // Set temp UB space if the setTmpSpace is true. + if constexpr (setTmpSpace) { + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; + pipe->InitBuffer(tmpMMFormatUb, localMemSize); + mmformatUb = tmpMMFormatUb.Get(localMemSize); + matmulObj.SetLocalWorkspace(mmformatUb); + } + auto tailM = tiling.M - mIdx * tiling.singleCoreM; + auto tailN = tiling.N - nIdx * tiling.singleCoreN; + auto mUse = tailM > tiling.singleCoreM ? tiling.singleCoreM : (tailM > 0 ? tailM : tiling.M); + auto nUse = tailN > tiling.singleCoreN ? tiling.singleCoreN : (tailN > 0 ? 
tailN : tiling.N); + matmulObj.SetTail(mUse, nUse, -1); + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + matmulObj.SetBias(biasGlobal); + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +/** + * @brief Calculate the gm offset based on the blockidx. + * @param blockIdx: Current Core blockidx. + * @param tiling: Matmul tiling data. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. + * @retval None + */ +template +__aicore__ inline void +MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, + int32_t &offsetB, int32_t &offsetC, int32_t &offsetBias) +{ + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + mIdx = blockIdx % mSingleBlocks; + nIdx = blockIdx / mSingleBlocks; + + offsetA = mIdx * tiling.Ka * tiling.singleCoreM; + offsetB = nIdx * tiling.singleCoreN; + offsetC = mIdx * tiling.N * tiling.singleCoreM + nIdx * tiling.singleCoreN; + offsetBias = nIdx * tiling.singleCoreN; +} + +/** + * @brief matmul kernel function entry + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tiling: Tiling data addr. + * @retval None + */ +extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(MatmulCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tilingData.cubeTilingData); // Initialize the matmul object. 
+ matmulKernel.Init(a, b, bias, c, workspace, tilingData.localMemSize, tilingData.cubeTilingData); + if (TILING_KEY_IS(1)) { + matmulKernel.Process(&pipe); + } else if (TILING_KEY_IS(2)) { + matmulKernel.Process(&pipe); + } +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h new file mode 100644 index 000000000..8f32f3418 --- /dev/null +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h @@ -0,0 +1,21 @@ +/** + * @file matmul_custom_tiling.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef MATMUL_TILING_H +#define MATMUL_TILING_H + +#include +#include "kernel_tiling/kernel_tiling.h" + +struct MatmulCustomTilingData { + uint64_t localMemSize; + TCubeTiling cubeTilingData; +}; + +#endif // MATMUL_TILING_H \ No newline at end of file -- Gitee