diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..75c0b140e0582dce7063977c2a9a836f268d404c
--- /dev/null
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/README.md
@@ -0,0 +1,14 @@
+## Operator Prototype
+
+| OpType | MatmulCustom |
+| ------ | ------------ |
+
+| Operator Inputs | name | shape      | data type | format |
+| --------------- | ---- | ---------- | --------- | ------ |
+|                 | a    | 1024 * 256 | float16   | ND     |
+|                 | b    | 256 * 640  | float16   | ND     |
+|                 | bias | 640        | float     | ND     |
+
+| Operator Output | name | shape      | data type | format |
+| --------------- | ---- | ---------- | --------- | ------ |
+|                 | c    | 1024 * 640 | float     | ND     |
+
+| Kernel Function Name | matmul_custom |
+| -------------------- | ------------- |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eea77fe5b14f77a14f010a5f8379079d62d60c58
--- /dev/null
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom.cpp
@@ -0,0 +1,108 @@
+/**
+ * @file matmul_custom.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "../op_kernel/matmul_custom_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "tiling/tiling_api.h"
+using namespace matmul_tiling;
+
+namespace optiling {
+/**
+ * @brief Generate matmul tiling.
+ * @param context: Tiling kernel context.
+ * @retval ge::GRAPH_SUCCESS on success, ge::GRAPH_FAILED if tiling generation fails.
+ */
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
+    auto shape_a = context->GetInputTensor(0)->GetOriginShape();
+    auto shape_b = context->GetInputTensor(1)->GetOriginShape();
+    int32_t M = shape_a.GetDim(0);
+    int32_t N = shape_b.GetDim(1);
+    int32_t K = shape_a.GetDim(1);
+    int32_t baseM = 128;
+    int32_t baseN = 128;
+    int32_t singleCoreM = 512;
+    int32_t singleCoreN = 640;
+    MultiCoreMatmulTiling cubeTiling(ascendcPlatform);
+    cubeTiling.SetDim(ascendcPlatform.GetCoreNumAiv()); // Set the number of cores participating in the multi-core computation (GetCoreNumAiv() returns 48 on the separated architecture).
+    cubeTiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16);
+    cubeTiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16);
+    cubeTiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetShape(M, N, K);
+    cubeTiling.SetOrgShape(M, N, K);
+    if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) {
+        cubeTiling.SetSingleShape(singleCoreM, singleCoreN, -1); // Set the fixed singleCoreM = 512, singleCoreN = 640.
+        cubeTiling.SetFixSplit(baseM, baseN, -1);                // Set the fixed baseM = 128, baseN = 128.
+    }
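+    /* With the sample shapes (M = 1024, N = 640), the fixed split above yields
+       Ceiling(1024 / 512) * Ceiling(640 / 640) = 2 * 1 = 2 single-core blocks,
+       which is why SetBlockDim(2) is used in the ASCEND310P branch below. */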
+    cubeTiling.SetBias(true);
+    cubeTiling.SetBufferSpace(-1, -1, -1);
+    MatmulCustomTilingData *tiling = context->GetTilingData<MatmulCustomTilingData>();
+    if (cubeTiling.GetTiling(tiling->cubeTilingData) == -1) {
+        return ge::GRAPH_FAILED;
+    }
+
+    uint64_t localMemSize;
+    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, localMemSize);
+    tiling->localMemSize = localMemSize;
+
+    if (ascendcPlatform.GetSocVersion() == platform_ascendc::SocVersion::ASCEND310P) {
+        context->SetBlockDim(2);
+        context->SetTilingKey(2);
+    } else {
+        /* SetBlockDim here refers to the number of cube cores, so for the separated architecture
+           (AIC:AIV = 1:2) the vector core number is set to 48 by SetDim and the cube core number
+           must be set to 24 here. */
+        context->SetBlockDim(24);
+        context->SetTilingKey(1);
+    }
+
+    // The struct returned by GetTilingData() points directly into the raw tiling buffer,
+    // so no separate serialization step is needed.
+    size_t userWorkspaceSize = 0;
+    size_t systemWorkspaceSize = static_cast<size_t>(ascendcPlatform.GetLibApiWorkSpaceSize());
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize;
+
+    return ge::GRAPH_SUCCESS;
+}
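+
+/* A minimal sketch of the core-count relationship assumed above (AIC:AIV = 1:2 on the
+ * separated architecture); the helper name is hypothetical and for illustration only:
+ *
+ *   constexpr uint32_t CubeCoreNum(uint32_t vectorCoreNum) { return vectorCoreNum / 2; }
+ *   static_assert(CubeCoreNum(48) == 24, "SetDim(48) pairs with SetBlockDim(24)");
+ */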
+} // namespace optiling
+
+namespace ops {
+class MatmulCustom : public OpDef {
+public:
+    explicit MatmulCustom(const char *name) : OpDef(name)
+    {
+        this->Input("a")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND});
+        this->Input("b")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND});
+        this->Input("bias")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND});
+        this->Output("c")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND});
+
+        this->AICore()
+            .SetTiling(optiling::TilingFunc)
+            .AddConfig("ascend310p")
+            .AddConfig("ascend910b");
+    }
+};
+
+OP_ADD(MatmulCustom);
+} // namespace ops
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd898cba924c38b9900881ac14adb2b52cb36ed1
--- /dev/null
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_host/matmul_custom_tiling.h
@@ -0,0 +1,25 @@
+/**
+ * @file matmul_custom_tiling.h
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef MATMUL_CUSTOM_TILING_H
+#define MATMUL_CUSTOM_TILING_H
+
+#include "register/tilingdata_base.h"
+#include "tiling/tiling_api.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(MatmulCustomTilingData)
+TILING_DATA_FIELD_DEF(uint64_t, localMemSize);
+TILING_DATA_FIELD_DEF_STRUCT(TCubeTiling, cubeTilingData);
+END_TILING_DATA_DEF;
+
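+// Register the tiling data definition for the MatmulCustom op type.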
+REGISTER_TILING_DATA_CLASS(MatmulCustom, MatmulCustomTilingData)
+} // namespace optiling
+
+#endif // MATMUL_CUSTOM_TILING_H
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0d86f00041b1261cd79f06ef8869171048c4f6f
--- /dev/null
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom.cpp
@@ -0,0 +1,162 @@
+/**
+ * @file matmul_custom.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+#include "lib/matmul_intf.h"
+#include "matmul_custom_tiling.h"
+
+using namespace matmul;
+
+__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b)
+{
+    return (a + b - 1) / b;
+}
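+// e.g. Ceiling(1024, 512) == 2 and Ceiling(640, 640) == 1 for the shapes used in this sample.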
+
+template <typename aType, typename bType, typename cType, typename biasType> class MatmulKernel {
+public:
+    __aicore__ inline MatmulKernel() {}
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
+                                uint64_t memSize, const TCubeTiling &tiling);
+    template <bool setTmpSpace = false> __aicore__ inline void Process(AscendC::TPipe *pipe);
+
+    __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB,
+                                      int32_t &offsetC, int32_t &offsetBias);
+
+    Matmul<MatmulType<AscendC::TPosition::GM, CubeFormat::ND, aType>,
+           MatmulType<AscendC::TPosition::GM, CubeFormat::ND, bType>,
+           MatmulType<AscendC::TPosition::GM, CubeFormat::ND, cType>,
+           MatmulType<AscendC::TPosition::GM, CubeFormat::ND, biasType>>
+        matmulObj;
+
+    AscendC::GlobalTensor<aType> aGlobal;
+    AscendC::GlobalTensor<bType> bGlobal;
+    AscendC::GlobalTensor<cType> cGlobal;
+    AscendC::GlobalTensor<biasType> biasGlobal;
+    TCubeTiling tiling;
+    uint64_t localMemSize = 0;
+    int32_t mIdx = 0;
+    int32_t nIdx = 0;
+};
+
+/**
+ * @brief Set the matmul input and output GM addresses for the current core.
+ * @param a: A matrix gm addr.
+ * @param b: B matrix gm addr.
+ * @param bias: Bias gm addr.
+ * @param c: C matrix gm addr.
+ * @param workspace: Temporary gm space addr required by matmul calc.
+ * @param memSize: Available UB size (localMemSize) computed by the host tiling.
+ * @param tiling: Matmul tiling data.
+ * @retval None
+ */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c,
+                                                                         GM_ADDR workspace, uint64_t memSize,
+                                                                         const TCubeTiling &tiling)
+{
+    this->tiling = tiling;
+    this->localMemSize = memSize;
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N);
+
+    int32_t offsetA = 0;
+    int32_t offsetB = 0;
+    int32_t offsetC = 0;
+    int32_t offsetBias = 0;
+    CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); // Calculate the gm offsets based on the blockIdx.
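+    // Re-base each GlobalTensor so that this core only addresses its own sub-block.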
+    aGlobal = aGlobal[offsetA];
+    bGlobal = bGlobal[offsetB];
+    cGlobal = cGlobal[offsetC];
+    biasGlobal = biasGlobal[offsetBias];
+    matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka, tiling.Kb);
+    if (GetSysWorkSpacePtr() == nullptr) {
+        return;
+    }
+}
+
+/**
+ * @brief Main process of matmul calculation
+ * @param pipe: Global memory and sync management TPipe object.
+ * @retval None
+ */
+template <typename aType, typename bType, typename cType, typename biasType>
+template <bool setTmpSpace>
+__aicore__ inline void MatmulKernel<aType, bType, cType, biasType>::Process(AscendC::TPipe *pipe)
+{
+    if (GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+    // Set temporary UB space if setTmpSpace is true.
+    if constexpr (setTmpSpace) {
+        AscendC::TBuf<> tmpMMFormatUb;
+        AscendC::LocalTensor<uint8_t> mmformatUb;
+        pipe->InitBuffer(tmpMMFormatUb, localMemSize);
+        mmformatUb = tmpMMFormatUb.Get<uint8_t>(localMemSize);
+        matmulObj.SetLocalWorkspace(mmformatUb);
+    }
+    auto tailM = tiling.M - mIdx * tiling.singleCoreM;
+    auto tailN = tiling.N - nIdx * tiling.singleCoreN;
+    auto mUse = tailM > tiling.singleCoreM ? tiling.singleCoreM : (tailM > 0 ? tailM : tiling.M);
+    auto nUse = tailN > tiling.singleCoreN ? tiling.singleCoreN : (tailN > 0 ? tailN : tiling.N);
+    matmulObj.SetTail(mUse, nUse, -1);
+    matmulObj.SetTensorA(aGlobal);
+    matmulObj.SetTensorB(bGlobal);
+    matmulObj.SetBias(biasGlobal);
+    matmulObj.IterateAll(cGlobal);
+    matmulObj.End();
+}
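+
+/* For contrast, a hypothetical step-wise variant of the IterateAll call above, using the
+ * matmul library's Iterate/GetTensorC pattern (a sketch only, not used by this kernel):
+ *
+ *   while (matmulObj.Iterate()) {
+ *       matmulObj.GetTensorC(cGlobal);
+ *   }
+ */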
+
+/**
+ * @brief Calculate the gm offsets based on the blockIdx.
+ * @param blockIdx: Current core blockIdx.
+ * @param tiling: Matmul tiling data.
+ * @param offsetA: Gm offset of A matrix.
+ * @param offsetB: Gm offset of B matrix.
+ * @param offsetC: Gm offset of C matrix.
+ * @param offsetBias: Gm offset of the bias vector.
+ * @retval None
+ */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void
+MatmulKernel<aType, bType, cType, biasType>::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA,
+                                                        int32_t &offsetB, int32_t &offsetC, int32_t &offsetBias)
+{
+    auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM);
+    mIdx = blockIdx % mSingleBlocks;
+    nIdx = blockIdx / mSingleBlocks;
+
+    offsetA = mIdx * tiling.Ka * tiling.singleCoreM;
+    offsetB = nIdx * tiling.singleCoreN;
+    offsetC = mIdx * tiling.N * tiling.singleCoreM + nIdx * tiling.singleCoreN;
+    offsetBias = nIdx * tiling.singleCoreN;
+}
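+
+/* Worked example of the offset math, assuming the fixed 310P split from the host tiling
+ * (M = 1024, N = 640, Ka = 256, singleCoreM = 512, singleCoreN = 640):
+ *
+ *   mSingleBlocks = Ceiling(1024, 512) = 2
+ *   blockIdx 0 -> mIdx = 0, nIdx = 0 -> offsetA = 0,         offsetC = 0
+ *   blockIdx 1 -> mIdx = 1, nIdx = 0 -> offsetA = 256 * 512, offsetC = 640 * 512
+ *
+ * i.e. each core reads one 512 x 256 row band of A and writes one 512 x 640 band of C.
+ */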
+
+/**
+ * @brief matmul kernel function entry
+ * @param a: A matrix gm addr.
+ * @param b: B matrix gm addr.
+ * @param bias: Bias gm addr.
+ * @param c: C matrix gm addr.
+ * @param workspace: Temporary gm space addr required by matmul calc.
+ * @param tiling: Tiling data addr.
+ * @retval None
+ */
+extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
+ GM_ADDR tiling)
+{
+ REGISTER_TILING_DEFAULT(MatmulCustomTilingData);
+ GET_TILING_DATA(tilingData, tiling);
+ MatmulKernel matmulKernel;
+ AscendC::TPipe pipe;
+ REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tilingData.cubeTilingData); // Initialize the matmul object.
+ matmulKernel.Init(a, b, bias, c, workspace, tilingData.localMemSize, tilingData.cubeTilingData);
+ if (TILING_KEY_IS(1)) {
+ matmulKernel.Process(&pipe);
+ } else if (TILING_KEY_IS(2)) {
+ matmulKernel.Process(&pipe);
+ }
+}
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f32f341869f8952b3ce6fdf6976aa60d863d6ad
--- /dev/null
+++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCoreNewTiling/op_kernel/matmul_custom_tiling.h
@@ -0,0 +1,21 @@
+/**
+ * @file matmul_custom_tiling.h
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef MATMUL_CUSTOM_KERNEL_TILING_H
+#define MATMUL_CUSTOM_KERNEL_TILING_H
+
+#include <cstdint>
+#include "kernel_tiling/kernel_tiling.h"
+
+struct MatmulCustomTilingData {
+    uint64_t localMemSize;
+    TCubeTiling cubeTilingData;
+};
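+// Note: this struct's field order and types must match what the host-side TilingFunc
+// writes into the raw tiling buffer through GetTilingData().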
+
+#endif // MATMUL_CUSTOM_KERNEL_TILING_H
\ No newline at end of file