diff --git a/samples/xlogy.py b/samples/xlogy.py new file mode 100644 index 0000000000000000000000000000000000000000..e0d9dc04d3aa33d84d6b8dc8b2f7832e41331d28 --- /dev/null +++ b/samples/xlogy.py @@ -0,0 +1,19 @@ +import torch +import torch_npu +import tdb_C + +from compare_data import compare_numpy_arrays + +def xlogy_golden(x1, x2): + res = torch.xlogy(x1, x2) + return res + +def tdb_xlogy_test(): + x1 = torch.randn(2, 1024, 640).uniform_(-1, 1).float().npu() + x2 = torch.randn(2, 1024, 640).uniform_(-1, 1).float().npu() + res = tdb_C.xlogy(x1, x2) + + golden = xlogy_golden(x1.cpu(), x2.cpu()) + compare_numpy_arrays(res.cpu().numpy(), golden.numpy()) +if __name__ == "__main__": + tdb_xlogy_test() \ No newline at end of file diff --git a/tdb/common/ops/csrc/XlogyKernelNpu.cpp b/tdb/common/ops/csrc/XlogyKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..500afff80cd24b3133cc59beb93bfc205f940ee1 --- /dev/null +++ b/tdb/common/ops/csrc/XlogyKernelNpu.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2025 Td-Tech Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "OpApiCommon.h" +#include "functions.h" + +at::Tensor xlogy(const at::Tensor &x1, const at::Tensor &x2) +{ + at::Tensor out = at::empty(x1.sizes(), x1.options()); + EXEC_NPU_CMD(aclnnXlogy, x1, x2, out); + return out; +} diff --git a/tdb/common/ops/csrc/functions.h b/tdb/common/ops/csrc/functions.h index a7f459042f84dd9b37c88ff19877ebc514b8ffda..37437f30c29bacee7dbcfe11951c5c5cbdd25a86 100644 --- a/tdb/common/ops/csrc/functions.h +++ b/tdb/common/ops/csrc/functions.h @@ -33,5 +33,6 @@ at::Tensor mse_loss_grad(const at::Tensor &predict, const at::Tensor &label, con at::Tensor clip_by_value(const at::Tensor &x1, const at::Tensor &clip_value_min, const at::Tensor &clip_value_max); at::Tensor mse_loss(const at::Tensor &predict, const at::Tensor &label, int64_t reduction); at::Tensor swiglu(const at::Tensor &x, const at::Tensor &weight, const at::Tensor &bias, const float beta); +at::Tensor xlogy(const at::Tensor &x1, const at::Tensor &x2); #endif // __FUNCTIONS_H__ diff --git a/tdb/common/ops/csrc/pybind.cpp b/tdb/common/ops/csrc/pybind.cpp index bfbbc40ed041e3bf14da2dcd514ae2fb70c8b410..60969670319a869076ffe952a9b3b5491b300fa2 100644 --- a/tdb/common/ops/csrc/pybind.cpp +++ b/tdb/common/ops/csrc/pybind.cpp @@ -13,4 +13,5 @@ void init_common(pybind11::module &m) m.def("clip_by_value", &clip_by_value); m.def("mse_loss", &mse_loss); m.def("swiglu", &swiglu); + m.def("xlogy", &xlogy); } diff --git a/tdb/common/ops/kernels/operators/op_host/xlogy.cpp b/tdb/common/ops/kernels/operators/op_host/xlogy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da42457a1dd44b7823611bb4800c94461f4cfdf8 --- /dev/null +++ b/tdb/common/ops/kernels/operators/op_host/xlogy.cpp @@ -0,0 +1,93 @@ + +#include "xlogy_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { + +constexpr int32_t CORE_NUM = 1; + +constexpr int32_t X1_INDEX = 0; +constexpr int32_t BYTE_REPEAT = 256; + +constexpr int32_t BUF_COUNT = (2 + 1) * 1 + 1; + +static ge::graphStatus TilingFunc(gert::TilingContext* context) { + XlogyTilingData tiling; + uint32_t totalCoreNum = CORE_NUM; + uint64_t ubSizePlatForm; + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSizePlatForm); + + const gert::StorageShape* xShape = context->GetInputShape(X1_INDEX); + const ge::DataType dataType = context->GetInputDesc(X1_INDEX)->GetDataType(); + int32_t dTypeSize = ge::GetSizeByDataType(dataType); + int32_t perRepeatCount = BYTE_REPEAT / dTypeSize; + int64_t totalDataCount = xShape->GetStorageShape().GetShapeSize(); + + uint64_t ubMaxProcCount = ubSizePlatForm / BUF_COUNT / BYTE_REPEAT * perRepeatCount; + context->SetTilingKey(101); + if (dataType != ge::DT_FLOAT) { + context->SetTilingKey(201); + } + + tiling.set_ubMaxProcCount(ubMaxProcCount); + tiling.set_totalDataCount(totalDataCount); + tiling.set_loopCount(totalDataCount / ubMaxProcCount); + tiling.set_tailCount(totalDataCount % ubMaxProcCount); + + context->SetBlockDim(totalCoreNum); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + printf( + "==zf==Tiling Data, ubSizePlatForm:%lu, ubMaxProcCount:%lu, totalDataCount:%lu, loopCount:%lu, tailCount:%lu\n", + ubSizePlatForm, tiling.get_ubMaxProcCount(), tiling.get_totalDataCount(), tiling.get_loopCount(), + tiling.get_tailCount()); + + // 以下代码是ascendebug必须的 + size_t* workspaces = context->GetWorkspaceSizes(1); + workspaces[0] = 16 * 1024 * 1024; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) { + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class Xlogy : public OpDef { +public: + explicit Xlogy(const char* name) : OpDef(name) { + this->Input("x1") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("x2") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape); + + this->AICore().SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend310p"); + } +}; + +OP_ADD(Xlogy); +} // namespace ops diff --git a/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h b/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..b7dd6d748f5ce70b4cc916da4efa2c4b60c72c13 --- /dev/null +++ b/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h @@ -0,0 +1,13 @@ + +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(XlogyTilingData) + TILING_DATA_FIELD_DEF(uint64_t, ubMaxProcCount); + TILING_DATA_FIELD_DEF(uint64_t, totalDataCount); + TILING_DATA_FIELD_DEF(uint64_t, loopCount); + TILING_DATA_FIELD_DEF(uint64_t, tailCount); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(Xlogy, XlogyTilingData) +} // namespace optiling diff --git a/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp b/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bdee352dee4327d85afe4e2fa0e750ba235c312a --- /dev/null +++ b/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp @@ -0,0 +1,17 @@ +#include "xlogy_n_d.h" + +using namespace Xlogy; + +extern "C" __global__ __aicore__ void xlogy(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tilingData, tiling); + + if (TILING_KEY_IS(101)) { + XlogyND op; + op.Init(x1, x2, y, &tilingData); + op.Process(); + } else if (TILING_KEY_IS(201)) { + XlogyND op; + op.Init(x1, x2, y, &tilingData); + op.Process(); + } +} \ No newline at end of file diff --git a/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h b/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h new file mode 100644 index 0000000000000000000000000000000000000000..bc21281fd4876b3b2806004799cfdeefd118c2c9 --- /dev/null +++ b/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h @@ -0,0 +1,141 @@ +#ifndef XLOGY_N_D_H +#define XLOGY_N_D_H + +#include "kernel_operator.h" + +namespace Xlogy { +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BYTE_BLOCK = 32; +constexpr int32_t BYTE_REPEAT = 256; + +constexpr int32_t TMP_BUF_COUNT = 1; + +constexpr float ZERO = 0.0; +constexpr float POS_ONE = 1.0; + +template +class XlogyND { +public: + __aicore__ inline XlogyND() = default; + __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, const XlogyTilingData* __restrict tilingData); + __aicore__ inline void Process(); + +private: + template + __aicore__ inline T1 CeilDiv(T1 a, T2 b) { + T1 bTemp(b); + return bTemp == 0 ? a : (a + bTemp - 1) / bTemp; + }; + + template + __aicore__ inline T1 CeilAlignA2B(T1 a, T2 b) { + T1 bTemp(b); + return bTemp == 0 ? a : CeilDiv(a, bTemp) * bTemp; + }; + + __aicore__ inline void CopyIn(int64_t gmOffset, int64_t dataCount); + __aicore__ inline void Compute(int64_t gmOffset, int64_t dataCount); + __aicore__ inline void CopyOut(int64_t gmOffset, int64_t dataCount); + +private: + TPipe pipe; + TQue x1Que, x2Que; + TQue yQue; + TBuf tempValBuf; + GlobalTensor x1GM, x2GM, yGM; + LocalTensor tempValLT; + int64_t blockIdx = 0; + uint64_t perBlockCount = 0; + uint64_t perRepeatCount = 0; + + // tiling params + uint64_t ubMaxProcCount = 0; + uint64_t totalDataCount = 0; + uint64_t loopCount = 0; + uint64_t tailCount = 0; +}; + +template +__aicore__ inline void XlogyND::Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, + const XlogyTilingData* __restrict tilingData) { + blockIdx = GetBlockIdx(); + perBlockCount = BYTE_BLOCK / sizeof(T); + perRepeatCount = BYTE_REPEAT / sizeof(T); + ubMaxProcCount = tilingData->ubMaxProcCount; + totalDataCount = tilingData->totalDataCount; + loopCount = tilingData->loopCount; + tailCount = tilingData->tailCount; + + x1GM.SetGlobalBuffer((__gm__ T*)x1, totalDataCount); + x2GM.SetGlobalBuffer((__gm__ T*)x2, totalDataCount); + yGM.SetGlobalBuffer((__gm__ T*)y, totalDataCount); + + uint64_t singleBufferSize = ubMaxProcCount * sizeof(T); + pipe.InitBuffer(x1Que, BUFFER_NUM, singleBufferSize); + pipe.InitBuffer(x2Que, BUFFER_NUM, singleBufferSize); + pipe.InitBuffer(yQue, BUFFER_NUM, singleBufferSize); + + pipe.InitBuffer(tempValBuf, singleBufferSize * TMP_BUF_COUNT); + + tempValLT = tempValBuf.Get(); +} + +template +__aicore__ inline void XlogyND::Process() { + int64_t gmOffset = 0; + for (int64_t i = 0; i < loopCount; i++) { + CopyIn(gmOffset, ubMaxProcCount); + Compute(gmOffset, ubMaxProcCount); + CopyOut(gmOffset, ubMaxProcCount); + gmOffset += ubMaxProcCount; + } + if (tailCount) { + int64_t alignCopyCount = CeilAlignA2B(tailCount, perBlockCount); + int64_t alignComputeCount = CeilAlignA2B(tailCount, perRepeatCount); + CopyIn(gmOffset, alignCopyCount); + Compute(gmOffset, alignComputeCount); + CopyOut(gmOffset, alignCopyCount); + } +} + +template +__aicore__ inline void XlogyND::CopyIn(int64_t gmOffset, int64_t dataCount) { + LocalTensor x1InLT = x1Que.AllocTensor(); + LocalTensor x2InLT = x2Que.AllocTensor(); + DataCopy(x1InLT, x1GM[gmOffset], dataCount); + DataCopy(x2InLT, x2GM[gmOffset], dataCount); + x1Que.EnQue(x1InLT); + x2Que.EnQue(x2InLT); +} + +template +__aicore__ inline void XlogyND::Compute(int64_t gmOffset, int64_t dataCount) { + LocalTensor x1LT = x1Que.DeQue(); + LocalTensor x2LT = x2Que.DeQue(); + LocalTensor yLT = yQue.AllocTensor(); + + LocalTensor t1 = tempValLT; + LocalTensor mask = tempValLT.template ReinterpretCast(); + Duplicate(t1, T(ZERO), dataCount); + Compare(mask, x1LT, t1, CMPMODE::NE, dataCount); + Select(x2LT, mask, x2LT, T(POS_ONE), SELMODE::VSEL_TENSOR_SCALAR_MODE, dataCount); + + Ln(x2LT, x2LT, dataCount); + Mul(yLT, x1LT, x2LT, dataCount); + + yQue.EnQue(yLT); + x1Que.FreeTensor(x1LT); + x2Que.FreeTensor(x2LT); +} + +template +__aicore__ inline void XlogyND::CopyOut(int64_t gmOffset, int64_t dataCount) { + LocalTensor yOutLT = yQue.DeQue(); + DataCopy(yGM[gmOffset], yOutLT, dataCount); + yQue.FreeTensor(yOutLT); +} +} // namespace Xlogy + +#endif // XLOGY_N_D_H \ No newline at end of file