diff --git a/samples/xlogy.py b/samples/xlogy.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0d9dc04d3aa33d84d6b8dc8b2f7832e41331d28
--- /dev/null
+++ b/samples/xlogy.py
@@ -0,0 +1,19 @@
+import torch
+import torch_npu
+import tdb_C
+
+from compare_data import compare_numpy_arrays
+
+def xlogy_golden(x1, x2):
+    res = torch.xlogy(x1, x2)
+    return res
+
+def tdb_xlogy_test():
+    x1 = torch.randn(2, 1024, 640).uniform_(-1, 1).float().npu()
+    x2 = torch.randn(2, 1024, 640).uniform_(-1, 1).float().npu()
+    res = tdb_C.xlogy(x1, x2)
+
+    golden = xlogy_golden(x1.cpu(), x2.cpu())
+    compare_numpy_arrays(res.cpu().numpy(), golden.numpy())
+if __name__ == "__main__":
+    tdb_xlogy_test()
\ No newline at end of file
diff --git a/tdb/common/ops/csrc/XlogyKernelNpu.cpp b/tdb/common/ops/csrc/XlogyKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..500afff80cd24b3133cc59beb93bfc205f940ee1
--- /dev/null
+++ b/tdb/common/ops/csrc/XlogyKernelNpu.cpp
@@ -0,0 +1,26 @@
+// Copyright (c) 2025 Td-Tech Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include "OpApiCommon.h"
+#include "functions.h"
+
+at::Tensor xlogy(const at::Tensor &x1, const at::Tensor &x2)
+{
+    at::Tensor out = at::empty(x1.sizes(), x1.options());
+    EXEC_NPU_CMD(aclnnXlogy, x1, x2, out);
+    return out;
+}
diff --git a/tdb/common/ops/csrc/functions.h b/tdb/common/ops/csrc/functions.h
index a7f459042f84dd9b37c88ff19877ebc514b8ffda..37437f30c29bacee7dbcfe11951c5c5cbdd25a86 100644
--- a/tdb/common/ops/csrc/functions.h
+++ b/tdb/common/ops/csrc/functions.h
@@ -33,5 +33,6 @@ at::Tensor mse_loss_grad(const at::Tensor &predict, const at::Tensor &label, con
 at::Tensor clip_by_value(const at::Tensor &x1, const at::Tensor &clip_value_min, const at::Tensor &clip_value_max);
 at::Tensor mse_loss(const at::Tensor &predict, const at::Tensor &label, int64_t reduction);
 at::Tensor swiglu(const at::Tensor &x, const at::Tensor &weight, const at::Tensor &bias, const float beta);
+at::Tensor xlogy(const at::Tensor &x1, const at::Tensor &x2);
 
 #endif // __FUNCTIONS_H__
diff --git a/tdb/common/ops/csrc/pybind.cpp b/tdb/common/ops/csrc/pybind.cpp
index bfbbc40ed041e3bf14da2dcd514ae2fb70c8b410..60969670319a869076ffe952a9b3b5491b300fa2 100644
--- a/tdb/common/ops/csrc/pybind.cpp
+++ b/tdb/common/ops/csrc/pybind.cpp
@@ -13,4 +13,5 @@ void init_common(pybind11::module &m)
     m.def("clip_by_value", &clip_by_value);
     m.def("mse_loss", &mse_loss);
     m.def("swiglu", &swiglu);
+    m.def("xlogy", &xlogy);
 }
diff --git a/tdb/common/ops/kernels/operators/op_host/xlogy.cpp b/tdb/common/ops/kernels/operators/op_host/xlogy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da42457a1dd44b7823611bb4800c94461f4cfdf8
--- /dev/null
+++ b/tdb/common/ops/kernels/operators/op_host/xlogy.cpp
@@ -0,0 +1,93 @@
+
+#include "xlogy_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace optiling {
+
+constexpr int32_t CORE_NUM = 1;
+
+constexpr int32_t X1_INDEX = 0;
+constexpr int32_t BYTE_REPEAT = 256;
+
+constexpr int32_t BUF_COUNT = (2 + 1) * 1 + 1;
+
+static ge::graphStatus TilingFunc(gert::TilingContext* context) {
+    XlogyTilingData tiling;
+    uint32_t totalCoreNum = CORE_NUM;
+    uint64_t ubSizePlatForm;
+    auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
+    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSizePlatForm);
+
+    const gert::StorageShape* xShape = context->GetInputShape(X1_INDEX);
+    const ge::DataType dataType = context->GetInputDesc(X1_INDEX)->GetDataType();
+    int32_t dTypeSize = ge::GetSizeByDataType(dataType);
+    int32_t perRepeatCount = BYTE_REPEAT / dTypeSize;
+    int64_t totalDataCount = xShape->GetStorageShape().GetShapeSize();
+
+    uint64_t ubMaxProcCount = ubSizePlatForm / BUF_COUNT / BYTE_REPEAT * perRepeatCount;
+    context->SetTilingKey(101);
+    if (dataType != ge::DT_FLOAT) {
+        context->SetTilingKey(201);
+    }
+
+    tiling.set_ubMaxProcCount(ubMaxProcCount);
+    tiling.set_totalDataCount(totalDataCount);
+    tiling.set_loopCount(totalDataCount / ubMaxProcCount);
+    tiling.set_tailCount(totalDataCount % ubMaxProcCount);
+
+    context->SetBlockDim(totalCoreNum);
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+
+    printf(
+        "==zf==Tiling Data, ubSizePlatForm:%lu, ubMaxProcCount:%lu, totalDataCount:%lu, loopCount:%lu, tailCount:%lu\n",
+        ubSizePlatForm, tiling.get_ubMaxProcCount(), tiling.get_totalDataCount(), tiling.get_loopCount(),
+        tiling.get_tailCount());
+
+    // 以下代码是ascendebug必须的
+    size_t* workspaces = context->GetWorkspaceSizes(1);
+    workspaces[0] = 16 * 1024 * 1024;
+    return ge::GRAPH_SUCCESS;
+}
+}  // namespace optiling
+
+namespace ge {
+static ge::graphStatus InferShape(gert::InferShapeContext* context) {
+    const gert::Shape* x1_shape = context->GetInputShape(0);
+    gert::Shape* y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+}  // namespace ge
+
+namespace ops {
+class Xlogy : public OpDef {
+public:
+    explicit Xlogy(const char* name) : OpDef(name) {
+        this->Input("x1")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Input("x2")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16, ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
+
+        this->SetInferShape(ge::InferShape);
+
+        this->AICore().SetTiling(optiling::TilingFunc);
+        this->AICore().AddConfig("ascend910b");
+        this->AICore().AddConfig("ascend310p");
+    }
+};
+
+OP_ADD(Xlogy);
+}  // namespace ops
diff --git a/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h b/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7dd6d748f5ce70b4cc916da4efa2c4b60c72c13
--- /dev/null
+++ b/tdb/common/ops/kernels/operators/op_host/xlogy_tiling.h
@@ -0,0 +1,13 @@
+
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(XlogyTilingData)
+    TILING_DATA_FIELD_DEF(uint64_t, ubMaxProcCount);
+    TILING_DATA_FIELD_DEF(uint64_t, totalDataCount);
+    TILING_DATA_FIELD_DEF(uint64_t, loopCount);
+    TILING_DATA_FIELD_DEF(uint64_t, tailCount);
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(Xlogy, XlogyTilingData)
+}  // namespace optiling
diff --git a/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp b/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdee352dee4327d85afe4e2fa0e750ba235c312a
--- /dev/null
+++ b/tdb/common/ops/kernels/operators/op_kernel/xlogy.cpp
@@ -0,0 +1,17 @@
+#include "xlogy_n_d.h"
+
+using namespace Xlogy;
+
+extern "C" __global__ __aicore__ void xlogy(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) {
+    GET_TILING_DATA(tilingData, tiling);
+
+    if (TILING_KEY_IS(101)) {
+        XlogyND<float> op;
+        op.Init(x1, x2, y, &tilingData);
+        op.Process();
+    } else if (TILING_KEY_IS(201)) {
+        XlogyND<half> op;
+        op.Init(x1, x2, y, &tilingData);
+        op.Process();
+    }
+}
\ No newline at end of file
diff --git a/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h b/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc21281fd4876b3b2806004799cfdeefd118c2c9
--- /dev/null
+++ b/tdb/common/ops/kernels/operators/op_kernel/xlogy_n_d.h
@@ -0,0 +1,141 @@
+#ifndef XLOGY_N_D_H
+#define XLOGY_N_D_H
+
+#include "kernel_operator.h"
+
+namespace Xlogy {
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 1;
+constexpr int32_t BYTE_BLOCK = 32;
+constexpr int32_t BYTE_REPEAT = 256;
+
+constexpr int32_t TMP_BUF_COUNT = 1;
+
+constexpr float ZERO = 0.0;
+constexpr float POS_ONE = 1.0;
+
+template <typename T>
+class XlogyND {
+public:
+    __aicore__ inline XlogyND() = default;
+    __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, const XlogyTilingData* __restrict tilingData);
+    __aicore__ inline void Process();
+
+private:
+    template <typename T1, typename T2>
+    __aicore__ inline T1 CeilDiv(T1 a, T2 b) {
+        T1 bTemp(b);
+        return bTemp == 0 ? a : (a + bTemp - 1) / bTemp;
+    };
+
+    template <typename T1, typename T2>
+    __aicore__ inline T1 CeilAlignA2B(T1 a, T2 b) {
+        T1 bTemp(b);
+        return bTemp == 0 ? a : CeilDiv(a, bTemp) * bTemp;
+    };
+
+    __aicore__ inline void CopyIn(int64_t gmOffset, int64_t dataCount);
+    __aicore__ inline void Compute(int64_t gmOffset, int64_t dataCount);
+    __aicore__ inline void CopyOut(int64_t gmOffset, int64_t dataCount);
+
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> x1Que, x2Que;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> yQue;
+    TBuf<QuePosition::VECCALC> tempValBuf;
+    GlobalTensor<T> x1GM, x2GM, yGM;
+    LocalTensor<T> tempValLT;
+    int64_t blockIdx = 0;
+    uint64_t perBlockCount = 0;
+    uint64_t perRepeatCount = 0;
+
+    // tiling params
+    uint64_t ubMaxProcCount = 0;
+    uint64_t totalDataCount = 0;
+    uint64_t loopCount = 0;
+    uint64_t tailCount = 0;
+};
+
+template <typename T>
+__aicore__ inline void XlogyND<T>::Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y,
+                                        const XlogyTilingData* __restrict tilingData) {
+    blockIdx = GetBlockIdx();
+    perBlockCount = BYTE_BLOCK / sizeof(T);
+    perRepeatCount = BYTE_REPEAT / sizeof(T);
+    ubMaxProcCount = tilingData->ubMaxProcCount;
+    totalDataCount = tilingData->totalDataCount;
+    loopCount = tilingData->loopCount;
+    tailCount = tilingData->tailCount;
+
+    x1GM.SetGlobalBuffer((__gm__ T*)x1, totalDataCount);
+    x2GM.SetGlobalBuffer((__gm__ T*)x2, totalDataCount);
+    yGM.SetGlobalBuffer((__gm__ T*)y, totalDataCount);
+
+    uint64_t singleBufferSize = ubMaxProcCount * sizeof(T);
+    pipe.InitBuffer(x1Que, BUFFER_NUM, singleBufferSize);
+    pipe.InitBuffer(x2Que, BUFFER_NUM, singleBufferSize);
+    pipe.InitBuffer(yQue, BUFFER_NUM, singleBufferSize);
+
+    pipe.InitBuffer(tempValBuf, singleBufferSize * TMP_BUF_COUNT);
+
+    tempValLT = tempValBuf.Get<T>();
+}
+
+template <typename T>
+__aicore__ inline void XlogyND<T>::Process() {
+    int64_t gmOffset = 0;
+    for (int64_t i = 0; i < loopCount; i++) {
+        CopyIn(gmOffset, ubMaxProcCount);
+        Compute(gmOffset, ubMaxProcCount);
+        CopyOut(gmOffset, ubMaxProcCount);
+        gmOffset += ubMaxProcCount;
+    }
+    if (tailCount) {
+        int64_t alignCopyCount = CeilAlignA2B(tailCount, perBlockCount);
+        int64_t alignComputeCount = CeilAlignA2B(tailCount, perRepeatCount);
+        CopyIn(gmOffset, alignCopyCount);
+        Compute(gmOffset, alignComputeCount);
+        CopyOut(gmOffset, alignCopyCount);
+    }
+}
+
+template <typename T>
+__aicore__ inline void XlogyND<T>::CopyIn(int64_t gmOffset, int64_t dataCount) {
+    LocalTensor<T> x1InLT = x1Que.AllocTensor<T>();
+    LocalTensor<T> x2InLT = x2Que.AllocTensor<T>();
+    DataCopy(x1InLT, x1GM[gmOffset], dataCount);
+    DataCopy(x2InLT, x2GM[gmOffset], dataCount);
+    x1Que.EnQue(x1InLT);
+    x2Que.EnQue(x2InLT);
+}
+
+template <typename T>
+__aicore__ inline void XlogyND<T>::Compute(int64_t gmOffset, int64_t dataCount) {
+    LocalTensor<T> x1LT = x1Que.DeQue<T>();
+    LocalTensor<T> x2LT = x2Que.DeQue<T>();
+    LocalTensor<T> yLT = yQue.AllocTensor<T>();
+
+    LocalTensor<T> t1 = tempValLT;
+    LocalTensor<uint8_t> mask = tempValLT.template ReinterpretCast<uint8_t>();
+    Duplicate(t1, T(ZERO), dataCount);
+    Compare(mask, x1LT, t1, CMPMODE::NE, dataCount);
+    Select(x2LT, mask, x2LT, T(POS_ONE), SELMODE::VSEL_TENSOR_SCALAR_MODE, dataCount);
+
+    Ln(x2LT, x2LT, dataCount);
+    Mul(yLT, x1LT, x2LT, dataCount);
+
+    yQue.EnQue(yLT);
+    x1Que.FreeTensor(x1LT);
+    x2Que.FreeTensor(x2LT);
+}
+
+template <typename T>
+__aicore__ inline void XlogyND<T>::CopyOut(int64_t gmOffset, int64_t dataCount) {
+    LocalTensor<T> yOutLT = yQue.DeQue<T>();
+    DataCopy(yGM[gmOffset], yOutLT, dataCount);
+    yQue.FreeTensor(yOutLT);
+}
+}  // namespace Xlogy
+
+#endif  // XLOGY_N_D_H
\ No newline at end of file