diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake index 5c5a256621cfdf4b7299e6d9d51b736ae82454d9..0a9724ac77f12c8cd70e4ca73d9fe97969a3abb7 100644 --- a/cmake/kernel_headers.cmake +++ b/cmake/kernel_headers.cmake @@ -196,3 +196,14 @@ file(CREATE_LINK ../activation/geglu_tiling_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling_intf.h SYMBOLIC) file(CREATE_LINK ../activation/geglu_tiling.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling.h SYMBOLIC) + +# cumprod +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod) +file(CREATE_LINK ../math/kernel_operator_cumprod_intf.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/kernel_operator_cumprod_intf.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod_tiling_intf.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod_tiling_intf.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod_tiling.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod_tiling.h SYMBOLIC) diff --git a/cmake/tiling_headers.cmake b/cmake/tiling_headers.cmake index 6e9ee95df767472dd48c2add1961fce831665f8c..698741da233a6055a223a54b9dbcdffe3173fff3 100644 --- a/cmake/tiling_headers.cmake +++ b/cmake/tiling_headers.cmake @@ -233,3 +233,10 @@ file(CREATE_LINK ../../lib/math/xor_tiling.h file(CREATE_LINK ../lib/tiling_api.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/tiling_api.h SYMBOLIC) + +# cumprod +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod) +file(CREATE_LINK ../../lib/math/cumprod_tiling.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod/cumprod_tiling.h SYMBOLIC) +file(CREATE_LINK ../../lib/math/cumprod_tiling_intf.h + 
${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod/cumprod_tiling_intf.h SYMBOLIC) diff --git a/impl/math/cumprod/cumprod_impl.h b/impl/math/cumprod/cumprod_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e9861454b10727e0dea97a8c9e261e6c1d3fac68 --- /dev/null +++ b/impl/math/cumprod/cumprod_impl.h @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the + * "License"). Please refer to the License for details. You may not use this + * file except in compliance with the License. THIS SOFTWARE IS PROVIDED ON AN + * "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS + * FOR A PARTICULAR PURPOSE. See LICENSE in the root of the software repository + * for the full text of the License. + */ + +/* ! + * \file cumprod_impl.h + * \brief + */ +#ifndef IMPL_MATH_CUMPROD_CUMPROD_IMPL_H +#define IMPL_MATH_CUMPROD_CUMPROD_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" +#if __CCE_AICORE__ >= 200 + +namespace AscendC { +struct CumProdInfo { + uint32_t outter{0}; + uint32_t inner{0}; // 32-byte alignment +}; + +struct CumProdConfig { + bool isLastAxis{true}; + bool isReuseSource{false}; + bool outputLastRow{false}; +}; + +template +__aicore__ inline void CumProdLastDim(const LocalTensor &dstTensor, const LocalTensor &srcTensor, + LocalTensor tempBuffer, const CumProdInfo &cumProdInfo) { + constexpr uint32_t oneBlockElementNum = ONE_BLK_SIZE / sizeof(T); + uint16_t alignOutter = (cumProdInfo.outter + NCHW_CONV_ADDR_LIST_SIZE - 1) / + NCHW_CONV_ADDR_LIST_SIZE * NCHW_CONV_ADDR_LIST_SIZE; + uint64_t transDataTo5HDDstLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint64_t transDataTo5HDSrcLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint8_t repeatTimes = 1; + uint16_t dstRepStride = 0; +
uint16_t srcRepStride = 0; + if (cumProdInfo.outter == alignOutter && alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = 1; + srcRepStride = cumProdInfo.inner; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + alignOutter * n] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + params); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[((i * NCHW_CONV_ADDR_LIST_SIZE + + n % (cumProdInfo.outter - + i * NCHW_CONV_ADDR_LIST_SIZE)) * + cumProdInfo.inner)] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + alignOutter * n] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + params); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(alignOutter * cumProdInfo.inner); + LocalTensor floatTempBuffer = + tempBuffer[alignOutter * cumProdInfo.inner] + .template ReinterpretCast(); + Cast( + floatTempBuffer, tempBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + SetVectorMask(0, alignOutter); + const 
BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.inner; ++row) { + Mul(floatTempBuffer[row * alignOutter], + floatTempBuffer[(row - 1) * alignOutter], + floatTempBuffer[row * alignOutter], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + + SetVectorMask(alignOutter * cumProdInfo.inner); + Cast( + tempBuffer, floatTempBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); + auto tempBuffer2 = tempBuffer[alignOutter * cumProdInfo.inner]; + if (alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = cumProdInfo.inner; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t) + tempBuffer[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer2[i * NCHW_CONV_ADDR_LIST_SIZE + + n * cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + paramsBack); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, srcRepStride, + dstRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + alignOutter * n] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + 
(uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * + cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + paramsBack); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, cumProdInfo.outter * cumProdInfo.inner); + Muls(dstTensor, tempBuffer2, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); +} + +template <> +__aicore__ inline void CumProdLastDim(const LocalTensor &dstTensor, + const LocalTensor &srcTensor, + LocalTensor tempBuffer, + const cumProdInfo &cumProdInfo) { + constexpr uint32_t oneBlockElementNum = ONE_BLK_SIZE / sizeof(float); + uint8_t repeatTimes = 1; + uint16_t dstRepStride = 0; + uint16_t srcRepStride = 0; + uint16_t alignOutter = (cumProdInfo.outter + NCHW_CONV_ADDR_LIST_SIZE - 1) / + NCHW_CONV_ADDR_LIST_SIZE * NCHW_CONV_ADDR_LIST_SIZE; + uint64_t transDataTo5HDDstLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint64_t transDataTo5HDSrcLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + if (cumProdInfo.outter == alignOutter && alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = 2; + srcRepStride = cumProdInfo.inner * 2; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; + n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer[(i * oneBlockElementNum + n) * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer[(i * oneBlockElementNum + n) * alignOutter + + oneBlockElementNum] + .GetPhyAddr(); + } + 
TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, params); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[((i * NCHW_CONV_ADDR_LIST_SIZE + + n % (cumProdInfo.outter - + i * NCHW_CONV_ADDR_LIST_SIZE)) * + cumProdInfo.inner)] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, params); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, alignOutter); + const BinaryRepeatParams binaryParams; + uint32_t addOffset = alignOutter; + for (uint32_t row = 1; row < cumProdInfo.inner; ++row) { + Mul(tempBuffer[addOffset], + tempBuffer[addOffset - alignOutter], + tempBuffer[addOffset], MASK_PLACEHOLDER, 1, binaryParams); + addOffset += alignOutter; + PipeBarrier(); + } + SetMaskNorm(); + ResetMask(); + + auto tempBuffer2 = tempBuffer[alignOutter * cumProdInfo.inner]; + if (alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = cumProdInfo.inner * 2; + srcRepStride = 2; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < 
NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDSrcLocalList[n + NCHW_CONV_ADDR_LIST_SIZE / 2] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2] = + (uint64_t) + tempBuffer2[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer2[i * oneBlockElementNum + + (n + oneBlockElementNum) * cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, paramsBack); + } + + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, srcRepStride, + dstRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDSrcLocalList[n + NCHW_CONV_ADDR_LIST_SIZE / 2] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * + cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + + (n + NCHW_CONV_ADDR_LIST_SIZE / 2)) * + cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, paramsBack); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, 
cumProdInfo.outter * cumProdInfo.inner); + Muls(dstTensor, tempBuffer2, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); +} + +template +__aicore__ inline void CumProdFirstDim(const LocalTensor &dstTensor, + const LocalTensor &srcTensor, + LocalTensor &sharedTmpBuffer, + const cumProdInfo &cumProdInfo) { + if constexpr (sizeof(T) == 2) { + const uint32_t minTmpBufferSize = + cumProdInfo.outter * cumProdInfo.inner * sizeof(float); + const uint32_t tmpBufferSize = sharedTmpBuffer.GetSize(); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpBufferSize >= minTmpBufferSize), { + KERNEL_LOG(KERNEL_ERROR, + "tmpBufferSize can't smaller than minTmpBufferSize, tmpBufferSize is %u, minTmpBufferSize is %u!", + tmpBufferSize, + minTmpBufferSize); + }); +#endif + SetMaskCount(); + SetVectorMask(cumProdInfo.outter * + cumProdInfo.inner); + LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); + Cast( + tmpBuffer, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + SetVectorMask(0, cumProdInfo.inner); + const BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.outter; ++row) { + Mul(tmpBuffer[row * cumProdInfo.inner], + tmpBuffer[(row - 1) * cumProdInfo.inner], + tmpBuffer[row * cumProdInfo.inner], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + + SetVectorMask(cumProdInfo.outter * cumProdInfo.inner); + Cast( + dstTensor, tmpBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + } else { + SetMaskCount(); + SetVectorMask(0, cumProdInfo.inner); + Muls(dstTensor, srcTensor, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + const BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.outter; ++row) { + Mul(dstTensor[row * 
cumProdInfo.inner], + dstTensor[(row - 1) * cumProdInfo.inner], + srcTensor[row * cumProdInfo.inner], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + SetMaskNorm(); + ResetMask(); + } +} +} // namespace AscendC +#endif // IMPL_MATH_CUMPROD_CUMPROD_IMPL_H diff --git a/impl/math/cumprod/cumprod_tiling.cpp b/impl/math/cumprod/cumprod_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2e693de8f30e6eb8e8e7af9514f32c97f9104bc --- /dev/null +++ b/impl/math/cumprod/cumprod_tiling.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file cumprod_tiling.cpp + * \brief + */ +#include "lib/math/cumprod_tiling.h" + +#include + +#include "graph/tensor.h" +#include "impl/host_log.h" +namespace AscendC { +namespace { +constexpr uint32_t CUMPROD_HALF_TMP_SIZE = 6; +constexpr uint32_t CUMPROD_FLOAT_TMP_SIZE = 0; +constexpr uint32_t CUMPROD_ONE_REPEAT_BYTE_SIZE = 256; + +inline uint32_t GetCumProdMaxTmpSize(const uint32_t inputSize, const uint32_t typeSize) +{ + const uint32_t calcPro = (typeSize == sizeof(float)) ? CUMPROD_FLOAT_TMP_SIZE : CUMPROD_HALF_TMP_SIZE; + return calcPro * std::max(inputSize * typeSize, CUMPROD_ONE_REPEAT_BYTE_SIZE); +} + +inline uint32_t GetCumProdMinTmpSize(const uint32_t typeSize) +{ + const uint32_t calcPro = (typeSize == sizeof(float)) ? 
CUMPROD_FLOAT_TMP_SIZE : CUMPROD_HALF_TMP_SIZE; + return calcPro * CUMPROD_ONE_REPEAT_BYTE_SIZE; +} +} // namespace + +void GetCumProdMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue) +{ + (void)isReuseSource; + const uint32_t inputSize = srcShape.GetShapeSize(); + ASCENDC_HOST_ASSERT(inputSize > 0, return, "Input Shape size must be greater than 0."); + + minValue = GetCumProdMinTmpSize(typeSize); + maxValue = GetCumProdMaxTmpSize(inputSize, typeSize); +} +} // namespace AscendC diff --git a/lib/math/cumprod.h b/lib/math/cumprod.h new file mode 100644 index 0000000000000000000000000000000000000000..09d2e68a0e694eb802e93e5ecac47a41f724e361 --- /dev/null +++ b/lib/math/cumprod.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file cumprod.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_H +#define LIB_MATH_CUMPROD_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" +#include "../../impl/math/cumprod/cumprod_impl.h" +#if ASCENDC_CPU_DEBUG +#include "kernel_log.h" +#endif +#if __CCE_AICORE__ >= 200 + +namespace AscendC { +#pragma begin_pipe(V) + +constexpr CumProdConfig defaultCumProdConfig = {true, false, true}; + +/* ! + * \brief This function calculates the cumulative product based on the orientation of the last axis or first axis. 
+ * For details about the interface description, see + * https://pytorch.org/docs/stable/generated/torch.cumprod.html + * + * \note support data type: half and float + * + * \param [out] dstTensor, output LocalTensor + * \param [out] lastRowTensor, the last row of the output LocalTensor + * \param [in] srcTensor, input LocalTensor + * \param [in] sharedTmpBuffer, input local temporary Tensor + * \param [in] cumProdInfo, shape information of srcTensor + */ + +template +__aicore__ inline void CumProd(LocalTensor &dstTensor, LocalTensor &lastRowTensor, const LocalTensor &srcTensor, + LocalTensor &sharedTmpBuffer, const CumProdInfo &cumProdInfo) +{ + if ASCEND_IS_AIC { + return; + } + +#if ASCENDC_CPU_DEBUG + bool ans = cumProdInfo.inner > 0 && (cumProdInfo.inner * sizeof(T) % ONE_BLK_SIZE == 0); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "inner is %u, is not 32B aligned.", cumProdInfo.inner); }); + ans = srcTensor.GetSize() >= (cumProdInfo.inner * cumProdInfo.outter); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "srcTensor size isn't enough!."); }); + ans = dstTensor.GetSize() >= (cumProdInfo.inner * cumProdInfo.outter); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "dstTensor size isn't enough!."); }); + if (config.outputLastRow) { + ans = lastRowTensor.GetSize() >= cumProdInfo.inner; + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "outputLastRow size isn't enough!."); }); + } +#endif + + if constexpr (config.isLastAxis) { + uint32_t minCastTempBufferSize = 0; + if constexpr (sizeof(T) == 2) { + minCastTempBufferSize = cumProdInfo.inner * NCHW_CONV_ADDR_LIST_SIZE * sizeof(half); + } + const uint32_t minTmpBufferSize = minCastTempBufferSize + NCHW_CONV_ADDR_LIST_SIZE * cumProdInfo.inner * + sizeof(T) * 2; + const uint32_t tmpBufferSize = sharedTmpBuffer.GetSize(); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpBufferSize >= minTmpBufferSize), { + KERNEL_LOG(KERNEL_ERROR, + "tmpBufferSize can't smaller than minTmpBufferSize, tmpBufferSize is %u, 
minTmpBufferSize is %u!", + tmpBufferSize, + minTmpBufferSize); + }); +#endif + const uint32_t oneRepeateSize = tmpBufferSize / minTmpBufferSize * NCHW_CONV_ADDR_LIST_SIZE; + const uint32_t rangeM = cumProdInfo.outter / oneRepeateSize; + const uint32_t tailM = cumProdInfo.outter - oneRepeateSize * rangeM; + uint32_t dstLocalOffset = 0; + uint32_t srcLocalOffset = 0; + LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); + for (uint32_t i = 0; i < rangeM; i++) { + CumProdLastDim( + dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer, {oneRepeateSize, cumProdInfo.inner}); + dstLocalOffset += cumProdInfo.inner * oneRepeateSize; + srcLocalOffset += cumProdInfo.inner * oneRepeateSize; + } + + if (tailM != 0) { + CumProdLastDim( + dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer, {tailM, cumProdInfo.inner}); + } + } else { + CumProdFirstDim(dstTensor, srcTensor, sharedTmpBuffer, cumProdInfo); + } + + if constexpr (config.outputLastRow) { + SetMaskCount(); + SetVectorMask(0, cumProdInfo.inner); + Muls(lastRowTensor, + dstTensor[(cumProdInfo.outter - 1) * cumProdInfo.inner], + 1, + MASK_PLACEHOLDER, + 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); + } +} + +/* ! + * \brief This function calculates the average based on the orientation of the last axis or fist axis. 
+ * For details about the interface description, see + * https://pytorch.org/docs/stable/generated/torch.cumprod.html + * + * \note support data type: half and float + * + * \param [out] dstTensor, output LocalTensor + * \param [out] lastRowTensor, the last row of the output LocalTensor + * \param [in] srcTensor, input LocalTensor + * \param [in] cumProdInfo, shape information of srcTensor + */ + +template +__aicore__ inline void CumProd(LocalTensor &dstTensor, LocalTensor &lastRowTensor, const LocalTensor &srcTensor, + const CumProdInfo &cumProdInfo) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor sharedTmpBuffer; + bool ans = PopStackBuffer(sharedTmpBuffer); + ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + CumProd(dstTensor, lastRowTensor, srcTensor, sharedTmpBuffer, cumProdInfo); +} + +#pragma end_pipe +} // namespace AscendC + +#endif diff --git a/lib/math/cumprod_tiling.h b/lib/math/cumprod_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..23b06272089d80eb79af37c118da570fa68c0703 --- /dev/null +++ b/lib/math/cumprod_tiling.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! 
+ * \file cumprod_tiling.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_TILING_H +#define LIB_MATH_CUMPROD_TILING_H +#include <cstdint> + +#include "graph/tensor.h" +namespace AscendC { +/* + * @ingroup GetCumProdMaxMinTmpSize + * @brief get cumprod api calculate need max and min temporary local space size + * @param [in] srcShape : src tensor shape + * @param [in] typeSize : src tensor dtype size + * @param [in] isReuseSource : whether to reuse the src Tensor + * @return max temporary local space size + * @return min temporary local space size + */ +void GetCumProdMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue); +} // namespace AscendC +#endif // LIB_MATH_CUMPROD_TILING_H diff --git a/lib/math/cumprod_tiling_intf.h b/lib/math/cumprod_tiling_intf.h new file mode 100644 index 0000000000000000000000000000000000000000..4409b4a1888b00882672f3c3a85a780e72b81cf3 --- /dev/null +++ b/lib/math/cumprod_tiling_intf.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! 
+ * \file cumprod_tiling_intf.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_TILING_INTF_H +#define LIB_MATH_CUMPROD_TILING_INTF_H + +#include "cumprod_tiling.h" +namespace AscendC { +[[deprecated(__FILE__ " is deprecated, please use cumprod_tiling.h instead!")]] +typedef void CumprodTilingDeprecatedHeader; +using LibCumprodTilingInterface = CumprodTilingDeprecatedHeader; +} // namespace AscendC +#endif // LIB_MATH_CUMPROD_TILING_INTF_H diff --git a/lib/math/kernel_operator_cumprod_intf.h b/lib/math/kernel_operator_cumprod_intf.h new file mode 100644 index 0000000000000000000000000000000000000000..db041a9f3cedc9b5d624da66f7c74184e9a482e9 --- /dev/null +++ b/lib/math/kernel_operator_cumprod_intf.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file kernel_operator_cumprod_intf.h + * \brief + */ +#ifndef ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H +#define ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H + +#include "cumprod.h" + +namespace AscendC { +[[deprecated(__FILE__ " is deprecated, please use cumprod.h instead!")]] typedef void using_deprecated_header_h; +using ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE = using_deprecated_header_h; +} // namespace AscendC +#endif // ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H diff --git a/tests/math/test_operator_cumprod.cpp b/tests/math/test_operator_cumprod.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4598c100c54ae214b2955a058abcda2718b784d1 --- /dev/null +++ b/tests/math/test_operator_cumprod.cpp @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file test_operator_cumprod.cpp + * \brief + */ +#include +#include "kernel_operator.h" + +using namespace std; +using namespace AscendC; + +class TEST_CUMPROD : public testing::Test { +protected: + void SetUp() + { + AscendC::SetGCoreType(2); + } + void TearDown() + { + AscendC::SetGCoreType(0); + } +}; + +template +void main_vec_cumprod_demo(__gm__ uint8_t* __restrict__ dstGm, __gm__ uint8_t* __restrict__ lastRawGm, __gm__ uint8_t* __restrict__ srcGm, uint32_t dataSize) +{ + TPipe tpipe; + GlobalTensor input0Global; + GlobalTensor input1Global; + GlobalTensor output0Global; + GlobalTensor output1Global; + input0Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), dataSize); + output0Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dataSize); + output1Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(lastRawGm), dataSize); + + TBuf tbuf1; + tpipe.InitBuffer(tbuf1, dataSize * sizeof(T)); + LocalTensor input0Local = tbuf1.Get(); + + TBuf tbuf2; + tpipe.InitBuffer(tbuf2, dataSize * sizeof(T)); + LocalTensor input1Local = tbuf2.Get(); + + LocalTensor tmpLocal; + + TBuf tbuf3; + tpipe.InitBuffer(tbuf3, dataSize * sizeof(T)); + LocalTensor output0Local = tbuf3.Get(); + + TBuf tbuf4; + tpipe.InitBuffer(tbuf4, dataSize * sizeof(T)); + LocalTensor output1Local = tbuf4.Get(); + + DataCopy(input0Local, input0Global, dataSize); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + CumProd(output0Local, output1Local, input0Local, CumProdInfo{1, dataSize}); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + DataCopy(output0Global, output0Local, dataSize); + DataCopy(output1Global, output1Local, dataSize); + + pipe_barrier(PIPE_ALL); +} +#define CUMPROD_TESTCASE(DATA_TYPE) \ + TEST_F(TEST_CUMPROD, CUMPROD##DATA_TYPE##Case) \ + { \ + uint32_t dataSize = 256; \ + uint32_t sel_mask_size = dataSize / AscendCUtils::GetBitSize(sizeof(uint8_t)); \ + uint8_t input0Gm[dataSize * 
sizeof(DATA_TYPE)]; \ + uint8_t outputGm[dataSize * sizeof(DATA_TYPE)]; \ + uint8_t lastRawGm[dataSize * sizeof(DATA_TYPE)]; \ + \ + main_vec_cumprod_demo<DATA_TYPE>(outputGm, lastRawGm, input0Gm, dataSize); \ + \ + for (uint32_t i = 0; i < dataSize; i++) { \ + EXPECT_EQ(outputGm[i], 0x00); \ + EXPECT_EQ(lastRawGm[i], 0x00); \ + } \ + } +CUMPROD_TESTCASE(half); +CUMPROD_TESTCASE(float); diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index c72d121bf7f873671a29d3a3fa4dd0067c630297..26a39b7a62bdf52d380b28835da452e3acd2a458 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -3094,3 +3094,26 @@ TEST_F(TestTiling, tiling_compute_error) ret = bmm_tiling.Compute(); EXPECT_EQ(ret, -1); } + +TEST_F(TestTiling, TestCumProdTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto CumProdShape = ge::Shape(shapeDims); + uint32_t CumProdNeedMaxSize; + uint32_t CumProdNeedMinSize; + GetCumProdMaxMinTmpSize(CumProdShape, 2, true, CumProdNeedMaxSize, CumProdNeedMinSize); + EXPECT_EQ(CumProdNeedMaxSize, 131072); + EXPECT_EQ(CumProdNeedMinSize, 1024); + GetCumProdMaxMinTmpSize(CumProdShape, 2, true, CumProdNeedMaxSize, CumProdNeedMinSize); +} + +TEST_F(TestTiling, TestCumProdTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto CumProdShape = ge::Shape(shapeDims); + uint32_t CumProdNeedMaxSize; + uint32_t CumProdNeedMinSize; + GetCumProdMaxMinTmpSize(CumProdShape, 4, true, CumProdNeedMaxSize, CumProdNeedMinSize); + EXPECT_EQ(CumProdNeedMaxSize, 0); + EXPECT_EQ(CumProdNeedMinSize, 0); +}