diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake index 5c5a256621cfdf4b7299e6d9d51b736ae82454d9..0a9724ac77f12c8cd70e4ca73d9fe97969a3abb7 100644 --- a/cmake/kernel_headers.cmake +++ b/cmake/kernel_headers.cmake @@ -196,3 +196,14 @@ file(CREATE_LINK ../activation/geglu_tiling_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling_intf.h SYMBOLIC) file(CREATE_LINK ../activation/geglu_tiling.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling.h SYMBOLIC) + +# cumprod +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod) +file(CREATE_LINK ../math/kernel_operator_cumprod_intf.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/kernel_operator_cumprod_intf.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod_tiling_intf.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod_tiling_intf.h SYMBOLIC) +file(CREATE_LINK ../math/cumprod_tiling.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/cumprod/cumprod_tiling.h SYMBOLIC) diff --git a/cmake/tiling_headers.cmake b/cmake/tiling_headers.cmake index 6e9ee95df767472dd48c2add1961fce831665f8c..698741da233a6055a223a54b9dbcdffe3173fff3 100644 --- a/cmake/tiling_headers.cmake +++ b/cmake/tiling_headers.cmake @@ -233,3 +233,10 @@ file(CREATE_LINK ../../lib/math/xor_tiling.h file(CREATE_LINK ../lib/tiling_api.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/tiling_api.h SYMBOLIC) + +# cumprod +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod) +file(CREATE_LINK ../../lib/math/cumprod_tiling.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod/cumprod_tiling.h SYMBOLIC) +file(CREATE_LINK ../../lib/math/cumprod_tiling_intf.h + 
${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/tiling/cumprod/cumprod_tiling_intf.h SYMBOLIC) diff --git a/impl/math/cumprod/cumprod_impl.h b/impl/math/cumprod/cumprod_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..e9861454b10727e0dea97a8c9e261e6c1d3fac68 --- /dev/null +++ b/impl/math/cumprod/cumprod_impl.h @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the + * "License"). Please refer to the License for details. You may not use this + * file except in compliance with the License. THIS SOFTWARE IS PROVIDED ON AN + * "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS + * FOR A PARTICULAR PURPOSE. See LICENSE in the root of the software repository + * for the full text of the License. + */ + +/* ! + * \file cumprod_impl.h + * \brief + */ +#ifndef IMPL_MATH_CUMPROD_CUMPROD_IMPL_H +#define IMPL_MATH_CUMPROD_CUMPROD_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" +#if __CCE_AICORE__ >= 200 + +namespace AscendC { +struct CumProdInfo { + uint32_t outter{0}; + uint32_t inner{0}; // 32-byte alignment +}; + +struct CumProdConfig { + bool isLastAxis{true}; + bool isReuseSource{false}; + bool outputLastRow{false}; +}; + +template +__aicore__ inline void CumProdLastDim(const LocalTensor &dstTensor, const LocalTensor &srcTensor, + LocalTensor tempBuffer, const CumProdInfo &cumProdInfo) { + constexpr uint32_t oneBlockElementNum = ONE_BLK_SIZE / sizeof(T); + uint16_t alignOutter = (cumProdInfo.outter + NCHW_CONV_ADDR_LIST_SIZE - 1) / + NCHW_CONV_ADDR_LIST_SIZE * NCHW_CONV_ADDR_LIST_SIZE; + uint64_t transDataTo5HDDstLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint64_t transDataTo5HDSrcLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint8_t repeatTimes = 1; + uint16_t dstRepStride = 0; +
uint16_t srcRepStride = 0; + if (cumProdInfo.outter == alignOutter && alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = 1; + srcRepStride = cumProdInfo.inner; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + alignOutter * n] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + params); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[((i * NCHW_CONV_ADDR_LIST_SIZE + + n % (cumProdInfo.outter - + i * NCHW_CONV_ADDR_LIST_SIZE)) * + cumProdInfo.inner)] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + alignOutter * n] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + params); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(alignOutter * cumProdInfo.inner); + LocalTensor floatTempBuffer = + tempBuffer[alignOutter * cumProdInfo.inner] + .template ReinterpretCast(); + Cast( + floatTempBuffer, tempBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + SetVectorMask(0, alignOutter); + const 
BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.inner; ++row) { + Mul(floatTempBuffer[row * alignOutter], + floatTempBuffer[(row - 1) * alignOutter], + floatTempBuffer[row * alignOutter], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + + SetVectorMask(alignOutter * cumProdInfo.inner); + Cast( + tempBuffer, floatTempBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); + auto tempBuffer2 = tempBuffer[alignOutter * cumProdInfo.inner]; + if (alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = cumProdInfo.inner; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t) + tempBuffer[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + (uint64_t)tempBuffer2[i * NCHW_CONV_ADDR_LIST_SIZE + + n * cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + paramsBack); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, srcRepStride, + dstRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + alignOutter * n] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n] = + 
(uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * + cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, transDataTo5HDSrcLocalList, + paramsBack); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, cumProdInfo.outter * cumProdInfo.inner); + Muls(dstTensor, tempBuffer2, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); +} + +template <> +__aicore__ inline void CumProdLastDim(const LocalTensor &dstTensor, + const LocalTensor &srcTensor, + LocalTensor tempBuffer, + const cumProdInfo &cumProdInfo) { + constexpr uint32_t oneBlockElementNum = ONE_BLK_SIZE / sizeof(float); + uint8_t repeatTimes = 1; + uint16_t dstRepStride = 0; + uint16_t srcRepStride = 0; + uint16_t alignOutter = (cumProdInfo.outter + NCHW_CONV_ADDR_LIST_SIZE - 1) / + NCHW_CONV_ADDR_LIST_SIZE * NCHW_CONV_ADDR_LIST_SIZE; + uint64_t transDataTo5HDDstLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + uint64_t transDataTo5HDSrcLocalList[NCHW_CONV_ADDR_LIST_SIZE]; + if (cumProdInfo.outter == alignOutter && alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = 2; + srcRepStride = cumProdInfo.inner * 2; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; + n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer[(i * oneBlockElementNum + n) * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer[(i * oneBlockElementNum + n) * alignOutter + + oneBlockElementNum] + .GetPhyAddr(); + } + 
TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, params); + } + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } + TransDataTo5HDParams params(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)srcTensor[((i * NCHW_CONV_ADDR_LIST_SIZE + + n % (cumProdInfo.outter - + i * NCHW_CONV_ADDR_LIST_SIZE)) * + cumProdInfo.inner)] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, params); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, alignOutter); + const BinaryRepeatParams binaryParams; + uint32_t addOffset = alignOutter; + for (uint32_t row = 1; row < cumProdInfo.inner; ++row) { + Mul(tempBuffer[addOffset], + tempBuffer[addOffset - alignOutter], + tempBuffer[addOffset], MASK_PLACEHOLDER, 1, binaryParams); + addOffset += alignOutter; + PipeBarrier(); + } + SetMaskNorm(); + ResetMask(); + + auto tempBuffer2 = tempBuffer[alignOutter * cumProdInfo.inner]; + if (alignOutter > cumProdInfo.inner) { + repeatTimes = alignOutter / NCHW_CONV_ADDR_LIST_SIZE; + if (repeatTimes > 1) { + dstRepStride = cumProdInfo.inner * 2; + srcRepStride = 2; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, dstRepStride, + srcRepStride); + for (int32_t i = 0; i < cumProdInfo.inner / oneBlockElementNum; i++) { + for (int32_t n = 0; n < 
NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDSrcLocalList[n + NCHW_CONV_ADDR_LIST_SIZE / 2] = + (uint64_t)tempBuffer[i * oneBlockElementNum * alignOutter + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2] = + (uint64_t) + tempBuffer2[i * oneBlockElementNum + n * cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer2[i * oneBlockElementNum + + (n + oneBlockElementNum) * cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, paramsBack); + } + + } else { + repeatTimes = cumProdInfo.inner / oneBlockElementNum; + if (repeatTimes > 1) { + dstRepStride = alignOutter; + srcRepStride = 1; + } else { + dstRepStride = 0; + srcRepStride = 0; + } + TransDataTo5HDParams paramsBack(false, false, repeatTimes, srcRepStride, + dstRepStride); + for (int32_t i = 0; i < alignOutter / NCHW_CONV_ADDR_LIST_SIZE; i++) { + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDSrcLocalList[n] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + n * alignOutter] + .GetPhyAddr(); + transDataTo5HDSrcLocalList[n + NCHW_CONV_ADDR_LIST_SIZE / 2] = + (uint64_t)tempBuffer[i * NCHW_CONV_ADDR_LIST_SIZE + + n * alignOutter + oneBlockElementNum] + .GetPhyAddr(); + } + for (int32_t n = 0; n < NCHW_CONV_ADDR_LIST_SIZE / 2; n++) { + transDataTo5HDDstLocalList[n * 2] = + (uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + n) * + cumProdInfo.inner] + .GetPhyAddr(); + transDataTo5HDDstLocalList[n * 2 + 1] = + (uint64_t)tempBuffer2[(i * NCHW_CONV_ADDR_LIST_SIZE + + (n + NCHW_CONV_ADDR_LIST_SIZE / 2)) * + cumProdInfo.inner] + .GetPhyAddr(); + } + TransDataTo5HD(transDataTo5HDDstLocalList, + transDataTo5HDSrcLocalList, paramsBack); + } + } + PipeBarrier(); + SetMaskCount(); + SetVectorMask(0, 
cumProdInfo.outter * cumProdInfo.inner); + Muls(dstTensor, tempBuffer2, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); +} + +template +__aicore__ inline void CumProdFirstDim(const LocalTensor &dstTensor, + const LocalTensor &srcTensor, + LocalTensor &sharedTmpBuffer, + const cumProdInfo &cumProdInfo) { + if constexpr (sizeof(T) == 2) { + const uint32_t minTmpBufferSize = + cumProdInfo.outter * cumProdInfo.inner * sizeof(float); + const uint32_t tmpBufferSize = sharedTmpBuffer.GetSize(); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpBufferSize >= minTmpBufferSize), { + KERNEL_LOG(KERNEL_ERROR, + "tmpBufferSize can't smaller than minTmpBufferSize, tmpBufferSize is %u, minTmpBufferSize is %u!", + tmpBufferSize, + minTmpBufferSize); + }); +#endif + SetMaskCount(); + SetVectorMask(cumProdInfo.outter * + cumProdInfo.inner); + LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); + Cast( + tmpBuffer, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + SetVectorMask(0, cumProdInfo.inner); + const BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.outter; ++row) { + Mul(tmpBuffer[row * cumProdInfo.inner], + tmpBuffer[(row - 1) * cumProdInfo.inner], + tmpBuffer[row * cumProdInfo.inner], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + + SetVectorMask(cumProdInfo.outter * cumProdInfo.inner); + Cast( + dstTensor, tmpBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + + } else { + SetMaskCount(); + SetVectorMask(0, cumProdInfo.inner); + Muls(dstTensor, srcTensor, 1, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + const BinaryRepeatParams binaryParams; + for (uint32_t row = 1; row < cumProdInfo.outter; ++row) { + Mul(dstTensor[row * 
cumProdInfo.inner], + dstTensor[(row - 1) * cumProdInfo.inner], + srcTensor[row * cumProdInfo.inner], MASK_PLACEHOLDER, 1, + binaryParams); + PipeBarrier(); + } + SetMaskNorm(); + ResetMask(); + } +} +} // namespace AscendC +#endif // IMPL_MATH_CUMPROD_CUMPROD_IMPL_H diff --git a/impl/math/cumprod/cumprod_tiling.cpp b/impl/math/cumprod/cumprod_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2e693de8f30e6eb8e8e7af9514f32c97f9104bc --- /dev/null +++ b/impl/math/cumprod/cumprod_tiling.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file cumprod_tiling.cpp + * \brief + */ +#include "lib/math/cumprod_tiling.h" + +#include + +#include "graph/tensor.h" +#include "impl/host_log.h" +namespace AscendC { +namespace { +constexpr uint32_t CUMPROD_HALF_TMP_SIZE = 6; +constexpr uint32_t CUMPROD_FLOAT_TMP_SIZE = 0; +constexpr uint32_t CUMPROD_ONE_REPEAT_BYTE_SIZE = 256; + +inline uint32_t GetCumProdMaxTmpSize(const uint32_t inputSize, const uint32_t typeSize) +{ + const uint32_t calcPro = (typeSize == sizeof(float)) ? CUMPROD_FLOAT_TMP_SIZE : CUMPROD_HALF_TMP_SIZE; + return calcPro * std::max(inputSize * typeSize, CUMPROD_ONE_REPEAT_BYTE_SIZE); +} + +inline uint32_t GetCumProdMinTmpSize(const uint32_t typeSize) +{ + const uint32_t calcPro = (typeSize == sizeof(float)) ? 
CUMPROD_FLOAT_TMP_SIZE : CUMPROD_HALF_TMP_SIZE; + return calcPro * CUMPROD_ONE_REPEAT_BYTE_SIZE; +} +} // namespace + +void GetCumProdMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue) +{ + (void)isReuseSource; + const uint32_t inputSize = srcShape.GetShapeSize(); + ASCENDC_HOST_ASSERT(inputSize > 0, return, "Input Shape size must be greater than 0."); + + minValue = GetCumProdMinTmpSize(typeSize); + maxValue = GetCumProdMaxTmpSize(inputSize, typeSize); +} +} // namespace AscendC diff --git a/lib/math/cumprod.h b/lib/math/cumprod.h new file mode 100644 index 0000000000000000000000000000000000000000..09d2e68a0e694eb802e93e5ecac47a41f724e361 --- /dev/null +++ b/lib/math/cumprod.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file cumprod.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_H +#define LIB_MATH_CUMPROD_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" +#include "../../impl/math/cumprod/cumprod_impl.h" +#if ASCENDC_CPU_DEBUG +#include "kernel_log.h" +#endif +#if __CCE_AICORE__ >= 200 + +namespace AscendC { +#pragma begin_pipe(V) + +constexpr CumProdConfig defaultCumProdConfig = {true, false, true}; + +/* ! + * \brief This function calculates the cumulative product based on the orientation of the last axis or first axis. 
+ * For details about the interface description, see + * https://pytorch.org/docs/stable/generated/torch.cumprod.html + * + * \note support data type: half and float + * + * \param [out] dstTensor, output LocalTensor + * \param [out] lastRowTensor, the last row of the output LocalTensor + * \param [in] srcTensor, input LocalTensor + * \param [in] sharedTmpBuffer, input local temporary Tensor + * \param [in] cumProdInfo, shape information of srcTensor + */ + +template +__aicore__ inline void CumProd(LocalTensor &dstTensor, LocalTensor &lastRowTensor, const LocalTensor &srcTensor, + LocalTensor &sharedTmpBuffer, const CumProdInfo &cumProdInfo) +{ + if ASCEND_IS_AIC { + return; + } + +#if ASCENDC_CPU_DEBUG + bool ans = cumProdInfo.inner > 0 && (cumProdInfo.inner * sizeof(T) % ONE_BLK_SIZE == 0); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "inner is %u, is not 32B aligned.", cumProdInfo.inner); }); + ans = srcTensor.GetSize() >= (cumProdInfo.inner * cumProdInfo.outter); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "srcTensor size isn't enough!."); }); + ans = dstTensor.GetSize() >= (cumProdInfo.inner * cumProdInfo.outter); + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "dstTensor size isn't enough!."); }); + if (config.outputLastRow) { + ans = lastRowTensor.GetSize() >= cumProdInfo.inner; + ASCENDC_ASSERT(ans, { KERNEL_LOG(KERNEL_ERROR, "outputLastRow size isn't enough!."); }); + } +#endif + + if constexpr (config.isLastAxis) { + uint32_t minCastTempBufferSize = 0; + if constexpr (sizeof(T) == 2) { + minCastTempBufferSize = cumProdInfo.inner * NCHW_CONV_ADDR_LIST_SIZE * sizeof(half); + } + const uint32_t minTmpBufferSize = minCastTempBufferSize + NCHW_CONV_ADDR_LIST_SIZE * cumProdInfo.inner * + sizeof(T) * 2; + const uint32_t tmpBufferSize = sharedTmpBuffer.GetSize(); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpBufferSize >= minTmpBufferSize), { + KERNEL_LOG(KERNEL_ERROR, + "tmpBufferSize can't smaller than minTmpBufferSize, tmpBufferSize is %u, 
minTmpBufferSize is %u!", + tmpBufferSize, + minTmpBufferSize); + }); +#endif + const uint32_t oneRepeateSize = tmpBufferSize / minTmpBufferSize * NCHW_CONV_ADDR_LIST_SIZE; + const uint32_t rangeM = cumProdInfo.outter / oneRepeateSize; + const uint32_t tailM = cumProdInfo.outter - oneRepeateSize * rangeM; + uint32_t dstLocalOffset = 0; + uint32_t srcLocalOffset = 0; + LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); + for (uint32_t i = 0; i < rangeM; i++) { + CumProdLastDim( + dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer, {oneRepeateSize, cumProdInfo.inner}); + dstLocalOffset += cumProdInfo.inner * oneRepeateSize; + srcLocalOffset += cumProdInfo.inner * oneRepeateSize; + } + + if (tailM != 0) { + CumProdLastDim( + dstTensor[dstLocalOffset], srcTensor[srcLocalOffset], tmpBuffer, {tailM, cumProdInfo.inner}); + } + } else { + CumProdFirstDim(dstTensor, srcTensor, sharedTmpBuffer, cumProdInfo); + } + + if constexpr (config.outputLastRow) { + SetMaskCount(); + SetVectorMask(0, cumProdInfo.inner); + Muls(lastRowTensor, + dstTensor[(cumProdInfo.outter - 1) * cumProdInfo.inner], + 1, + MASK_PLACEHOLDER, + 1, + {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); + PipeBarrier(); + SetMaskNorm(); + ResetMask(); + } +} + +/* ! + * \brief This function calculates the average based on the orientation of the last axis or fist axis. 
+ * For details about the interface description, see + * https://pytorch.org/docs/stable/generated/torch.cumprod.html + * + * \note support data type: half and float + * + * \param [out] dstTensor, output LocalTensor + * \param [out] lastRowTensor, the last row of the output LocalTensor + * \param [in] srcTensor, input LocalTensor + * \param [in] cumProdInfo, shape information of srcTensor + */ + +template +__aicore__ inline void CumProd(LocalTensor &dstTensor, LocalTensor &lastRowTensor, const LocalTensor &srcTensor, + const CumProdInfo &cumProdInfo) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor sharedTmpBuffer; + bool ans = PopStackBuffer(sharedTmpBuffer); + ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + CumProd(dstTensor, lastRowTensor, srcTensor, sharedTmpBuffer, cumProdInfo); +} + +#pragma end_pipe +} // namespace AscendC + +#endif diff --git a/lib/math/cumprod_tiling.h b/lib/math/cumprod_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..23b06272089d80eb79af37c118da570fa68c0703 --- /dev/null +++ b/lib/math/cumprod_tiling.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! 
+ * \file cumprod_tiling.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_TILING_H +#define LIB_MATH_CUMPROD_TILING_H +#include <cstdint> + +#include "graph/tensor.h" +namespace AscendC { +/* + * @ingroup GetCumProdMaxMinTmpSize + * @brief get cumprod api calculate need max and min temporary local space size + * @param [in] srcShape : src tensor shape + * @param [in] typeSize : src tensor dtype size + * @param [in] isReuseSource : whether to reuse the src Tensor + * @return max temporary local space size + * @return min temporary local space size + */ +void GetCumProdMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue); +} // namespace AscendC +#endif // LIB_MATH_CUMPROD_TILING_H diff --git a/lib/math/cumprod_tiling_intf.h b/lib/math/cumprod_tiling_intf.h new file mode 100644 index 0000000000000000000000000000000000000000..4409b4a1888b00882672f3c3a85a780e72b81cf3 --- /dev/null +++ b/lib/math/cumprod_tiling_intf.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! 
+ * \file cumprod_tiling_intf.h + * \brief + */ +#ifndef LIB_MATH_CUMPROD_TILING_INTF_H +#define LIB_MATH_CUMPROD_TILING_INTF_H + +#include "cumprod_tiling.h" +namespace AscendC { +[[deprecated(__FILE__ " is deprecated, please use cumprod_tiling.h instead!")]] +typedef void CumprodTilingDeprecatedHeader; +using LibCumprodTilingInterface = CumprodTilingDeprecatedHeader; +} // namespace AscendC +#endif // LIB_MATH_CUMPROD_TILING_INTF_H diff --git a/lib/math/kernel_operator_cumprod_intf.h b/lib/math/kernel_operator_cumprod_intf.h new file mode 100644 index 0000000000000000000000000000000000000000..db041a9f3cedc9b5d624da66f7c74184e9a482e9 --- /dev/null +++ b/lib/math/kernel_operator_cumprod_intf.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file kernel_operator_cumprod_intf.h + * \brief + */ +#ifndef ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H +#define ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H + +#include "cumprod.h" + +namespace AscendC { +[[deprecated(__FILE__ " is deprecated, please use cumprod.h instead!")]] typedef void using_deprecated_header_h; +using ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE = using_deprecated_header_h; +} // namespace AscendC +#endif // ASCENDC_MODULE_OPERATOR_CUMPROD_INTERFACE_H diff --git a/tests/math/test_operator_cumprod.cpp b/tests/math/test_operator_cumprod.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4598c100c54ae214b2955a058abcda2718b784d1 --- /dev/null +++ b/tests/math/test_operator_cumprod.cpp @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file test_operator_cumprod.cpp + * \brief + */ +#include +#include "kernel_operator.h" + +using namespace std; +using namespace AscendC; + +class TEST_CUMPROD : public testing::Test { +protected: + void SetUp() + { + AscendC::SetGCoreType(2); + } + void TearDown() + { + AscendC::SetGCoreType(0); + } +}; + +template +void main_vec_cumprod_demo(__gm__ uint8_t* __restrict__ dstGm, __gm__ uint8_t* __restrict__ lastRawGm, __gm__ uint8_t* __restrict__ srcGm, uint32_t dataSize) +{ + TPipe tpipe; + GlobalTensor input0Global; + GlobalTensor input1Global; + GlobalTensor output0Global; + GlobalTensor output1Global; + input0Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), dataSize); + output0Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dataSize); + output1Global.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(lastRawGm), dataSize); + + TBuf tbuf1; + tpipe.InitBuffer(tbuf1, dataSize * sizeof(T)); + LocalTensor input0Local = tbuf1.Get(); + + TBuf tbuf2; + tpipe.InitBuffer(tbuf2, dataSize * sizeof(T)); + LocalTensor input1Local = tbuf2.Get(); + + LocalTensor tmpLocal; + + TBuf tbuf3; + tpipe.InitBuffer(tbuf3, dataSize * sizeof(T)); + LocalTensor output0Local = tbuf3.Get(); + + TBuf tbuf4; + tpipe.InitBuffer(tbuf4, dataSize * sizeof(T)); + LocalTensor output1Local = tbuf4.Get(); + + DataCopy(input0Local, input0Global, dataSize); + + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + CumProd(output0Local, output1Local, input0Local, CumProdInfo{1, dataSize}); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + DataCopy(output0Global, output0Local, dataSize); + DataCopy(output1Global, output1Local, dataSize); + + pipe_barrier(PIPE_ALL); +} +#define CUMPROD_TESTCASE(DATA_TYPE) \ + TEST_F(TEST_CUMPROD, CUMPROD##DATA_TYPE##Case) \ + { \ + uint32_t dataSize = 256; \ + uint32_t sel_mask_size = dataSize / AscendCUtils::GetBitSize(sizeof(uint8_t)); \ + uint8_t input0Gm[dataSize * 
sizeof(DATA_TYPE)]; \ + uint8_t outputGm[dataSize * sizeof(DATA_TYPE)]; \ + uint8_t lastRawGm[dataSize * sizeof(DATA_TYPE)]; \ + \ + main_vec_cumprod_demo<DATA_TYPE>(outputGm, lastRawGm, input0Gm, dataSize); \ + \ + for (uint32_t i = 0; i < dataSize; i++) { \ + EXPECT_EQ(outputGm[i], 0x00); \ + EXPECT_EQ(lastRawGm[i], 0x00); \ + } \ + } +CUMPROD_TESTCASE(half); +CUMPROD_TESTCASE(float); diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index c72d121bf7f873671a29d3a3fa4dd0067c630297..26a39b7a62bdf52d380b28835da452e3acd2a458 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -3094,3 +3094,26 @@ TEST_F(TestTiling, tiling_compute_error) ret = bmm_tiling.Compute(); EXPECT_EQ(ret, -1); } + +TEST_F(TestTiling, TestCumProdTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto CumProdShape = ge::Shape(shapeDims); + uint32_t CumProdNeedMaxSize; + uint32_t CumProdNeedMinSize; + GetCumProdMaxMinTmpSize(CumProdShape, 2, true, CumProdNeedMaxSize, CumProdNeedMinSize); + EXPECT_EQ(CumProdNeedMaxSize, 131072); + EXPECT_EQ(CumProdNeedMinSize, 1024); + GetCumProdMaxMinTmpSize(CumProdShape, 2, true, CumProdNeedMaxSize, CumProdNeedMinSize); +} + +TEST_F(TestTiling, TestCumProdTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto CumProdShape = ge::Shape(shapeDims); + uint32_t CumProdNeedMaxSize; + uint32_t CumProdNeedMinSize; + GetCumProdMaxMinTmpSize(CumProdShape, 4, true, CumProdNeedMaxSize, CumProdNeedMinSize); + EXPECT_EQ(CumProdNeedMaxSize, 0); + EXPECT_EQ(CumProdNeedMinSize, 0); +}