From b95a044cdbff14d882c0750a2caa87cdb981a7e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=9F=E4=BF=8A=E6=88=90?= Date: Wed, 21 May 2025 09:47:55 +0800 Subject: [PATCH 1/5] add new reduce host check --- impl/reduce/mean/mean_tiling.cpp | 23 ++- impl/reduce/reduce_all/reduce_all_v220_impl.h | 4 +- impl/reduce/reduce_any/reduce_any_v220_impl.h | 4 +- impl/reduce/reduce_sum/reduce_sum_v220_impl.h | 2 + impl/reduce/reduce_tiling.cpp | 176 ++++++++++-------- .../reduce_xor_sum/reduce_xor_sum_tiling.cpp | 11 +- impl/reduce/sum/sum_tiling.cpp | 10 +- lib/reduce/reduce_tiling.h | 1 + lib/sort/topk_tiling.h | 1 + 9 files changed, 151 insertions(+), 81 deletions(-) diff --git a/impl/reduce/mean/mean_tiling.cpp b/impl/reduce/mean/mean_tiling.cpp index c22cacec..6838fb09 100644 --- a/impl/reduce/mean/mean_tiling.cpp +++ b/impl/reduce/mean/mean_tiling.cpp @@ -10,16 +10,37 @@ #include "lib/reduce/mean_tiling.h" #include "register/tilingdata_base.h" +#include "impl/host_log.h" + namespace AscendC { constexpr uint32_t MEAN_CALC_PROC = 1; const uint32_t MEAN_ONE_BLK_SIZE = 32; const uint32_t MEAN_ONE_REPEAT_BYTE_SIZE = 256; const uint32_t HALF_TYPE_SIZE = 2; const uint32_t FLOAT_TYPE_SIZE = 4; + +inline void CheckMeanHostParams(const uint32_t n, const uint32_t srcTypeSize, + const uint32_t accTypeSize, const bool isReuseSource) +{ + ASCENDC_HOST_ASSERT( + ((srcTypeSize == 2U && accTypeSize == 2U) || + (srcTypeSize == 4U && accTypeSize == 4U) || + (srcTypeSize == 2U && accTypeSize == 4U)), + return, + "[Mean][GetMeanMaxMinTmpSize] The parameter (srcTypeSize, accTypeSize) is (%u, %u), expected is (2, 2)/(4, 4)/(2, 4).", + srcTypeSize, accTypeSize + ); + ASCENDC_HOST_ASSERT(n > 0, + return, "[Mean][GetMeanMaxMinTmpSize] The parameter n is %u, expected is greater than 0!", n); + if (isReuseSource) { + TILING_LOG_WARNING("[Mean][GetMeanMaxMinTmpSize] The parameter isReuseSource is true, which is not effective!"); + } +} + void GetMeanMaxMinTmpSize(const uint32_t n, const uint32_t
srcTypeSize, const uint32_t accTypeSize, const bool isReuseSource, uint32_t& maxSize, uint32_t& minSize) { - (void)isReuseSource; + CheckMeanHostParams(n, srcTypeSize, accTypeSize, isReuseSource); if (srcTypeSize == 0) { return; } diff --git a/impl/reduce/reduce_all/reduce_all_v220_impl.h b/impl/reduce/reduce_all/reduce_all_v220_impl.h index 223639d8..aa5386bb 100644 --- a/impl/reduce/reduce_all/reduce_all_v220_impl.h +++ b/impl/reduce/reduce_all/reduce_all_v220_impl.h @@ -64,9 +64,9 @@ __aicore__ inline void ReduceAllImpl(const LocalTensor& dstTensor, const Loca BinaryReduceByFirstAxis>( dstTensor, srcTensor, tmpTensor, first, last, padLast); } - SetMaskNorm(); - ResetMask(); } + SetMaskNorm(); + ResetMask(); } } // namespace Internal } // namespace AscendC diff --git a/impl/reduce/reduce_any/reduce_any_v220_impl.h b/impl/reduce/reduce_any/reduce_any_v220_impl.h index b8d7f59d..5c3bd252 100644 --- a/impl/reduce/reduce_any/reduce_any_v220_impl.h +++ b/impl/reduce/reduce_any/reduce_any_v220_impl.h @@ -64,9 +64,9 @@ __aicore__ inline void ReduceAnyImpl(const LocalTensor& dstTensor, const Loca BinaryReduceByFirstAxis>( dstTensor, srcTensor, tmpTensor, first, last, padLast); } - SetMaskNorm(); - ResetMask(); } + SetMaskNorm(); + ResetMask(); } } // namespace Internal } // namespace AscendC diff --git a/impl/reduce/reduce_sum/reduce_sum_v220_impl.h b/impl/reduce/reduce_sum/reduce_sum_v220_impl.h index 75890b67..f2921187 100644 --- a/impl/reduce/reduce_sum/reduce_sum_v220_impl.h +++ b/impl/reduce/reduce_sum/reduce_sum_v220_impl.h @@ -268,6 +268,8 @@ __aicore__ inline void ReduceSumImpl(const LocalTensor& dstTensor, const Loca BinaryReduceByFirstAxis>( dstTensor, srcTensor, tmpBuf, first, last, padLast); } + SetMaskNorm(); + ResetMask(); } } // namespace Internal } // namespace AscendC diff --git a/impl/reduce/reduce_tiling.cpp b/impl/reduce/reduce_tiling.cpp index b427d5b4..d6efe31f 100644 --- a/impl/reduce/reduce_tiling.cpp +++ b/impl/reduce/reduce_tiling.cpp @@ -10,6 
+10,7 @@ #include "lib/reduce/reduce_tiling.h" +#include #include #include @@ -44,27 +45,31 @@ uint32_t FindK(uint32_t n) { } inline void CheckParams(std::vector shapeDims, bool isSrcInnerPad, ReducePattern pattern, - uint32_t first, uint32_t last) + uint32_t first, uint32_t last, std::string apiName, std::string funcName) { - ASCENDC_HOST_ASSERT(shapeDims.size() == ALLOWED_SHAPE_DIM, return, "srcShape dims must be 2."); - ASCENDC_HOST_ASSERT(isSrcInnerPad, return, "isSrcInnerPad must be true on this platform."); + ASCENDC_HOST_ASSERT(shapeDims.size() == ALLOWED_SHAPE_DIM, return, + "[%s][%s] srcShape dims must be 2.", apiName.c_str(), funcName.c_str()); + ASCENDC_HOST_ASSERT(isSrcInnerPad, return, + "[%s][%s] isSrcInnerPad must be true on this platform.", apiName.c_str(), funcName.c_str()); ASCENDC_HOST_ASSERT(pattern == ReducePattern::AR || pattern == ReducePattern::RA, return, - "Currently only support AR and RA pattern."); - ASCENDC_HOST_ASSERT(first > 0 && last > 0, return, "both first and last axis must be greater than 0."); + "[%s][%s] Currently only support AR and RA pattern.", apiName.c_str(), funcName.c_str()); + ASCENDC_HOST_ASSERT(first > 0 && last > 0, return, + "[%s][%s] both first and last axis must be greater than 0.", apiName.c_str(), funcName.c_str()); } } // namespace void GetReduceCommonMaxMinTmpSize(const ge::Shape &srcShape, const ge::DataType dataType, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, - uint32_t &maxValue, uint32_t &minValue, bool isBinaryAdd) + uint32_t &maxValue, uint32_t &minValue, bool isBinaryAdd, + std::string apiName, std::string funcName) { std::vector shapeDims = srcShape.GetDims(); const uint32_t first = static_cast(shapeDims[0]); const uint32_t last = static_cast(shapeDims[1]); - CheckParams(shapeDims, isSrcInnerPad, pattern, first, last); + CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName); if (isReuseSource) { maxValue = minValue = 0U; return; @@ -96,18 +101,79 @@ void 
GetReduceCommonMaxMinTmpSize(const ge::Shape &srcShape, maxValue = minValue = k * ((last * GetTypeSize(dataType) + ONE_BLK_SIZE - 1u) / ONE_BLK_SIZE * ONE_BLK_SIZE); } +inline void GetReduceSumMeanCommonTmpSize(const ge::Shape &srcShape, + const ge::DataType dataType, + ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName) +{ + std::vector shapeDims = srcShape.GetDims(); + const uint32_t first = static_cast(shapeDims[0]); + const uint32_t last = static_cast(shapeDims[1]); + CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName); + if (isReuseSource) { + maxValue = minValue = 0U; + return; + } + uint32_t elePerBlk = ONE_BLK_SIZE / FLOAT_TYPE_SIZE; + if (pattern == ReducePattern::AR) { + uint32_t k = FindK(last); + if (k == last && first > 1U) { + k >>= 1U; + } + if (last <= B32_ELEM_NUM_PER_REPEAT) { + maxValue = minValue = 0U; + } else { + maxValue = minValue = (first * k) * FLOAT_TYPE_SIZE; + } + } else { + uint32_t k = FindK(first); + uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk; + if (first == k && first > 1U) { + k >>= 1U; + } + maxValue = minValue = (k * padLast) * FLOAT_TYPE_SIZE; + } + return; +} + +inline void GetReduceAnyAllCommonTmpSize(const ge::Shape &srcShape, + const ge::DataType dataType, + ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, + uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName) +{ + std::vector shapeDims = srcShape.GetDims(); + const uint32_t first = static_cast(shapeDims[0]); + const uint32_t last = static_cast(shapeDims[1]); + CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName); + if (pattern == ReducePattern::AR) { + uint32_t elePerBlk = static_cast(ONE_BLK_SIZE / sizeof(uint8_t)); + uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk; + minValue = maxValue = static_cast(padLast * sizeof(uint16_t)) + (first * 
elePerBlk); + } else { + if (isReuseSource) { + maxValue = minValue = 0U; + return; + } + uint32_t k = FindK(first); + if (k == first && first > 1U) { + k >>= 1U; + } + maxValue = minValue = k * ((last + ONE_BLK_SIZE - 1U) / ONE_BLK_SIZE * ONE_BLK_SIZE); + } + return; +} + void GetReduceProdMaxMinTmpSize(const ge::Shape &srcShape, const ge::DataType dataType, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue) { - ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "it only supports float type on this platform."); - + ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, + "[ReduceProd][GetReduceProdMaxMinTmpSize] it only supports float type on this platform."); std::vector shapeDims = srcShape.GetDims(); - const uint32_t first = static_cast(shapeDims[0]); const uint32_t last = static_cast(shapeDims[1]); - CheckParams(shapeDims, isSrcInnerPad, pattern, first, last); + CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, "ReduceProd", "GetReduceProdMaxMinTmpSize"); if (isReuseSource) { minValue = pattern == ReducePattern::AR ? 
ONE_REPEAT_BYTE_SIZE : 0U; maxValue = minValue; @@ -137,8 +203,9 @@ void GetReduceMaxMaxMinTmpSize(const ge::Shape &srcShape, { ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_FLOAT16, return, - "it only supports float and half type on this platform."); - GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false); + "[ReduceMax][GetReduceMaxMaxMinTmpSize] it only supports float and half type on this platform."); + GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false, + "ReduceMax", "GetReduceMaxMaxMinTmpSize"); } void GetReduceMinMaxMinTmpSize(const ge::Shape &srcShape, @@ -148,8 +215,9 @@ void GetReduceMinMaxMinTmpSize(const ge::Shape &srcShape, { ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_FLOAT16, return, - "it only supports float and half type on this platform."); - GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false); + "[ReduceMin][GetReduceMinMaxMinTmpSize] it only supports float and half type on this platform."); + GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false, + "ReduceMin", "GetReduceMinMaxMinTmpSize"); } void GetReduceAnyMaxMinTmpSize(const ge::Shape &srcShape, @@ -157,13 +225,15 @@ void GetReduceAnyMaxMinTmpSize(const ge::Shape &srcShape, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue) { + ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8, + return, + "[ReduceAny][GetReduceAnyMaxMinTmpSize] it only supports float and uint8_t type on this platform."); if (dataType == ge::DT_UINT8) { - GetReduceAllMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue); + GetReduceAnyAllCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, 
minValue, + "ReduceAny", "GetReduceAnyMaxMinTmpSize"); } else { - ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8, - return, - "it only supports float and uint8_t type on this platform."); - GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false); + GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + false, "ReduceAny", "GetReduceAnyMaxMinTmpSize"); } } @@ -173,31 +243,13 @@ void GetReduceAllMaxMinTmpSize(const ge::Shape &srcShape, uint32_t &maxValue, uint32_t &minValue) { ASCENDC_HOST_ASSERT((dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8), return, - "it only supports float uint8 type on this platform."); - std::vector shapeDims = srcShape.GetDims(); - const uint32_t first = static_cast(shapeDims[0]); - const uint32_t last = static_cast(shapeDims[1]); - CheckParams(shapeDims, isSrcInnerPad, pattern, first, last); + "[ReduceAll][GetReduceAllMaxMinTmpSize] it only supports float and uint8 type on this platform."); if (dataType == ge::DT_UINT8) { - if (pattern == ReducePattern::AR) { - uint32_t elePerBlk = static_cast(ONE_BLK_SIZE / sizeof(uint8_t)); - uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk; - minValue = maxValue = static_cast(padLast * sizeof(uint16_t)) + (first * elePerBlk); - } else { - if (isReuseSource) { - maxValue = minValue = 0U; - return; - } - uint32_t k = FindK(first); - if (k == first && first > 1U) { - k >>= 1U; - } - maxValue = minValue = k * ((last + ONE_BLK_SIZE - 1U) / ONE_BLK_SIZE * ONE_BLK_SIZE); - } - return; + GetReduceAnyAllCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + "ReduceAll", "GetReduceAllMaxMinTmpSize"); } else { - GetReduceCommonMaxMinTmpSize( - srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false); + GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, 
isReuseSource, maxValue, minValue, + false, "ReduceAll", "GetReduceAllMaxMinTmpSize"); } } @@ -206,35 +258,10 @@ void GetReduceSumMaxMinTmpSize(const ge::Shape &srcShape, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue) { - ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "it only supports float type on this platform."); - std::vector shapeDims = srcShape.GetDims(); - const uint32_t first = static_cast(shapeDims[0]); - const uint32_t last = static_cast(shapeDims[1]); - CheckParams(shapeDims, isSrcInnerPad, pattern, first, last); - if (isReuseSource) { - maxValue = minValue = 0U; - return; - } - uint32_t elePerBlk = ONE_BLK_SIZE / FLOAT_TYPE_SIZE; - if (pattern == ReducePattern::AR) { - uint32_t k = FindK(last); - if (k == last && first > 1U) { - k >>= 1U; - } - if (last <= B32_ELEM_NUM_PER_REPEAT) { - maxValue = minValue = 0U; - } else { - maxValue = minValue = (first * k) * FLOAT_TYPE_SIZE; - } - } else { - uint32_t k = FindK(first); - uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk; - if (first == k && first > 1U) { - k >>= 1U; - } - maxValue = minValue = (k * padLast) * FLOAT_TYPE_SIZE; - } - return; + ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, + "[ReduceSum][GetReduceSumMaxMinTmpSize] it only supports float type on this platform."); + GetReduceSumMeanCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + "ReduceSum", "GetReduceSumMaxMinTmpSize"); } void GetReduceMeanMaxMinTmpSize(const ge::Shape &srcShape, @@ -242,6 +269,9 @@ void GetReduceMeanMaxMinTmpSize(const ge::Shape &srcShape, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue) { - GetReduceSumMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue); + ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, + "[ReduceMean][GetReduceMeanMaxMinTmpSize] it only supports float type on this 
platform."); + GetReduceSumMeanCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + "ReduceMean", "GetReduceMeanMaxMinTmpSize"); } } // namespace AscendC diff --git a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp index d705e532..24445609 100644 --- a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp +++ b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp @@ -32,7 +32,16 @@ void GetReduceXorSumMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t type uint32_t &maxValue, uint32_t &minValue) { const uint32_t inputSize = srcShape.GetShapeSize(); - ASCENDC_HOST_ASSERT(inputSize > 0, return, "ReduceXorSum input Shape size must be greater than 0."); + + ASCENDC_HOST_ASSERT(inputSize > 0U, return, + "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape size is %u, expected is greater than 0!", + inputSize); + std::vector shapeDims = srcShape.GetDims(); + ASCENDC_HOST_ASSERT(shapeDims.size() > 0UL, return, + "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %zu, expected is greater than 0!", + shapeDims.size()); + ASCENDC_HOST_ASSERT(typeSize == 2U, return, + "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2!", typeSize); maxValue = GetTmpSize(inputSize, typeSize, isReuseSource); minValue = maxValue; diff --git a/impl/reduce/sum/sum_tiling.cpp b/impl/reduce/sum/sum_tiling.cpp index 95e087ac..817031cd 100644 --- a/impl/reduce/sum/sum_tiling.cpp +++ b/impl/reduce/sum/sum_tiling.cpp @@ -19,8 +19,14 @@ namespace AscendC { void GetSumMaxMinTmpSize( const uint32_t n, const uint32_t typeSize, const bool isReuseSource, uint32_t &maxSize, uint32_t &minSize) { - (void)isReuseSource; - ASCENDC_HOST_ASSERT(typeSize > 0, return, "typeSize must be greater than 0."); + if (isReuseSource) { + TILING_LOG_WARNING("[Sum][GetSumMaxMinTmpSize] The parameter isReuseSource is true, which is
not effective!"); + } + ASCENDC_HOST_ASSERT(typeSize > 0, return, + "[Sum][GetSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2 or 4!", typeSize); + ASCENDC_HOST_ASSERT(n > 0, + return, "[Sum][GetSumMaxMinTmpSize] The parameter n is %u, expected is greater than 0!", n); + constexpr uint32_t sumOneBlkSize = 32; constexpr uint32_t sumOneRepeatByteSize = 256; diff --git a/lib/reduce/reduce_tiling.h b/lib/reduce/reduce_tiling.h index a43f6f83..c27591e4 100644 --- a/lib/reduce/reduce_tiling.h +++ b/lib/reduce/reduce_tiling.h @@ -15,6 +15,7 @@ #ifndef LIB_REDUCE_REDUCE_TILING_H #define LIB_REDUCE_REDUCE_TILING_H #include +#include "graph/types.h" #include "graph/tensor.h" namespace AscendC { diff --git a/lib/sort/topk_tiling.h b/lib/sort/topk_tiling.h index 07e36dbb..e08d1af4 100644 --- a/lib/sort/topk_tiling.h +++ b/lib/sort/topk_tiling.h @@ -11,6 +11,7 @@ #define LIB_SORT_TOPK_TILING_H #include "topk_tilingdata.h" #include "tiling/platform/platform_ascendc.h" +#include "graph/types.h" namespace AscendC { -- Gitee From 65b4a40f6d68166c4fbe46aaa7ebbe9060a9c721 Mon Sep 17 00:00:00 2001 From: chen-yiyuan Date: Fri, 23 May 2025 11:34:45 +0800 Subject: [PATCH 2/5] update --- impl/CMakeLists.txt | 31 + impl/transdata/transdata_impl.h | 318 +++++++ impl/transdata/transdata_tiling.cpp | 140 +++ lib/kernel_api.h | 159 ++++ lib/transdata/transdata.h | 47 + lib/transdata/transdata_tiling.h | 67 ++ tests/CMakeLists.txt | 8 + tests/tiling/test_tiling.cpp | 1369 +++++++++++++++++++++++++++ 8 files changed, 2139 insertions(+) create mode 100644 impl/transdata/transdata_impl.h create mode 100644 impl/transdata/transdata_tiling.cpp create mode 100644 lib/kernel_api.h create mode 100644 lib/transdata/transdata.h create mode 100644 lib/transdata/transdata_tiling.h diff --git a/impl/CMakeLists.txt b/impl/CMakeLists.txt index c29d95d3..8de6e974 100644 --- a/impl/CMakeLists.txt +++ b/impl/CMakeLists.txt @@ -92,6 +92,37 @@ add_library(tiling_api STATIC 
${CMAKE_CURRENT_SOURCE_DIR}/math/axpy/axpy_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/ceil/ceil_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/floor/floor_tiling_impl.cpp +<<<<<<< HEAD +======= + ${CMAKE_CURRENT_SOURCE_DIR}/activation/softmax/softmax_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/activation/softmax/logsoftmax_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/rmsnorm/rmsnorm_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/batchnorm/batchnorm_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sort/sort/sort_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sort/topk/topk_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/deepnorm/deepnorm_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/select/selectwithbytesmask/selectwithbytesmask_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernorm/layernorm_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/normalize/normalize_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernormgrad/layernorm_grad_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernormgrad/layernorm_grad_beta_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/groupnorm/groupnorm_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/normalization/welfordfinalize/welfordfinalize_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad/pad_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/transpose/confusion_transpose/confusion_transpose_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/pad/broadcast/broadcast_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/pad/broadcast/broadcast_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math/xor/xor_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math/cumsum/cumsum_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce/mean/mean_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math/sign/sign_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/activation/reglu/reglu_tiling_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/reduce/sum/sum_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/transdata/transdata_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/index/arithprogression/arithprogression_tiling_impl.cpp +>>>>>>> 4a0a42bb (update) ${CMAKE_CURRENT_SOURCE_DIR}/math/fmod/fmod_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/trunc/trunc_tiling_impl.cpp $<$:$> diff --git a/impl/transdata/transdata_impl.h b/impl/transdata/transdata_impl.h new file mode 100644 index 00000000..37966c8a --- /dev/null +++ b/impl/transdata/transdata_impl.h @@ -0,0 +1,318 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef IMPL_TRANSDATA_TRANSDATA_IMPL_H +#define IMPL_TRANSDATA_TRANSDATA_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" +#include "kernel_tiling/kernel_tiling.h" + +namespace AscendC { + +template +struct TransDataParams { + T srcLayout; + U dstLayout; +}; + +#ifndef ASCC_PARAM_TRANSDATACONFIG +#define ASCC_PARAM_TRANSDATACONFIG +struct TransDataConfig { + DataFormat srcFormat; + DataFormat dstFormat; +}; +#endif // ASCC_PARAM_TRANSDATACONFIG + +namespace Internal { +struct TransDataTmpParams { + int32_t n; + int32_t c; + int32_t d; + int32_t h; + int32_t w; +}; + +constexpr int32_t DEFAULT_TRANSDATA_5HD_LIST = 16; + +// Transdata NCDHW -> FRACTAL_Z_3D +template +__aicore__ inline void TransDataImplMode1(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, + const TransDataTmpParams& param) +{ + constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T); + const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w; + constexpr int32_t c0 = 16; + constexpr int32_t n0 = 16; + const int32_t c1 = DivCeil(c, c0); + const int32_t n1 = DivCeil(n, n0); + int32_t padHw = AlignUp(h * w, elePerBlk); + int32_t currAxis = c * d * padHw; + Duplicate(tmpBuffer.ReinterpretCast(), static_cast(0), currAxis); + PipeBarrier(); + auto tmpDstTensor = tmpBuffer[currAxis * sizeof(T)].ReinterpretCast(); + uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST]; + + uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr(); + uint64_t srcTensorAddr = (uint64_t)src.GetPhyAddr(); + uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr(); + uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr(); + // step1, NCDHW -> CDHW, N1, N0 + // Do n1 times Transpose to split axis N, and fill with 0 on padding data. 
+ TransDataTo5HDParams transDataParams; + transDataParams.dstHighHalf = false; + transDataParams.srcHighHalf = false; + transDataParams.repeatTimes = currAxis / elePerBlk; + // if repeat = 1, start offset is auto incremental by stride. + transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : n1 * n0; + transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1; + + bool isPadded = padHw != h * w; + // dst tensor is unable to fill all padded data. + auto tmpIfPadAddr = isPadded ? tmpDstTensorAddr : dstTensorAddr; + for (int j = 0; j < n1; j++) { + uint64_t currDstAddr = tmpIfPadAddr + j * n0 * sizeof(T); + uint64_t currSrcAddr = srcTensorAddr + j * currAxis * n0 * sizeof(T); + // handle the last axis if N is not evenly split by n0. + int remain = j == n1 - 1 ? n - j * n0 : n0; + for (int32_t i = 0; i < n0; i++) { + dstLocalList[i] = currDstAddr + (i * n1 * n0) * sizeof(T); + } + for (int32_t i = 0; i < remain; i++) { + srcLocalList[i] = currSrcAddr + i * currAxis * sizeof(T); + } + for (int32_t i = remain; i < n0; i++) { + srcLocalList[i] = tmpBufferAddr; + } + TransDataTo5HD(dstLocalList, srcLocalList, transDataParams); + } + PipeBarrier(); + // step1.5 collapse padded H,W axis for CDHW, N1N0 + DataCopyParams copyParams; + if (isPadded) { + currAxis = h * w * n1 * n0; + copyParams.blockCount = c * d; + copyParams.blockLen = currAxis / elePerBlk; + // Merge axis by skipping padded H,W. + copyParams.srcStride = (padHw - h * w) * n1 * n0 / elePerBlk; + copyParams.dstStride = 0; + DataCopy(dst, tmpDstTensor, copyParams); + } + PipeBarrier(); + + // step2, CDHWN1N0 -> C1DHW, N1N0, C0 + currAxis = d * h * w * n1 * n0; + transDataParams.repeatTimes = currAxis / elePerBlk; + transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : c0; + transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ?
0 : 1; + for (int32_t j = 0; j < c1; j++) { + uint64_t currDstAddr = tmpDstTensorAddr + j * currAxis * c0 * sizeof(T); + uint64_t currSrcAddr = dstTensorAddr + j * currAxis * c0 * sizeof(T); + int remain = j == c1 - 1 ? c - j * c0 : c0; + for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) { + dstLocalList[i] = currDstAddr + i * c0 * sizeof(T); + } + for (int32_t i = 0; i < remain; i++) { + srcLocalList[i] = currSrcAddr + i * currAxis * sizeof(T); + } + for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) { + srcLocalList[i] = tmpBufferAddr; + } + TransDataTo5HD(dstLocalList, srcLocalList, transDataParams); + } + PipeBarrier(); + // step3 C1DHW, N1N0, C0 -> DC1HW, N1N0, C0 + currAxis = c0 * h * w * n1 * n0; + copyParams.blockCount = d; + copyParams.blockLen = currAxis / elePerBlk; + // Merge axis by skipping padding padHW -> h, w + copyParams.srcStride = 0; + copyParams.dstStride = (c1 - 1) * currAxis / elePerBlk; + for (int32_t i = 0; i < c1; i++) { + DataCopy(dst[i * currAxis], tmpDstTensor[i * d * currAxis], copyParams); + } + PipeBarrier(); +} + +// Transdata NCDHW -> NDC1HWC0 +template +__aicore__ inline void TransDataImplMode2(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, + const TransDataTmpParams& param) +{ + constexpr int32_t c0 = 16; + constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T); + const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w; + const int32_t c1 = DivCeil(c, c0); + const int32_t padHw = AlignUp(h * w, elePerBlk); + int32_t currAxis = d * padHw; + + int32_t axisHwd = h * w * d; + int32_t axisHwc0 = h * w * c0; + int32_t axisC1hwc0 = axisHwc0 * c1; + int32_t axisC1hwdc0 = axisC1hwc0 * d; + int32_t axisPadHwd = padHw * d; + int32_t axisPadHwc0 = padHw * c0; + int32_t axisPadHwdc0 = padHw * c0 * d; + Duplicate(tmpBuffer.ReinterpretCast(), static_cast(0), axisPadHwd); + PipeBarrier(); + + // reserve for padded 0 on additional axis c.
+ auto tmpDstTensor = tmpBuffer[axisPadHwd * sizeof(T)].ReinterpretCast(); + + uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr(); + uint64_t srcTensorAddr = (uint64_t)src.GetPhyAddr(); + uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr(); + uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr(); + uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST]; + TransDataTo5HDParams transDataParams; + transDataParams.dstHighHalf = false; + transDataParams.srcHighHalf = false; + transDataParams.repeatTimes = axisPadHwd / elePerBlk; + transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : c0; + transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1; + + DataCopyParams copyParams; + copyParams.blockCount = d; + copyParams.blockLen = axisHwc0 / elePerBlk; + copyParams.srcStride = (padHw - h * w) * c0 / elePerBlk; + copyParams.dstStride = (c1 - 1) * axisHwc0 / elePerBlk; + // iterates N times CDHW -> C1DHWC0 + for (int32_t k = 0; k < n; k++) { + int32_t currSrcStart = k * axisPadHwd * c; + int32_t currDstStart = k * axisC1hwdc0; + // it's impossible to have calculation size exceed max 255 repeats due to the total memory size. + // step1, CDHW -> C1DHWC0 with pad data + for (int32_t j = 0; j < c1; j++) { + uint64_t currDstAddr = tmpDstTensorAddr + j * axisPadHwdc0 * sizeof(T); + uint64_t currSrcAddr = srcTensorAddr + (currSrcStart + j * axisPadHwdc0) * sizeof(T); + int remain = j == c1 - 1 ? 
c - j * c0 : c0;
+            for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                dstLocalList[i] = currDstAddr + i * c0 * sizeof(T);
+            }
+            for (int32_t i = 0; i < remain; i++) {
+                srcLocalList[i] = currSrcAddr + i * axisPadHwd * sizeof(T);
+            }
+            for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                srcLocalList[i] = tmpBufferAddr;
+            }
+            TransDataTo5HD(dstLocalList, srcLocalList, transDataParams);
+        }
+        PipeBarrier();
+        // step2, C1DHWC0 -> DC1HWC0
+        for (int32_t i = 0; i < c1; i++) {
+            DataCopy(dst[currDstStart + i * axisHwc0], tmpDstTensor[i * axisPadHwdc0], copyParams);
+        }
+        PipeBarrier();
+    }
+}
+
+// TransData NDC1HWC0 -> NCDHW: per batch, step1 copies DC1HWC0 into tmpBuffer as C1DHWC0, step2 transposes it into dst.
+template
+__aicore__ inline void TransDataImplMode3(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer,
+    const TransDataTmpParams& param)
+{
+    const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w;
+    constexpr int32_t c0 = 16;
+    constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T);
+    const int32_t c1 = DivCeil(c, c0);
+    const int32_t padHw = AlignUp(h * w, elePerBlk);
+    constexpr int32_t reservedDummy = 512; // leading part of tmpBuffer that receives the unused destination rows in step2
+    auto tmpDstTensor = tmpBuffer[reservedDummy].template ReinterpretCast();
+    uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+
+    uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr();
+    uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr();
+    uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr();
+
+    int32_t axisHwd = h * w * d;
+    int32_t axisHwc0 = h * w * c0;
+    int32_t axisC1hwc0 = axisHwc0 * c1;
+    int32_t axisC1hwdc0 = axisC1hwc0 * d;
+    int32_t axisPadHwd = padHw * d;
+    int32_t axisPadHwc0 = padHw * c0;
+    int32_t axisPadHwdc0 = padHw * c0 * d;
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = padHw * d / elePerBlk;
+    transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : c0;
+    transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : 1;
+
+    DataCopyParams copyParams;
+    copyParams.blockCount = c1;
+    copyParams.blockLen = h * w * c0 / elePerBlk;
+    copyParams.srcStride = 0;
+    copyParams.dstStride = (d * padHw - h * w) * c0 / elePerBlk;
+    // iterates N times DC1HWC0 -> CDHW
+    for (int32_t k = 0; k < n; k++) {
+        // step1 DC1HWC0 -> C1DHWC0
+        int32_t currSrcStart = k * axisC1hwdc0;
+        int32_t currDstStart = k * axisPadHwd * c;
+        for (int32_t i = 0; i < d; i++) {
+            DataCopy(tmpDstTensor[i * axisPadHwc0], src[currSrcStart + i * axisC1hwc0], copyParams);
+        }
+        PipeBarrier();
+        // step2, C1DHWC0 -> C1C0DHW
+        // it's impossible to have calculation size exceed max 255 repeats due to the total memory size.
+        for (int32_t j = 0; j < c1; j++) {
+            int32_t remain = j == c1 - 1 ? c - j * c0 : c0; // valid channels in this c0 group; the last group may be partial
+            uint64_t currDstAddr = dstTensorAddr + (currDstStart + j * axisPadHwdc0) * sizeof(T);
+            uint64_t currSrcAddr = tmpDstTensorAddr + j * axisPadHwdc0 * sizeof(T);
+            for (int32_t i = 0; i < remain; i++) {
+                dstLocalList[i] = currDstAddr + i * axisPadHwd * sizeof(T);
+            }
+            for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                // temp for reserve redundant data.
+                dstLocalList[i] = tmpBufferAddr + i * ONE_BLK_SIZE;
+            }
+            for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                srcLocalList[i] = currSrcAddr + i * c0 * sizeof(T);
+            }
+            TransDataTo5HD(dstLocalList, srcLocalList, transDataParams);
+        }
+        PipeBarrier();
+    }
+}
+// Entry point: takes the NCDHW extents from the NCDHW-side layout, then dispatches on (srcFormat, dstFormat) at compile time.
+template
+__aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const LocalTensor& srcTensor,
+    const LocalTensor& sharedTmpBuffer, const TransDataParams& params)
+{
+    static_assert(SupportType(), "Currently only supports half/bfloat16_t types.");
+    static_assert(is_layout_v, "srcLayout must be a layout");
+    static_assert(is_layout_v, "dstLayout must be a layout");
+    using SrcShapeTuple = Std::remove_cvref_t;
+    using DstShapeTuple = Std::remove_cvref_t;
+    static_assert(Std::is_tuple_v, "it must be a shape.");
+    static_assert(Std::is_tuple_v, "it must be a shape.");
+
+    auto ncdhwShape = config.srcFormat == DataFormat::NCDHW ? params.srcLayout.GetShape() : params.dstLayout.GetShape();
+    TransDataTmpParams tmpParams = {
+        Std::get<0>(ncdhwShape),
+        Std::get<1>(ncdhwShape),
+        Std::get<2>(ncdhwShape),
+        Std::get<3>(ncdhwShape),
+        Std::get<4>(ncdhwShape)
+    };
+
+    if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) {
+        TransDataImplMode2(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) {
+        TransDataImplMode1(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } else if constexpr (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) {
+        TransDataImplMode3(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } // NOTE(review): any other format pair falls through without doing anything; consider a static_assert
+}
+
+} // namespace Internal
+} // namespace AscendC
+#endif // IMPL_TRANSDATA_TRANSDATA_IMPL_H
\ No newline at end of file
diff --git a/impl/transdata/transdata_tiling.cpp b/impl/transdata/transdata_tiling.cpp
new file mode 100644
index 00000000..dfb216cc
--- /dev/null
+++
b/impl/transdata/transdata_tiling.cpp
@@ -0,0 +1,182 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "lib/transdata/transdata_tiling.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "graph/tensor.h"
+#include "impl/host_log.h"
+#include "tiling/platform/platform_ascendc.h"
+namespace AscendC {
+namespace {
+constexpr int32_t PAD_ELE_FOR_HALF = 16;
+constexpr int32_t N_INDEX = 0;
+constexpr int32_t C_INDEX = 1;
+constexpr int32_t D_INDEX = 2;
+constexpr int32_t H_INDEX = 3;
+constexpr int32_t W_INDEX = 4;
+
+// Logical NCDHW extents of the tensor being converted.
+struct TmpTransDataParams {
+    int32_t n = 0;
+    int32_t c = 0;
+    int32_t d = 0;
+    int32_t h = 0;
+    int32_t w = 0;
+};
+
+// Ceiling division; a zero divisor returns a unchanged instead of dividing.
+int64_t DivCeil(int64_t a, int64_t b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+// Rounds a up to a multiple of b.
+int64_t AlignUp(int64_t a, int64_t b)
+{
+    return DivCeil(a, b) * b;
+}
+
+// Copies the five NCDHW extents into param; the caller has already checked ncdhwDims.size() == 5.
+// Returns false when an extent is negative or does not fit in int32_t instead of silently truncating it.
+bool FillNcdhwParams(const std::vector<int64_t> &ncdhwDims, TmpTransDataParams &param)
+{
+    for (const int64_t dim : ncdhwDims) {
+        if (dim < 0 || dim > INT32_MAX) {
+            return false;
+        }
+    }
+    param.n = static_cast<int32_t>(ncdhwDims[N_INDEX]);
+    param.c = static_cast<int32_t>(ncdhwDims[C_INDEX]);
+    param.d = static_cast<int32_t>(ncdhwDims[D_INDEX]);
+    param.h = static_cast<int32_t>(ncdhwDims[H_INDEX]);
+    param.w = static_cast<int32_t>(ncdhwDims[W_INDEX]);
+    return true;
+}
+
+// Multiplies factors into product. Returns false as soon as a factor (negative ones included) or the
+// running product exceeds UINT32_MAX, so the 64-bit accumulator can never wrap.
+bool CheckedProduct(const std::vector<int64_t> &factors, uint64_t &product)
+{
+    constexpr uint64_t limit = UINT32_MAX;
+    product = 1;
+    for (const int64_t factor : factors) {
+        const uint64_t value = static_cast<uint64_t>(factor);
+        if (value > limit) {
+            return false;
+        }
+        product *= value;
+        if (product > limit) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Validates srcShape/dstShape against the configured formats and stores the logical NCDHW extents in param.
+// Returns false for an unsupported format pair or when the two shapes are inconsistent.
+bool GenerateShapeInfo(const TransDataConfig &config, const ge::Shape &srcShape, const ge::Shape &dstShape, ge::DataType type,
+    TmpTransDataParams &param)
+{
+    (void)type;
+    constexpr int64_t c0 = 16, n0 = 16;
+    const std::vector<int64_t> srcDims = srcShape.GetDims();
+    const std::vector<int64_t> dstDims = dstShape.GetDims();
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) {
+        ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 6, return false, "input shapes are not matched with DataFormat.");
+        ASCENDC_HOST_ASSERT(FillNcdhwParams(srcDims, param), return false, "shape dims are negative or exceed the int32 range.");
+        // validate n, d, h, w
+        ASCENDC_HOST_ASSERT(param.n == dstDims[0] && param.d == dstDims[1] && param.h == dstDims[3] && param.w == dstDims[4], return false, "shapeInfo n,d,h,w is not matched.");
+        // c1 is compared directly (here and below) so that dstDims[2] * c0 cannot overflow
+        ASCENDC_HOST_ASSERT(dstDims[5] == c0 && dstDims[2] == DivCeil(param.c, c0), return false, "dst c0, c1 is not able to be converted to c.");
+        return true;
+    }
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) {
+        ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 7, return false, "input shapes are not matched with DataFormat.");
+        ASCENDC_HOST_ASSERT(FillNcdhwParams(srcDims, param), return false, "shape dims are negative or exceed the int32 range.");
+        // validate d, h, w; n is validated through n1/n0 below
+        ASCENDC_HOST_ASSERT(param.d == dstDims[0] && param.h == dstDims[2] && param.w == dstDims[3], return false, "shapeInfo d,h,w is not matched.");
+        ASCENDC_HOST_ASSERT(dstDims[6] == c0 && dstDims[1] == DivCeil(param.c, c0), return false, "dst c0, c1 is not able to be converted to c.");
+        ASCENDC_HOST_ASSERT(dstDims[5] == n0 && dstDims[4] == DivCeil(param.n, n0), return false, "dst n0, n1 is not able to be converted to n.");
+        return true;
+    }
+    if (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) {
+        ASCENDC_HOST_ASSERT(srcDims.size() == 6 && dstDims.size() == 5, return false, "input shapes are not matched with DataFormat.");
+        ASCENDC_HOST_ASSERT(FillNcdhwParams(dstDims, param), return false, "shape dims are negative or exceed the int32 range.");
+        // validate n, d, h, w
+        ASCENDC_HOST_ASSERT(param.n == srcDims[0] && param.d == srcDims[1] && param.h == srcDims[3] && param.w == srcDims[4], return false, "shapeInfo n,d,h,w is not matched.");
+        ASCENDC_HOST_ASSERT(srcDims[5] == c0 && srcDims[2] == DivCeil(param.c, c0), return false, "src c0, c1 is not able to be converted to c.");
+        return true;
+    }
+    return false;
+}
+
+// Computes the temporary buffer size in bytes for the configured conversion and stores it in tmpSize.
+// Returns false when the size does not fit in uint32_t; a format pair not handled below yields tmpSize = 0.
+bool GetTmpBufferSize(const TransDataConfig &config, const TmpTransDataParams &param, uint32_t &tmpSize)
+{
+    constexpr int64_t dataSize = 2;
+    constexpr int64_t c0 = 16, n0 = 16;
+    const int64_t d = param.d;
+    const int64_t alignedC = AlignUp(param.c, c0); // c1 * c0
+    const int64_t alignedN = AlignUp(param.n, n0); // n1 * n0
+    const int64_t padHw = AlignUp(static_cast<int64_t>(param.h) * param.w, PAD_ELE_FOR_HALF);
+    uint64_t first = 0;
+    uint64_t second = 0;
+    bool fits = true;
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) {
+        fits = CheckedProduct({d, padHw, dataSize}, first) &&
+            CheckedProduct({d, alignedC, padHw, dataSize}, second);
+    } else if (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) {
+        constexpr uint64_t redundantDataBuffer = 512;
+        first = redundantDataBuffer;
+        fits = CheckedProduct({d, alignedC, padHw, dataSize}, second);
+    } else if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) {
+        fits = CheckedProduct({param.c, d, padHw, dataSize}, first) &&
+            CheckedProduct({alignedN, d, alignedC, padHw, dataSize}, second);
+    }
+    const uint64_t total = first + second;
+    if (!fits || total > UINT32_MAX) {
+        return false;
+    }
+    tmpSize = static_cast<uint32_t>(total);
+    return true;
+}
+
+} // namespace
+
+// Contract is documented in lib/transdata/transdata_tiling.h; maxValue/minValue are written only on success.
+bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform,
+    const ge::Shape &srcShape,
+    const ge::Shape &dstShape,
+    const ge::DataType dataType,
+    const TransDataConfig &config,
+    uint32_t &maxValue, uint32_t &minValue)
+{
+    ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16, return false, "it only supports DT_FLOAT16/DT_BF16 data type");
+    platform_ascendc::SocVersion socVersion = platform.GetSocVersion();
+    ASCENDC_HOST_ASSERT(socVersion == platform_ascendc::SocVersion::ASCEND910B, return false,
+        "Unsupported SocVersion for TransData API.");
+
+    TmpTransDataParams tmpParam;
+
+    ASCENDC_HOST_ASSERT(GenerateShapeInfo(config, srcShape, dstShape, dataType, tmpParam), return false, "failed to validate inputs informations.");
+    uint32_t tmpSize = 0;
+    ASCENDC_HOST_ASSERT(GetTmpBufferSize(config, tmpParam, tmpSize), return false, "required temporary buffer size exceeds the uint32_t range.");
+    maxValue = tmpSize;
+    minValue = tmpSize;
+    return true;
+}
+} // namespace AscendC
diff --git a/lib/kernel_api.h b/lib/kernel_api.h
new file mode 100644
index 00000000..b6f7a069
--- /dev/null
+++ b/lib/kernel_api.h
@@ -0,0 +1,159 @@
+/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file kernel_api.h + * \brief + */ +#ifndef LIB_KERNEL_API_H +#define LIB_KERNEL_API_H + +#if defined(__DAV_C310__) || defined(__DAV_310R6__) +#include "hccl/hccl.h" +#include "index/arithprogression.h" +#include "activation/sigmoid.h" +#include "activation/softmax.h" +#include "activation/simplesoftmax.h" +#include "activation/softmaxflashv2.h" +#include "activation/softmaxgrad.h" +#include "activation/gelu.h" +#include "filter/dropout.h" +#include "math/tan.h" +#include "math/tanh.h" +#include "math/floor.h" +#include "math/lgamma.h" +#include "math/log.h" +#include "math/sin.h" +#include "math/atanh.h" +#include "math/asinh.h" +#include "math/acosh.h" +#include "math/trunc.h" +#include "math/cos.h" +#include "math/fmod.h" +#include "math/hypot.h" +#include "math/power.h" +#include "math/frac.h" +#include "math/cumsum.h" +#include "math/erf.h" +#include "math/erfc.h" +#include "math/atan.h" +#include "math/is_finite.h" +#include "math/philox.h" +#include "math/sinh.h" +#include "math/cosh.h" +#include "math/sign.h" +#include "math/asin.h" +#include "math/acos.h" +#include "math/exp.h" +#include "math/xor.h" +#include "normalization/layernorm.h" +#include "normalization/welfordfinalize.h" +#include "normalization/normalize.h" +#include "pad/broadcast.h" +#include "quantization/ascend_quant.h" +#include "quantization/ascend_dequant.h" +#include 
"quantization/ascend_antiquant.h" +#include "quantization/quantize.h" +#include "quantization/dequantize.h" +#include "quantization/antiquantize.h" +#include "utils/init_global_memory.h" +#include "sort/sort.h" +#include "sort/topk.h" +#include "transpose/confusion_transpose.h" +#include "select/selectwithbytesmask.h" +#include "reduce/reduce.h" +#include "math/clamp.h" +#include "math/round.h" +#include "math/ceil.h" +#endif // __CCE_AICORE__ == 310 + +#if defined(__CCE_AICORE__) && (__CCE_AICORE__ != 310) +#include "filter/dropout.h" +#include "activation/sigmoid.h" +#include "activation/softmax.h" +#include "activation/simplesoftmax.h" +#include "activation/softmaxflashv2.h" +#include "activation/softmaxflashv3.h" +#include "activation/softmaxgrad.h" +#include "math/xor.h" +#include "math/floor.h" +#include "sort/sort.h" +#endif + +#include "std/tuple.h" +#include "std/type_traits.h" +#include "std/utility.h" +#include "std/algorithm.h" + +#if defined(__CCE_AICORE__) && (__CCE_AICORE__ < 300) +#include "index/arithprogression.h" +#include "normalization/layernormgrad.h" +#include "normalization/layernormgradbeta.h" +#include "pad/pad.h" +#include "hccl/hccl.h" +#include "math/frac.h" +#include "math/power.h" +#include "math/log.h" +#include "math/sin.h" +#include "math/cos.h" +#include "math/asin.h" +#include "math/acos.h" +#include "math/asinh.h" +#include "math/acosh.h" +#include "math/atan.h" +#include "math/cosh.h" +#include "math/erf.h" +#include "math/erfc.h" +#include "math/clamp.h" +#include "normalization/rmsnorm.h" +#include "normalization/batchnorm.h" +#include "math/tanh.h" +#include "math/atanh.h" +#include "normalization/deepnorm.h" +#include "math/exp.h" +#include "normalization/layernorm.h" +#include "normalization/welfordfinalize.h" +#include "normalization/normalize.h" +#include "reduce/sum.h" +#include "activation/silu.h" +#include "activation/gelu.h" +#include "quantization/ascend_quant.h" +#include "quantization/ascend_dequant.h" +#include 
"quantization/ascend_antiquant.h" +#include "activation/logsoftmax.h" +#include "activation/softmaxflash.h" +#include "transpose/confusion_transpose.h" +#include "select/selectwithbytesmask.h" +#include "math/sinh.h" +#include "activation/swiglu.h" +#include "activation/reglu.h" +#include "math/tan.h" +#include "math/round.h" +#include "math/trunc.h" +#include "activation/swish.h" +#include "sort/topk.h" +#include "activation/geglu.h" +#include "math/lgamma.h" +#include "math/digamma.h" +#include "math/sign.h" +#include "reduce/mean.h" +#include "math/axpy.h" +#include "math/ceil.h" +#include "pad/broadcast.h" +#include "reduce/reduce_xor_sum.h" +#include "reduce/reduce.h" +#include "transdata/transdata.h" +#include "math/cumsum.h" +#include "math/fmod.h" +#include "normalization/groupnorm.h" +#include "utils/init_global_memory.h" +#endif // __CCE_AICORE__ < 300 + +#endif // LIB_KERNEL_API_H diff --git a/lib/transdata/transdata.h b/lib/transdata/transdata.h new file mode 100644 index 00000000..795c9a03 --- /dev/null +++ b/lib/transdata/transdata.h @@ -0,0 +1,47 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#ifndef LIB_TRANSDATA_TRANSDATA_H
+#define LIB_TRANSDATA_TRANSDATA_H
+#if __CCE_AICORE__ == 220
+#include "kernel_tensor.h"
+#include "kernel_operator_intf.h"
+#include "kernel_pop_stack_buffer.h"
+#include "../../impl/transdata/transdata_impl.h"
+#if ASCENDC_CPU_DEBUG
+#include "kernel_log.h"
+#include
+#endif
+
+namespace AscendC {
+// TransData with a caller-provided temporary buffer: forwards all arguments to Internal::TransDataImpl.
+template
+__aicore__ inline void TransData(const LocalTensor& dstTensor, const LocalTensor& srcTensor,
+    const LocalTensor& sharedTmpBuffer, const TransDataParams& params)
+{
+    Internal::TransDataImpl(dstTensor, srcTensor, sharedTmpBuffer, params);
+}
+// TransData without a caller-provided buffer: pops a temporary buffer with PopStackBuffer and passes it on.
+template
+__aicore__ inline void TransData(const LocalTensor& dstTensor, const LocalTensor& srcTensor,
+    const TransDataParams& params)
+{
+    // Only for AI Vector Core.
+    if ASCEND_IS_AIC {
+        return;
+    }
+    LocalTensor tmp;
+    const bool ret = PopStackBuffer(tmp);
+    ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); // a false ret is reported as an error
+
+    TransData(dstTensor, srcTensor, tmp, params);
+}
+} // namespace AscendC
+#endif // __CCE_AICORE__ == 220
+#endif // LIB_TRANSDATA_TRANSDATA_H
\ No newline at end of file
diff --git a/lib/transdata/transdata_tiling.h b/lib/transdata/transdata_tiling.h
new file mode 100644
index 00000000..87559a51
--- /dev/null
+++ b/lib/transdata/transdata_tiling.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file transdata_tiling.h
+ * \brief
+ */
+#ifndef LIB_TRANSDATA_TRANSDATA_TILING_H
+#define LIB_TRANSDATA_TRANSDATA_TILING_H
+#include <cstdint>
+#include "graph/tensor.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace AscendC {
+/*
+ * @brief DataFormat
+*/
+#ifndef ASCC_ENUM_DATAFORMAT
+#define ASCC_ENUM_DATAFORMAT
+enum class DataFormat : uint8_t {
+    ND = 0,
+    NZ,
+    NCHW,
+    NC1HWC0,
+    NHWC,
+    NCDHW,
+    NDC1HWC0,
+    FRACTAL_Z_3D,
+};
+#endif // ASCC_ENUM_DATAFORMAT
+
+#ifndef ASCC_PARAM_TRANSDATACONFIG
+#define ASCC_PARAM_TRANSDATACONFIG
+struct TransDataConfig {
+    DataFormat srcFormat;
+    DataFormat dstFormat;
+};
+#endif // ASCC_PARAM_TRANSDATACONFIG
+
+/*!
+ * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied.
+ * The developer selects a proper space size based on this range as the tiling parameter.
+ *
+ * \param [in] platform, targeted platform information
+ * \param [in] srcShape, src tensor shape
+ * \param [in] dstShape, dst tensor shape
+ * \param [in] dataType, actual data type of the input
+ * \param [in] config, transdata config
+ * \param [out] maxValue, maximum temporary space required
+ * \param [out] minValue, minimum temporary space required
+ * \return whether the max/min values were obtained successfully
+ */
+bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform,
+    const ge::Shape &srcShape,
+    const ge::Shape &dstShape,
+    const ge::DataType dataType,
+    const TransDataConfig &config,
+    uint32_t &maxValue, uint32_t &minValue);
+} // AscendC
+#endif // LIB_TRANSDATA_TRANSDATA_TILING_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e1f9cd37..6e91de48 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -404,7 +404,8 @@ file(GLOB ASCENDC_TILING_SRC_FILES
     ${ASCENDC_API_DIR}/impl/quantization/quant/*.cpp
     ${ASCENDC_API_DIR}/impl/sort/topk/*.cpp
     ${ASCENDC_API_DIR}/impl/reduce/reduce_tiling.cpp
     ${ASCENDC_API_DIR}/impl/normalization/layernormV2/*.cpp
+    ${ASCENDC_API_DIR}/impl/transdata/transdata_tiling.cpp
 )
 
 # ascendc_tiling_utest
@@ -460,11 +461,12 @@ foreach(product_type ${PRODUCT_TYPE_LIST})
     ${ASCENDC_API_DIR}/lib/reduce/
     ${ASCENDC_API_DIR}/lib/select/
     ${ASCENDC_API_DIR}/lib/transpose/
     ${ASCENDC_API_DIR}/lib/matmul/
     ${ASCENDC_API_DIR}/lib/math/
     ${ASCENDC_API_DIR}/lib/normalization/
     ${ASCENDC_API_DIR}/lib/quantization/
     ${ASCENDC_API_DIR}/lib/sort/
+    ${ASCENDC_API_DIR}/lib/transdata/
     ${ASCENDC_TESTS_DIR}/common/
 )
 
diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 50d4faa0..cc0a7cec 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -13,6 +13,7 @@ #define private public #define protected public #include "lib/activation/softmax_tiling.h" +#include "lib/transdata/transdata_tiling.h" // temp for upload code #include "tiling_api.h" #include "platform_stub.h" #include "impl/matmul/tiling/math_util.h" @@ -33,6 +34,1310 @@ protected: }; +<<<<<<< HEAD +======= +#if defined(__DAV_C310__) || defined(__DAV_310R6__) +TEST_F(TestTiling, TestSoftMaxTiling) +{ + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + std::vector shapeDims = { 128, 128 }; + optiling::SoftMaxTiling tilingData; + auto softmaxShape = ge::Shape(shapeDims); + uint32_t softmaxTmpSize = 100 * 1024 * 4; + uint32_t softmaxNeedMinSize = GetSoftMaxMinTmpSize(ascendcPlatform, softmaxShape, 2, true); + EXPECT_EQ(softmaxNeedMinSize, 128 * (16 + 128) * 4); + uint32_t softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 2, true, true); + EXPECT_EQ(softmaxFlashNeedMinSize, (16 * 4 + 128 * 2) * 4); + softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 4, true, true); + EXPECT_EQ(softmaxFlashNeedMinSize, (8 * 4 + 128 * 2) * 4); +
softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 4, false, true); + EXPECT_EQ(softmaxFlashNeedMinSize, (8 + 128 + 64) * 4); + uint32_t softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 2, true, true); + EXPECT_EQ(softmaxGradNeedMinSize, (16 * 2 + 128 * 3 + 64) * 4); + softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 4, true, true); + EXPECT_EQ(softmaxGradNeedMinSize, (8 + 128 + 64) * 4); + softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 0, true, true); + EXPECT_EQ(softmaxGradNeedMinSize, 0); + + uint32_t softmaxNeedMaxSize = GetSoftMaxMaxTmpSize(ascendcPlatform, softmaxShape, 2, true); + EXPECT_EQ(softmaxNeedMaxSize, 128 * (16 + 128 + 64) * 4); + softmaxNeedMaxSize = GetSoftMaxMaxTmpSize(ascendcPlatform, softmaxShape, 1, true); + EXPECT_EQ(softmaxNeedMaxSize, 0); + uint32_t softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 2, true, true); + EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (16 * 4 + 128 * 2) * 4); + softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 4, false, true); + EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (8 + 128 + 64) * 4); + softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 4, true, true); + EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (8 * 4 + 128 * 2) * 4); + softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 1, true, true); + EXPECT_EQ(softmaxFlashNeedMaxSize, 0); + uint32_t softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 2, true, true); + EXPECT_EQ(softmaxGradNeedMaxSize, 128 * (16 * 2 + 128 * 3 + 64) * 4); + softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 4, true, true); + EXPECT_EQ(softmaxGradNeedMaxSize, 128 * (8 + 128 + 64) * 4); + softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 1, true, true); + EXPECT_EQ(softmaxGradNeedMaxSize, 0); + SoftMaxTilingFunc(softmaxShape, 2, softmaxTmpSize, tilingData); + EXPECT_EQ(tilingData.get_reduceM(), 64); + bool flag = 
IsBasicBlockInSoftMax(tilingData); + EXPECT_EQ(flag, true); + SoftMaxFlashTilingFunc(softmaxShape, 2, 77952, tilingData, true); + EXPECT_EQ(tilingData.get_reduceM(), 32); + SoftMaxFlashTilingFunc(softmaxShape, 2, 77952, tilingData, false); + EXPECT_EQ(tilingData.get_reduceM(), 64); + SoftMaxGradTilingFunc(softmaxShape, 2, softmaxTmpSize, tilingData, false); + EXPECT_EQ(tilingData.get_reduceM(), 64); + SoftMaxGradTilingFunc(softmaxShape, 4, softmaxTmpSize, tilingData, false); + EXPECT_EQ(tilingData.get_reduceM(), 64); + SoftMaxGradTilingFunc(softmaxShape, 2, 133120, tilingData, true); + EXPECT_EQ(tilingData.get_reduceM(), 64); +} + +TEST_F(TestTiling, TestSoftMaxFlashV2TilingMaxMinTmpSize) +{ + uint32_t softmaxflashV2NeedMinLength = 0; + uint32_t softmaxflashV2NeedMaxLength = 0; + + std::vector shapeDims = { 3, 3, 448 }; + auto softmaxShape = ge::Shape(shapeDims); + uint32_t dataTypeSize1 = 2; + uint32_t dataTypeSize2 = 2; + uint32_t isUpdate = 0; + uint32_t isBasicBlock = 0; + uint32_t isFlashOutputBrc = 1; + + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 17504); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 19008); + + shapeDims = {7, 1072}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 31296); + + 
softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 32256); + + shapeDims = {1, 2, 3, 1, 2, 1, 16}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 2240); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 4608); + + shapeDims = {2, 6, 1, 16}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 2240); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 4608); + + shapeDims = {6, 1664}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 41184); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + 
EXPECT_EQ(softmaxflashV2NeedMaxLength, 41856); + + shapeDims = {2, 1760 }; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 15200); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 15200); + + shapeDims = {1, 5536 }; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 23232); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 23232); + + shapeDims = {2, 2, 2352}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 38816); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 38912); + + shapeDims = {2, 2, 2, 480 }; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 0; + isBasicBlock = 0; + 
isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 16672); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 17920); + + shapeDims = {2, 3632}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 1; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 29440); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 29824); + + shapeDims = {2, 4, 96}; + softmaxShape = ge::Shape(shapeDims); + dataTypeSize1 = 2; + dataTypeSize2 = 2; + isUpdate = 1; + isBasicBlock = 0; + isFlashOutputBrc = 1; + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 3840); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 6144); + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, 1, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 0); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, 1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); 
+ EXPECT_EQ(softmaxflashV2NeedMaxLength, 0); + + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, 1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMinLength, 0); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, 1, isUpdate, isBasicBlock, isFlashOutputBrc); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 0); +} + +TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) +{ + std::vector shapeDims = { 128, 128 }; + optiling::SoftMaxTiling tilingData; + auto softmaxShape = ge::Shape(shapeDims); + uint32_t maxSumTypeSize = 2; + uint32_t inputTypeSize = 2; + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t softmaxflashV2NeedMinLength = + GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, false); + EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (16 + 128) * 4); + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); + EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (128 + 16)) * 4); + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); + EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (128 + 16) * 4); + + uint32_t softmaxflashV2NeedMaxLength = + GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, false); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16) * 4); + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16 * 2) * 4); + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, 
maxSumTypeSize, false, true); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16) * 4); + + maxSumTypeSize = 4; + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); + EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (128 + 16 + 8)) * 4); + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); + EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (128 + 8) * 4); + + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 8 * 2) * 4); + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 8) * 4); + + uint32_t workLength = 100 * 1024; + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, false, false); + EXPECT_EQ(tilingData.get_reduceM(), 120); + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, false, true); + EXPECT_EQ(tilingData.get_reduceM(), 64); + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, false); + EXPECT_EQ(tilingData.get_reduceM(), 120); + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); + EXPECT_EQ(tilingData.get_reduceM(), 64); + + inputTypeSize = 4; + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (16)) * 4); + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + 
EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (64 + 8) * 4); + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); + EXPECT_EQ(tilingData.get_reduceM(), 64); +} + +TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) +{ + std::vector shapeDims = { 8, 1024 }; + optiling::SoftMaxTiling tilingData; + auto softmaxShape = ge::Shape(shapeDims); + uint32_t maxSumTypeSize = 4; + uint32_t inputTypeSize = 4; + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t softmaxflashV2NeedMinLength = + GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + EXPECT_EQ(softmaxflashV2NeedMinLength, (64 + 8 * (16)) * 4); + uint32_t softmaxflashV2NeedMaxLength = + GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 8*(8 + 64) * 4); + + uint32_t workLength = 32 * 1024; + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); + EXPECT_EQ(tilingData.get_reduceM(), 8); + + inputTypeSize = 2; + workLength = 64 * 1024; + softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + EXPECT_EQ(softmaxflashV2NeedMinLength, (64 + 8 * (16 + 1024 + 8)) * 4); + softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); + EXPECT_EQ(softmaxflashV2NeedMaxLength, 8 * (8 + 1024 + 64) * 4); + SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); + EXPECT_EQ(tilingData.get_reduceM(), 8); +} + +TEST_F(TestTiling, TestWelfordUpdateTiling) +{ + std::vector shapeDims1d = {1, 128}; + auto shape1d = ge::Shape(shapeDims1d); + uint32_t maxSize = 0; + uint32_t minSize = 0; 
+ uint32_t dtypeTSize = sizeof(half); + uint32_t dtypeUSize = sizeof(float); + bool isReuseSource = false; + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + GetWelfordUpdateMaxMinTmpSize(shape1d, dtypeTSize, dtypeUSize, isReuseSource, false, ascendcPlatform, maxSize, minSize); + EXPECT_EQ(minSize, 0); + EXPECT_EQ(maxSize, 0); +} + +TEST_F(TestTiling, TestWelfordFinalizeTiling) +{ + std::vector shapeDims1d = {64}; + auto shape1d = ge::Shape(shapeDims1d); + uint32_t maxSize = 0; + uint32_t minSize = 0; + uint32_t dtypeSize = sizeof(float); + bool isReuseSource = false; + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + GetWelfordFinalizeMaxMinTmpSize(shape1d, dtypeSize, isReuseSource, ascendcPlatform, maxSize, minSize); + EXPECT_EQ(minSize, 768); + EXPECT_EQ(maxSize, 768); +} + +TEST_F(TestTiling, TestLayerNormRstdTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = sizeof(float); + std::vector shapeDims = {128, 88}; + auto layernormShape = ge::Shape(shapeDims); + bool isReuseSource = false; + bool isComputeRstd = true; + bool isOnlyOutput = false; + optiling::LayerNormSeparateTiling tiling; + gert::TilingContext* context = fe::GetFakeTilingContext(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t minValue = 0; + uint32_t maxValue = 0; + GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, isComputeRstd, isOnlyOutput, ascendcPlatform, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * typeSize + 128 * typeSize); + EXPECT_EQ(minValue, 128 * typeSize + 128 * typeSize); + GetNormalizeMaxMinTmpSize(layernormShape, typeSize, typeSize, isReuseSource, isComputeRstd, isOnlyOutput, ascendcPlatform, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + 
GetLayerNormNDTilingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, isComputeRstd, ascendcPlatform, tiling); + EXPECT_EQ(tiling.get_rLength(), 88); + EXPECT_EQ(tiling.get_rHeadLength(), 64); +} + +TEST_F(TestTiling, TestAntiquantTilingNoTransposeFP4) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1, 5120 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = false; + uint32_t maxValue; + uint32_t minValue; + GetAscendAntiQuantMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); +} + +TEST_F(TestTiling, TestAntiquantTilingTransposeFP4) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1, 5120 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = true; + uint32_t maxValue; + uint32_t minValue; + GetAscendAntiQuantMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); + EXPECT_EQ(minValue, 10240); + EXPECT_EQ(maxValue, 10240); +} + +TEST_F(TestTiling, TestAntiquantizeTilingNoTransposeFP4) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1, 5120 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = false; + uint32_t maxValue; + uint32_t minValue; + GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); +} + +TEST_F(TestTiling, TestDequantizeTiling) +{ + // 2d input shape + std::vector shape_dims = {10, 32}; + auto shape = ge::Shape(shape_dims); + uint32_t maxValue; + uint32_t minValue; + + GetDequantizeMaxMinTmpSize(shape, 2, maxValue, minValue); + EXPECT_EQ(minValue, 4 * (64 + 32 + 40)); + EXPECT_EQ(maxValue, 4 * (64 + 32 * 10 + 40)); + + // 1d input shape + 
std::vector shape_dims_1d = {320}; + auto shape_1d = ge::Shape(shape_dims_1d); + + GetDequantizeMaxMinTmpSize(shape_1d, 2, maxValue, minValue); + EXPECT_EQ(minValue, 4 * (64 + 1 * 320 + 328)); + EXPECT_EQ(maxValue, 4 * (64 + 1 * 320 + 328)); +} + +TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerChannelHalf) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1, 5120 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = false; + uint32_t maxValue; + uint32_t minValue; + GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_FLOAT16, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); +} + +TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerChannel) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1, 5120 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = false; + uint32_t maxValue; + uint32_t minValue; + GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_BF16, maxValue, minValue); + uint32_t expectValue = 5120 * 2 * sizeof(float) + 64 * 640 * sizeof(float); + EXPECT_EQ(minValue, expectValue); + EXPECT_EQ(maxValue, expectValue); +} + +TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerTensor) +{ + std::vector srcDims = { 640, 5120 }; + auto srcShape = ge::Shape(srcDims); + std::vector offsetDSms = { 1 }; + auto offsetShape = ge::Shape(offsetDSms); + bool isTranspose = false; + uint32_t maxValue; + uint32_t minValue; + GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_BF16, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 640 * 5120 * sizeof(float)); +} + +TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutput) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_INT16; + ge::DataType indexType = 
ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = false; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 9728); + EXPECT_EQ(minValue, 9728); +} + +TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputB8) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_UINT8; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = false; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 7680); + EXPECT_EQ(minValue, 7680); +} + +TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputB64) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_INT64; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = false; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 15872); + EXPECT_EQ(minValue, 15872); +} + +TEST_F(TestTiling, testAdvanceSortTilingDescendOrder) +{ + std::vector shapeDims = { 1023 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_UINT32; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = true; + bool hasSrcIndex = false; + 
bool hasDstIndex = false; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 11776); + EXPECT_EQ(minValue, 11776); +} + +TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndex) +{ + std::vector shapeDims = { 300 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_FLOAT; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = true; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 5312); + EXPECT_EQ(minValue, 5312); +} + +TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndexForB8) +{ + std::vector shapeDims = { 300 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_UINT8; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = true; + bool isReuseSource = false; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 2112); + EXPECT_EQ(minValue, 2112); +} + +TEST_F(TestTiling, testAdvanceSortTilingWithBothSrcDstIndex) +{ + std::vector shapeDims = { 4096 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_UINT16; + ge::DataType indexType = ge::DT_UINT64; + bool isDescend = false; + bool hasSrcIndex = true; + bool hasDstIndex = true; + bool isReuseSource = false; + SortConfig 
config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 70144); + EXPECT_EQ(minValue, 70144); +} + +TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputReuseSource) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_INT16; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = false; + bool isReuseSource = true; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 7680); + EXPECT_EQ(minValue, 7680); +} + +TEST_F(TestTiling, testAdvanceSortTilingDescendOrderReuseSource) +{ + std::vector shapeDims = { 1023 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_UINT32; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = true; + bool hasSrcIndex = false; + bool hasDstIndex = false; + bool isReuseSource = true; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 7680); + EXPECT_EQ(minValue, 7680); +} + +TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndexReuseSource) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_INT32; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = false; + bool hasDstIndex = true; + bool isReuseSource = true; + SortConfig config = { SortType::RADIX_SORT, isDescend, 
hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 11776); + EXPECT_EQ(minValue, 11776); +} + +TEST_F(TestTiling, testAdvanceSortTilingWithBothSrcDstIndexReuseSource) +{ + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + ge::DataType valueType = ge::DT_INT16; + ge::DataType indexType = ge::DT_UINT32; + bool isDescend = false; + bool hasSrcIndex = true; + bool hasDstIndex = true; + bool isReuseSource = true; + SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); + + EXPECT_EQ(maxValue, 7680); + EXPECT_EQ(minValue, 7680); +} + +extern void platfrom_stub_set_chip_version(const char *num); +TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + bool isInitIndex = true; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 4; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 256); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_innerDataSize(), 128); + EXPECT_EQ(tilingData.get_sortRepeat(), 2); + EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); + EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); + EXPECT_EQ(tilingData.get_maskOffset(), 16); + EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); + EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 
40); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 1024); + EXPECT_EQ(minValue, 1024); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse_Float_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 4; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 320); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_innerDataSize(), 128); + EXPECT_EQ(tilingData.get_sortRepeat(), 2); + EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); + EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); + EXPECT_EQ(tilingData.get_maskOffset(), 16); + EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); + EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); + EXPECT_EQ(tilingData.get_srcIndexOffset(), 256); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 1280); + EXPECT_EQ(minValue, 1280); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Half_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + bool isInitIndex = true; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 2; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, 
true, tilingData); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 512); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_innerDataSize(), 256); + EXPECT_EQ(tilingData.get_sortRepeat(), 2); + EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); + EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); + EXPECT_EQ(tilingData.get_maskOffset(), 16); + EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); + EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 1024); + EXPECT_EQ(minValue, 1024); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse_Half_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 2; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 640); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_innerDataSize(), 256); + EXPECT_EQ(tilingData.get_sortRepeat(), 2); + EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); + EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); + EXPECT_EQ(tilingData.get_maskOffset(), 16); + EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); + EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); + EXPECT_EQ(tilingData.get_srcIndexOffset(), 512); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 1280); + EXPECT_EQ(minValue, 1280); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexTrue_Float_Inner64) +{ + enum 
TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = true; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 4; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, false, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 128); + EXPECT_EQ(tilingData.get_maskOffset(), 10); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 512); + EXPECT_EQ(minValue, 512); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Float_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 4; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_maskOffset(), 10); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 192); + + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 768); + EXPECT_EQ(minValue, 768); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexTrue_Half_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = true; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 2; + bool isReuseSource = true; 
+ uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 256); + EXPECT_EQ(tilingData.get_maskOffset(), 10); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 512); + EXPECT_EQ(minValue, 512); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Half_Inner64) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 2; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, false, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_maskOffset(), 10); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 384); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 768); + EXPECT_EQ(minValue, 768); +} + +TEST_F(TestTiling, TestTopkTiling_DataTypeSize0_FAILED) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 0; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + auto res = TopKTilingFunc(plat, inner, outter, 
k, dataTypeSize, isInitIndex, topkMode, false, tilingData); + EXPECT_EQ(res, false); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Half_k) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + int32_t k = 13; + uint32_t dataTypeSize = 2; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_maskOffset(), 13); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 384); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 768); + EXPECT_EQ(minValue, 768); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Float_k32) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 32; + uint32_t dataTypeSize = 4; + bool isReuseSource = true; + uint32_t maxValue = 0; + uint32_t minValue = 0; + optiling::TopkTiling tilingData; + fe::PlatFormInfos platformInfo; + auto plat = platform_ascendc::PlatformAscendC(&platformInfo); + TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); + EXPECT_EQ(tilingData.get_allDataSize(), 64); + EXPECT_EQ(tilingData.get_maskOffset(), 32); + EXPECT_EQ(tilingData.get_tmpLocalSize(), 192); + GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); + EXPECT_EQ(maxValue, 768); + EXPECT_EQ(minValue, 768); +} + +TEST_F(TestTiling, TestTopkTiling_RadixTopKModeSmall_isInitIndexFalse) +{ + enum TopKMode topkMode = TopKMode::TOPK_NSMALL; + const 
int32_t outter = 1; + const int32_t inner = 32; + const int32_t k = 10; + ge::DataType valueType = ge::DT_INT16; + bool isReuseSource = false; + bool isInitIndex = false; + TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; + + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, + true, valueType, config, maxValue, minValue); + EXPECT_EQ(maxValue, 1696); + EXPECT_EQ(minValue, 1696); +} + +TEST_F(TestTiling, TestTopkTiling_RadixTopKModeNormal_isInitIndexFalse) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + const int32_t outter = 1; + const int32_t inner = 32; + const int32_t k = 10; + ge::DataType valueType = ge::DT_INT16; + bool isReuseSource = false; + bool isInitIndex = false; + TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; + + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, + true, valueType, config, maxValue, minValue); + EXPECT_EQ(maxValue, 1696); + EXPECT_EQ(minValue, 1696); +} + +TEST_F(TestTiling, TestTopkTiling_RadixTopKModeNormal_isInitIndexTrue) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + const int32_t outter = 1; + const int32_t inner = 32; + const int32_t k = 10; + ge::DataType valueType = ge::DT_INT16; + bool isReuseSource = true; + bool isInitIndex = true; + TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; + + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, + true, valueType, config, maxValue, minValue); + EXPECT_EQ(maxValue, 1504); + EXPECT_EQ(minValue, 1504); +} + +TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse) +{ + enum TopKMode topkMode = TopKMode::TOPK_NORMAL; + bool isInitIndex = false; + const int32_t outter = 1; + const int32_t inner = 64; + const int32_t k = 10; + uint32_t dataTypeSize = 4; + bool 
isReuseSource = true;
+    uint32_t maxValue = 0;
+    uint32_t minValue = 0;
+    optiling::TopkTiling tilingData;
+    fe::PlatFormInfos platformInfo;
+    auto plat = platform_ascendc::PlatformAscendC(&platformInfo);
+
+    GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue);
+    EXPECT_EQ(maxValue, 1280);
+    EXPECT_EQ(minValue, 1280);
+}
+
+// Checks the max/min temporary-buffer sizes written by GetPowerMaxMinTmpSize for a {1, 512}
+// operand paired with itself, with a {1} operand, and with the two operands swapped.
+// Every call is followed by one assertion on maxVal and one on minVal.
+TEST_F(TestTiling, TestPowerTiling)
+{
+    std::vector shapeDims = { 1, 512 };
+    auto powerShape = ge::Shape(shapeDims);
+    // Zero-initialised like the outputs in the other tests of this file, so a call that
+    // leaves them untouched is still read deterministically.
+    uint32_t maxVal = 0;
+    uint32_t minVal = 0;
+
+    GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    // NOTE(review): expected min mirrors the min == max pattern asserted for the other
+    // non-zero cases below - confirm on the first run.
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+    GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 0);
+    EXPECT_EQ(minVal, 0);
+    GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+
+    std::vector scalar_shape = { 1 };
+    auto scalarShape = ge::Shape(scalar_shape);
+    GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    // NOTE(review): same assumption as above (min == max) - confirm on the first run.
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+    GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 0);
+    EXPECT_EQ(minVal, 0);
+    GetPowerMaxMinTmpSize(scalarShape, powerShape, false, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+}
+
+// Checks the (maxLiveNodeCnt, extraBuf) pair written by GetPowerTmpBufferFactorSize.
+// Both outputs start at 0xffff, so a call that does not write them cannot pass.
+TEST_F(TestTiling, TestPowerTilingFactorSize)
+{
+    uint32_t maxLiveNodeCnt = 0xffff;
+    uint32_t extraBuf = 0xffff;
+
+    GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf);
+    EXPECT_EQ(maxLiveNodeCnt, 2);
+    EXPECT_EQ(extraBuf, 0);
+
+    GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf);
+    EXPECT_EQ(maxLiveNodeCnt, 4);
+    EXPECT_EQ(extraBuf, 0);
+
+    GetPowerTmpBufferFactorSize(false, true, true, 2, maxLiveNodeCnt, extraBuf);
+    EXPECT_EQ(maxLiveNodeCnt,
0);
+    EXPECT_EQ(extraBuf, 0);
+}
+
+// Checks the max/min temporary-buffer sizes written by the PowerConfig overload of
+// GetPowerMaxMinTmpSize: the INTRINSIC config is expected to report 0 for both sizes,
+// the DOUBLE_FLOAT_TECH config is exercised with {1, 512} and {1} operands.
+// Every call is followed by one assertion on maxVal and one on minVal.
+TEST_F(TestTiling, TestPowerTilingWithConfig)
+{
+    std::vector shapeDims = { 1, 512 };
+    auto powerShape = ge::Shape(shapeDims);
+    // Zero-initialised like the outputs in the other tests of this file, so a call that
+    // leaves them untouched is still read deterministically.
+    uint32_t maxVal = 0;
+    uint32_t minVal = 0;
+
+    AscendC::PowerConfig intrinsicConfig = { AscendC::PowerAlgo::INTRINSIC };
+    AscendC::PowerConfig doubleFloatTechConfig = { AscendC::PowerAlgo::DOUBLE_FLOAT_TECH };
+    fe::PlatFormInfos platformInfo;
+    auto plat = platform_ascendc::PlatformAscendC(&platformInfo);
+    GetPowerMaxMinTmpSize(plat, intrinsicConfig, powerShape, powerShape, false, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 0);
+    EXPECT_EQ(minVal, 0);
+    GetPowerMaxMinTmpSize(plat, intrinsicConfig, powerShape, powerShape, true, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 0);
+    EXPECT_EQ(minVal, 0);
+
+    std::vector scalar_shape = { 1 };
+    auto scalarShape = ge::Shape(scalar_shape);
+    GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, powerShape, scalarShape, false, 2, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    // NOTE(review): expected min mirrors the min == max pattern asserted for the last
+    // call of this test - confirm on the first run.
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+    GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, powerShape, scalarShape, true, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 0);
+    EXPECT_EQ(minVal, 0);
+    GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, scalarShape, powerShape, false, 4, false, maxVal, minVal);
+    EXPECT_EQ(maxVal, 512 * 4 * 2);
+    EXPECT_EQ(minVal, 512 * 4 * 2);
+}
+
+// Checks the (maxLiveNodeCnt, extraBuf) pair written by the PowerConfig overload of
+// GetPowerTmpBufferFactorSize. Both outputs start at 0xffff, so a call that does not
+// write them cannot pass.
+TEST_F(TestTiling, TestPowerTilingFactorSizeWithConfig)
+{
+    uint32_t maxLiveNodeCnt = 0xffff;
+    uint32_t extraBuf = 0xffff;
+
+    AscendC::PowerConfig intrinsicConfig = { AscendC::PowerAlgo::INTRINSIC };
+    AscendC::PowerConfig doubleFloatTechConfig = { AscendC::PowerAlgo::DOUBLE_FLOAT_TECH };
+    fe::PlatFormInfos platformInfo;
+    auto plat = platform_ascendc::PlatformAscendC(&platformInfo);
+
+    GetPowerTmpBufferFactorSize(plat, intrinsicConfig, false, true, false, 4, maxLiveNodeCnt, extraBuf);
+    EXPECT_EQ(maxLiveNodeCnt, 0);
+    EXPECT_EQ(extraBuf, 0);
+
+    GetPowerTmpBufferFactorSize(plat, intrinsicConfig, false,
true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + + GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, false, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); + + GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); + + GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, true, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCosTilingFloatWithConfig) +{ + std::vector shapeDims = { 128, 128 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + AscendC::CosConfig polyConfig = { AscendC::CosAlgo::POLYNOMIAL_APPROXIMATION }; + AscendC::CosConfig radinConfig = { AscendC::CosAlgo::RADIAN_REDUCTION }; + + AscendC::GetCosMaxMinTmpSize(polyConfig, cosShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + AscendC::GetCosMaxMinTmpSize(radinConfig, cosShape, 4, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4 + 32); + + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(polyConfig, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetCosTmpBufferFactorSize(radinConfig, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 32); +} + +TEST_F(TestTiling, TestCosTilingHalfWithConfig) +{ + std::vector shapeDims = { 512 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + AscendC::CosConfig polyConfig = { AscendC::CosAlgo::POLYNOMIAL_APPROXIMATION }; + AscendC::CosConfig radinConfig = { AscendC::CosAlgo::RADIAN_REDUCTION }; + + AscendC::GetCosMaxMinTmpSize(polyConfig, cosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + 
EXPECT_EQ(minValue, 0); + + AscendC::GetCosMaxMinTmpSize(radinConfig, cosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 4 + 32); + EXPECT_EQ(minValue, 512 * 2 * 4 + 32); + + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(polyConfig, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetCosTmpBufferFactorSize(radinConfig, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestHypotTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetHypotMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetHypotTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinTilingFloatWithConfig) +{ + std::vector shapeDims = { 128, 128 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + AscendC::SinConfig polyConfig = { AscendC::SinAlgo::POLYNOMIAL_APPROXIMATION }; + AscendC::SinConfig radinConfig = { AscendC::SinAlgo::RADIAN_REDUCTION }; + + AscendC::GetSinMaxMinTmpSize(polyConfig, sinShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + AscendC::GetSinMaxMinTmpSize(radinConfig, sinShape, 4, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4 + 32); + + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetSinTmpBufferFactorSize(polyConfig, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetSinTmpBufferFactorSize(radinConfig, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 32); +} + +TEST_F(TestTiling, TestSinTilingHalfWithConfig) +{ + std::vector shapeDims = { 
512 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + AscendC::SinConfig polyConfig = { AscendC::SinAlgo::POLYNOMIAL_APPROXIMATION }; + AscendC::SinConfig radinConfig = { AscendC::SinAlgo::RADIAN_REDUCTION }; + + AscendC::GetSinMaxMinTmpSize(polyConfig, sinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + + AscendC::GetSinMaxMinTmpSize(radinConfig, sinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 4 + 32); + EXPECT_EQ(minValue, 512 * 2 * 4 + 32); + + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetSinTmpBufferFactorSize(polyConfig, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetSinTmpBufferFactorSize(radinConfig, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestConfusionTransposeTiling) +{ + const uint32_t stackBufferSize = 0; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 32, 64, 128 }; + auto srcShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 13, maxValue, minValue); + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 14, maxValue, minValue); + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 15, maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); + + optiling::ConfusionTransposeTiling tiling; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 13, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 14, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 15, tiling); +} + +#else +extern void platfrom_stub_set_chip_version(const char *num); +>>>>>>> 4a0a42bb (update) TEST_F(TestTiling, MultiCoreSmallMN) { matmul_tiling::MultiCoreMatmulTiling 
rnnMatmul3,rnnMatmul4,rnnMatmul5; @@ -5400,6 +6705,70 @@ TEST_F(TestTiling, TestOneElementBroadCast200) } #endif +TEST_F(TestTiling, testTransDataTilingUnalignedHw) +{ + platfrom_stub_set_chip_version("Ascend910B"); + uint32_t maxSize; + uint32_t minSize; + auto ncdhwShape = ge::Shape({ 16, 16, 3, 3, 3 }); + auto ndc1hwc0Shape = ge::Shape({ 16, 3, 1, 3, 3, 16}); + auto fractalzShape = ge::Shape({ 3, 1, 3, 3, 1, 16, 16}); + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0}; + bool ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, ndc1hwc0Shape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 1632); + EXPECT_EQ(minSize, 1632); + + config = {DataFormat::NDC1HWC0, DataFormat::NCDHW}; + ret = GetTransDataMaxMinTmpSize(plat, ndc1hwc0Shape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 2048); + EXPECT_EQ(minSize, 2048); + + config = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D}; + ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, fractalzShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 26112); + EXPECT_EQ(minSize, 26112); +} + +TEST_F(TestTiling, testTransDataTilingAlignedHw) +{ + platfrom_stub_set_chip_version("Ascend910B"); + uint32_t maxSize; + uint32_t minSize; + auto ncdhwShape = ge::Shape({ 5, 30, 2, 4, 8 }); + auto ndc1hwc0Shape = ge::Shape({ 5, 2, 2, 4, 8, 16}); + auto fractalzShape = ge::Shape({ 2, 2, 4, 8, 1, 16, 16}); + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0}; + bool ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, ndc1hwc0Shape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 4224); + EXPECT_EQ(minSize, 4224); + 
+ config = {DataFormat::NDC1HWC0, DataFormat::NCDHW}; + ret = GetTransDataMaxMinTmpSize(plat, ndc1hwc0Shape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 4608); + EXPECT_EQ(minSize, 4608); + + config = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D}; + ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, fractalzShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, 69376); + EXPECT_EQ(minSize, 69376); +} + TEST_F(TestTiling, TestReduceXorSumTilingInt16) { std::vector shapeDims = { 128, 128 }; -- Gitee From 28a54fa25ada7088ad91c44eb58675df6cd20aff Mon Sep 17 00:00:00 2001 From: chen-yiyuan Date: Tue, 20 May 2025 11:17:54 +0800 Subject: [PATCH 3/5] add transdata high api fractal_z_3d to ncdhw --- impl/transdata/transdata_common_impl.h | 29 +++ impl/transdata/transdata_impl.h | 231 +++++++++++++++-- impl/transdata/transdata_tiling.cpp | 90 +++++-- lib/tiling_api.h | 90 +++++++ tests/CMakeLists.txt | 88 +++++++ tests/tiling/test_tiling.cpp | 49 +++- tests/transdata/test_operator_transdata.cpp | 267 ++++++++++++++++++++ tiling/tiling_api.h | 90 +++++++ 8 files changed, 882 insertions(+), 52 deletions(-) create mode 100644 impl/transdata/transdata_common_impl.h create mode 100644 lib/tiling_api.h create mode 100644 tests/transdata/test_operator_transdata.cpp create mode 100644 tiling/tiling_api.h diff --git a/impl/transdata/transdata_common_impl.h b/impl/transdata/transdata_common_impl.h new file mode 100644 index 00000000..b1289dc7 --- /dev/null +++ b/impl/transdata/transdata_common_impl.h @@ -0,0 +1,29 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H +#define IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H + +namespace AscendC { +template +struct TransDataParams { + T srcLayout; + U dstLayout; +}; + +#ifndef ASCC_PARAM_TRANSDATACONFIG +#define ASCC_PARAM_TRANSDATACONFIG +struct TransDataConfig { + DataFormat srcFormat; + DataFormat dstFormat; +}; +#endif // ASCC_PARAM_TRANSDATACONFIG +} // namespace AscendC + +#endif // IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H \ No newline at end of file diff --git a/impl/transdata/transdata_impl.h b/impl/transdata/transdata_impl.h index 37966c8a..33571fab 100644 --- a/impl/transdata/transdata_impl.h +++ b/impl/transdata/transdata_impl.h @@ -13,34 +13,209 @@ #include "kernel_tensor.h" #include "kernel_operator_intf.h" #include "kernel_tiling/kernel_tiling.h" +#include "transdata_common_impl.h" +#include "../common/check.h" +#include "../api_check/kernel_api_check.h" namespace AscendC { +namespace Internal { -template -struct TransDataParams { - T srcLayout; - U dstLayout; -}; - -#ifndef ASCC_PARAM_TRANSDATACONFIG -#define ASCC_PARAM_TRANSDATACONFIG -struct TransDataConfig { - DataFormat srcFormat; - DataFormat dstFormat; -}; -#endif // ASCC_PARAM_TRANSDATACONFIG +namespace { +constexpr int32_t n0 = 16; +constexpr int32_t c0 = 16; +constexpr int32_t hw0 = 16; +} -namespace Internal { struct TransDataTmpParams { int32_t n; int32_t c; int32_t d; int32_t h; int32_t w; + int32_t n1; + int32_t c1; + int32_t padHw; }; constexpr int32_t DEFAULT_TRANSDATA_5HD_LIST = 16; +template +__aicore__ inline void DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign(const LocalTensor& dst, const LocalTensor& src, + const TransDataTmpParams& params) +{ + // d, c1, h w n1 n0 c0 -> 
c1, d, hw1*hw0 n1 n0 c0 + int32_t d = params.d; + int32_t h = params.h; + int32_t w = params.w; + int32_t n1 = params.n1; + int32_t c1 = params.c1; + int32_t padHw = params.padHw; + + uint32_t dim0 = d; + uint32_t dim1 = c1; + uint32_t lastDim = h * w * n1 * n0 * c0; + + // dim0, dim1, lastDim -> dim1, dim0, lastDim + int32_t n1n0c0DimElems = n1 * n0 * c0; + int32_t hwAlignElems = padHw * n1n0c0DimElems; + int32_t hwPadElems = (padHw - h * w) * n1n0c0DimElems; + + uint16_t blockCount = dim1; + uint16_t blockLen = lastDim * sizeof(T) / ONE_BLK_SIZE; + uint16_t srcGap = 0; + uint16_t dstGap = ((dim0 - 1) * hwAlignElems + hwPadElems) * sizeof(T) / ONE_BLK_SIZE; + + uint32_t dstSize = c1 * d * padHw * n1 * n0 * c0; + Duplicate(dst, static_cast(0), dstSize); + PipeBarrier(); + + DataCopyParams dataCopyParams = { blockCount, blockLen, srcGap, dstGap }; + for (uint32_t d0 = 0; d0 < dim0; d0++) { + DataCopy(dst[d0 * hwAlignElems], src[d0 * dim1 * lastDim], dataCopyParams); + } + PipeBarrier(); +} + +template +__aicore__ inline void C1Dhwn1n0c0ToC1C0Dhwn1n0(const LocalTensor& dst, const LocalTensor& src, + const TransDataTmpParams& params) +{ + // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0 + int32_t d = params.d; + int32_t n1 = params.n1; + int32_t c1 = params.c1; + int32_t padHw = params.padHw; + + TransDataTo5HDParams transDataParams; + transDataParams.dstHighHalf = false; + transDataParams.srcHighHalf = false; + transDataParams.repeatTimes = d * padHw * n1; + if (transDataParams.repeatTimes == 1) { + transDataParams.srcRepStride = 0; + transDataParams.dstRepStride = 0; + } else { + transDataParams.srcRepStride = DEFAULT_TRANSDATA_5HD_LIST * c0 * sizeof(T) / ONE_BLK_SIZE; + transDataParams.dstRepStride = n0 * sizeof(T) / ONE_BLK_SIZE; + } + + uint64_t srcOffsetArr[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t dstOffsetArr[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t srcAddr = (uint64_t)src.GetPhyAddr(); + uint64_t dstAddr = (uint64_t)dst.GetPhyAddr(); + for (uint32_t j = 0; j < c1; j++) { + 
uint32_t outOffset = j * d * padHw * n1 * n0 * c0; + for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) { + srcOffsetArr[i] = (uint64_t)(srcAddr + (outOffset + i * n0) * sizeof(T)); + dstOffsetArr[i] = (uint64_t)(dstAddr + (outOffset + i * d * padHw * n1 * n0) * sizeof(T)); + } + TransDataTo5HD(dstOffsetArr, srcOffsetArr, transDataParams); + } + PipeBarrier(); +} + +template +__aicore__ inline void C1c0dhwN1n0ToNcdhw(const LocalTensor& dst, const LocalTensor& src, + const LocalTensor& tmp, const TransDataTmpParams& params) +{ + // C1C0DHW N1N0 -> N CDHW + int32_t d = params.d; + int32_t n1 = params.n1; + int32_t padHw = params.padHw; + int32_t currN = params.n; + int32_t c = params.c; + + TransDataTo5HDParams transDataParams; + transDataParams.dstHighHalf = false; + transDataParams.srcHighHalf = false; + transDataParams.repeatTimes = c * d * padHw / n0; + if (transDataParams.repeatTimes == 1) { + transDataParams.srcRepStride = 0; + transDataParams.dstRepStride = 0; + } else { + transDataParams.srcRepStride = DEFAULT_TRANSDATA_5HD_LIST * n1 * n0 * sizeof(T) / ONE_BLK_SIZE; + transDataParams.dstRepStride = c0 * sizeof(T) / ONE_BLK_SIZE; + } + + uint64_t srcOffsetArr[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t dstOffsetArr[DEFAULT_TRANSDATA_5HD_LIST]; + uint64_t srcAddr = (uint64_t)src.GetPhyAddr(); + uint64_t dstAddr = (uint64_t)dst.GetPhyAddr(); + uint64_t tmpAddr = (uint64_t)tmp.GetPhyAddr(); + for (uint32_t j = 0; j < n1; j++) { + if (n0 - currN > 0) { + for (uint8_t i = 0; i < currN; i++) { + dstOffsetArr[i] = (uint64_t)(dstAddr + (j * d * c * padHw * n0 + i * c * d * padHw) * sizeof(T)); + } + for (uint8_t i = currN; i < DEFAULT_TRANSDATA_5HD_LIST; i++) { + dstOffsetArr[i] = (uint64_t)(tmpAddr + i * ONE_BLK_SIZE * sizeof(T)); + } + } else { + for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) { + dstOffsetArr[i] = (uint64_t)(dstAddr + (j * d * c * padHw * n0 + i * c * d * padHw) * sizeof(T)); + } + } + for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; 
i++) { + srcOffsetArr[i] = (uint64_t)(srcAddr + (j * n0 + i * n0 * n1) * sizeof(T)); + } + TransDataTo5HD(dstOffsetArr, srcOffsetArr, transDataParams); + currN -= n0; + } + PipeBarrier(); +} + +template +__aicore__ inline void N1n0C1c0DHWToNCDHW(const LocalTensor& dst, const LocalTensor& src, + const TransDataTmpParams& params) +{ + // N1N0 C1C0 D H W -> N C D H W + int32_t n = params.n; + int32_t c = params.c; + int32_t d = params.d; + int32_t c1 = params.c1; + int32_t padHw = params.padHw; + + uint16_t blockCount = n; + uint16_t blockLen = (c * (d * padHw)) * sizeof(T) / ONE_BLK_SIZE; + uint16_t srcGap = ((c1 * c0 - c) * (d * padHw)) * sizeof(T) /ONE_BLK_SIZE; + uint16_t dstGap = 0; + DataCopyParams dataCopyParams = { blockCount, blockLen, srcGap, dstGap }; + DataCopy(dst, src, dataCopyParams); + PipeBarrier(); +} + +template +__aicore__ inline void TransDataFractalToNcdhw(const LocalTensor& dst, const LocalTensor& src, + const LocalTensor& tmpBuffer, const TransDataTmpParams& params) +{ + int32_t d = params.d; + int32_t n1 = params.n1; + int32_t c1 = params.c1; + int32_t padHw = params.padHw; + int32_t n = params.n; + int32_t c = params.c; + + LocalTensor tmp = tmpBuffer.template ReinterpretCast(); + LocalTensor srcTmp = src.template ReinterpretCast(); + if (c == c1 * c0 && n == n1 * n0) { + LocalTensor dstTmp = dst.template ReinterpretCast(); + // D C1 HWN1N0C0 -> C1 D HWN1N0C0 (H*W 32B ALIGN -> HW1*HW0) + DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign(dstTmp, srcTmp, params); + // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0 + C1Dhwn1n0c0ToC1C0Dhwn1n0(tmp, dstTmp, params); + // C1C0DHW N1N0 -> N CDHW + C1c0dhwN1n0ToNcdhw(dstTmp, tmp, tmp, params); + } else { + LocalTensor transDataTmp = tmp[n1 * n0 * c1 * c0 * d * padHw]; + LocalTensor dstTmp = dst.template ReinterpretCast(); + // D C1 HWN1N0C0 -> C1 D HWN1N0C0 (H*W 32B ALIGN -> HW1*HW0) + DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign(tmp, srcTmp, params); + // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0 + C1Dhwn1n0c0ToC1C0Dhwn1n0(transDataTmp, tmp, params); + // 
C1C0DHW N1N0 -> N CDHW + C1c0dhwN1n0ToNcdhw(dstTmp, transDataTmp, tmp, params); + } +} + // Transdata NCDHW -> FRACTAL_Z_3D template __aicore__ inline void TransDataImplMode1(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, @@ -70,7 +245,7 @@ __aicore__ inline void TransDataImplMode1(const LocalTensor& dst, const Local transDataParams.dstHighHalf = false; transDataParams.srcHighHalf = false; transDataParams.repeatTimes = currAxis / elePerBlk; - // if repeat = 1, start offset is auto incremental by stride. + // if repeat = 1, start offset is auto incremental by stride. transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : n1 * n0; transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1; @@ -232,7 +407,7 @@ __aicore__ inline void TransDataImplMode3(const LocalTensor& dst, const Local uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr(); uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr(); uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr(); - + int32_t axisHwd = h * w * d; int32_t axisHwc0 = h * w * c0; int32_t axisC1hwc0 = axisHwc0 * c1; @@ -295,19 +470,27 @@ __aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const Loca static_assert(Std::is_tuple_v, "it must be a shape."); static_assert(Std::is_tuple_v, "it must be a shape."); + CHECK_FUNC_HIGHLEVEL_API(TransData, (config, T, U, S), (dstTensor, srcTensor, sharedTmpBuffer, params)); auto ncdhwShape = config.srcFormat == DataFormat::NCDHW ? 
params.srcLayout.GetShape() : params.dstLayout.GetShape(); + int32_t n = Std::get<0>(ncdhwShape); + int32_t c = Std::get<1>(ncdhwShape); + int32_t d = Std::get<2>(ncdhwShape); + int32_t h = Std::get<3>(ncdhwShape); + int32_t w = Std::get<4>(ncdhwShape); + int32_t n1 = (n + n0 - 1) / n0; + int32_t c1 = (c + c0 - 1) / c0; + int32_t hw1 = (h * w + hw0 - 1) / hw0; + int32_t padHw = hw1 * hw0; TransDataTmpParams tmpParams = { - Std::get<0>(ncdhwShape), - Std::get<1>(ncdhwShape), - Std::get<2>(ncdhwShape), - Std::get<3>(ncdhwShape), - Std::get<4>(ncdhwShape) + n, c, d, h, w, n1, c1, padHw, }; - if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) { - TransDataImplMode2(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); - } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) { + if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) { TransDataImplMode1(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); + } else if constexpr (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) { + TransDataFractalToNcdhw(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); + } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) { + TransDataImplMode2(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) { TransDataImplMode3(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } diff --git a/impl/transdata/transdata_tiling.cpp b/impl/transdata/transdata_tiling.cpp index dfb216cc..a5de5de1 100644 --- a/impl/transdata/transdata_tiling.cpp +++ b/impl/transdata/transdata_tiling.cpp @@ -46,6 +46,46 @@ int32_t AlignUp(int32_t a, int32_t b) return DivCeil(a, b) * b; } +bool GenerateFractalZ3DToNcdhwShapeInfo(const std::vector& dstDims, const std::vector& srcDims, + 
TmpTransDataParams ¶m, const int32_t c0, const int32_t n0) +{ + ASCENDC_HOST_ASSERT(srcDims.size() == 7 && dstDims.size() == 5, return false, + "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat."); + param.n = dstDims[N_INDEX]; + param.c = dstDims[C_INDEX]; + param.d = dstDims[D_INDEX]; + param.h = dstDims[H_INDEX]; + param.w = dstDims[W_INDEX]; + // validate d, h, w + ASCENDC_HOST_ASSERT(param.d == srcDims[0] && param.h == srcDims[2] && param.w == srcDims[3], return false, + "[TransData][GetTransDataMaxMinTmpSize] shapeInfo d,h,w is not matched."); + ASCENDC_HOST_ASSERT(srcDims[6] == c0 && srcDims[1] * c0 == AlignUp(param.c, c0), return false, + "[TransData][GetTransDataMaxMinTmpSize] src c0, c1 is not able to be converted to c."); + ASCENDC_HOST_ASSERT(srcDims[5] == n0 && srcDims[4] * n0 == AlignUp(param.n, n0), return false, + "[TransData][GetTransDataMaxMinTmpSize] src n0, n1 is not able to be converted to n."); + return true; +} + +bool GenerateNcdhwToFractalZ3DShapeInfo(const std::vector& dstDims, const std::vector& srcDims, + TmpTransDataParams ¶m, const int32_t c0, const int32_t n0) +{ + ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 7, return false, + "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat."); + param.n = srcDims[N_INDEX]; + param.c = srcDims[C_INDEX]; + param.d = srcDims[D_INDEX]; + param.h = srcDims[H_INDEX]; + param.w = srcDims[W_INDEX]; + // validate d, h, w + ASCENDC_HOST_ASSERT(param.d == dstDims[0] && param.h == dstDims[2] && param.w == dstDims[3], return false, + "[TransData][GetTransDataMaxMinTmpSize] shapeInfo d,h,w is not matched."); + ASCENDC_HOST_ASSERT(dstDims[6] == c0 && dstDims[1] * c0 == AlignUp(param.c, c0), return false, + "[TransData][GetTransDataMaxMinTmpSize] dst c0, c1 is not able to be converted to c."); + ASCENDC_HOST_ASSERT(dstDims[5] == n0 && dstDims[4] * n0 == AlignUp(param.n, n0), return false, + 
"[TransData][GetTransDataMaxMinTmpSize] dst n0, n1 is not able to be converted to n."); + return true; +} + bool GenerateShapeInfo(const TransDataConfig &config, const ge::Shape &srcShape, const ge::Shape &dstShape, ge::DataType type, TmpTransDataParams ¶m) { @@ -54,40 +94,39 @@ bool GenerateShapeInfo(const TransDataConfig &config, const ge::Shape &srcShape, std::vector srcDims = srcShape.GetDims(); std::vector dstDims = dstShape.GetDims(); if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) { - ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 6, return false, "input shapes are not matched with DataFormat."); + ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 6, return false, + "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat."); param.n = srcDims[N_INDEX]; param.c = srcDims[C_INDEX]; param.d = srcDims[D_INDEX]; param.h = srcDims[H_INDEX]; param.w = srcDims[W_INDEX]; // validate n, d, h, w - ASCENDC_HOST_ASSERT(param.n == dstDims[0] && param.d == dstDims[1] && param.h == dstDims[3] && param.w == dstDims[4], return false, "shapeInfo n,d,h,w is not matched."); - ASCENDC_HOST_ASSERT(dstDims[5] == c0 && dstDims[2] * c0 == AlignUp(param.c, c0), return false, "dst c0, c1 is not able to be converted to c."); + ASCENDC_HOST_ASSERT(param.n == dstDims[0] && param.d == dstDims[1] && param.h == dstDims[3] && param.w == dstDims[4], + return false, "[TransData][GetTransDataMaxMinTmpSize] shapeInfo n,d,h,w is not matched."); + ASCENDC_HOST_ASSERT(dstDims[5] == c0 && dstDims[2] * c0 == AlignUp(param.c, c0), return false, + "[TransData][GetTransDataMaxMinTmpSize] dst c0, c1 is not able to be converted to c."); return true; } if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) { - ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 7, return false, "input shapes are not matched with DataFormat."); - param.n = srcDims[N_INDEX]; - param.c = 
srcDims[C_INDEX]; - param.d = srcDims[D_INDEX]; - param.h = srcDims[H_INDEX]; - param.w = srcDims[W_INDEX]; - // validate n, d, h, w - ASCENDC_HOST_ASSERT(param.d == dstDims[0] && param.h == dstDims[2] && param.w == dstDims[3], return false, "shapeInfo n,d,h,w is not matched."); - ASCENDC_HOST_ASSERT(dstDims[6] == c0 && dstDims[1] * c0 == AlignUp(param.c, c0), return false, "dst c0, c1 is not able to be converted to c."); - ASCENDC_HOST_ASSERT(dstDims[5] == n0 && dstDims[4] * n0 == AlignUp(param.n, n0), return false, "dst n0, n1 is not able to be converted to n."); - return true; + return GenerateNcdhwToFractalZ3DShapeInfo(dstDims, srcDims, param, c0, n0); + } + if (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) { + return GenerateFractalZ3DToNcdhwShapeInfo(dstDims, srcDims, param, c0, n0); } if (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) { - ASCENDC_HOST_ASSERT(srcDims.size() == 6 && dstDims.size() == 5, return false, "input shapes are not matched with DataFormat."); + ASCENDC_HOST_ASSERT(srcDims.size() == 6 && dstDims.size() == 5, return false, + "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat."); param.n = dstDims[N_INDEX]; param.c = dstDims[C_INDEX]; param.d = dstDims[D_INDEX]; param.h = dstDims[H_INDEX]; param.w = dstDims[W_INDEX]; // validate n, d, h, w - ASCENDC_HOST_ASSERT(param.n == srcDims[0] && param.d == srcDims[1] && param.h == srcDims[3] && param.w == srcDims[4], return false, "shapeInfo n,d,h,w is not matched."); - ASCENDC_HOST_ASSERT(srcDims[5] == c0 && srcDims[2] * c0 == AlignUp(param.c, c0), return false, "src c0, c1 is not able to be converted to c."); + ASCENDC_HOST_ASSERT(param.n == srcDims[0] && param.d == srcDims[1] && param.h == srcDims[3] && param.w == srcDims[4], + return false, "[TransData][GetTransDataMaxMinTmpSize] shapeInfo n,d,h,w is not matched."); + ASCENDC_HOST_ASSERT(srcDims[5] == c0 && srcDims[2] * c0 == 
AlignUp(param.c, c0), return false, + "[TransData][GetTransDataMaxMinTmpSize] src c0, c1 is not able to be converted to c."); return true; } return false; @@ -113,9 +152,16 @@ int32_t GetTmpBufferSize(const TransDataConfig &config, const TmpTransDataParams { return c * d * padHw * dataSize + n1 * n0 * d * c1 * c0 * padHw * dataSize; } + if (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) + { + constexpr int32_t doubleTmpSize = 2; + if (n == n0 * n1 && c == c0 * c1) { + return n1 * n0 * c1 * c0 * d * padHw * dataSize; + } + return n1 * n0 * c1 * c0 * d * padHw * dataSize * doubleTmpSize; + } return 0; } - } // namespace bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform, @@ -125,14 +171,16 @@ bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform const TransDataConfig &config, uint32_t &maxValue, uint32_t &minValue) { - ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16, return false, "it only supports DT_FLOAT16/DT_BF16 data type"); + ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16, return false, + "[TransData][GetTransDataMaxMinTmpSize] it only supports DT_FLOAT16/DT_BF16 data type"); platform_ascendc::SocVersion socVersion = platform.GetSocVersion(); ASCENDC_HOST_ASSERT(socVersion == platform_ascendc::SocVersion::ASCEND910B, return false, - "Unsupported SocVersion for TransData API."); + "[TransData][GetTransDataMaxMinTmpSize] Unsupported SocVersion for TransData API."); TmpTransDataParams tmpParam; - ASCENDC_HOST_ASSERT(GenerateShapeInfo(config, srcShape, dstShape, dataType, tmpParam), return false, "failed to validate inputs informations."); + ASCENDC_HOST_ASSERT(GenerateShapeInfo(config, srcShape, dstShape, dataType, tmpParam), return false, + "[TransData][GetTransDataMaxMinTmpSize] failed to validate inputs informations."); maxValue = GetTmpBufferSize(config, tmpParam); minValue 
= maxValue; return true; diff --git a/lib/tiling_api.h b/lib/tiling_api.h new file mode 100644 index 00000000..1b83428d --- /dev/null +++ b/lib/tiling_api.h @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file tiling_api.h + * \brief + */ +#ifndef LIB_TILING_API_H +#define LIB_TILING_API_H +#include "matmul/matmul_tiling.h" +#include "matmul/bmm_tiling.h" +#include "activation/softmax_tiling.h" +#include "activation/logsoftmax_tiling.h" +#include "filter/dropout_tiling.h" +#include "sort/sort_tiling_intf.h" +#include "index/arithprogression_tiling.h" +#include "quantization/ascend_dequant_tiling.h" +#include "quantization/ascend_quant_tiling.h" +#include "quantization/ascend_antiquant_tiling.h" +#include "quantization/quantize_tiling.h" +#include "quantization/antiquantize_tiling.h" +#include "quantization/dequantize_tiling.h" +#include "reduce/sum_tiling.h" +#include "activation/silu_tiling.h" +#include "activation/swish_tiling.h" +#include "activation/gelu_tiling.h" +#include "pad/pad_tiling.h" +#include "normalization/rmsnorm_tiling.h" +#include "normalization/deepnorm_tiling.h" +#include "normalization/layernorm_tiling.h" +#include "normalization/normalize_tiling.h" +#include "normalization/batchnorm_tiling.h" +#include "normalization/layernorm_grad_tiling.h" +#include "normalization/layernorm_grad_beta_tiling.h" +#include "normalization/welfordfinalize_tiling.h" +#include 
"transpose/confusion_transpose_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "sort/topk_tiling.h" +#include "math/tanh_tiling.h" +#include "activation/sigmoid_tiling.h" +#include "math/frac_tiling.h" +#include "math/acos_tiling.h" +#include "math/asin_tiling.h" +#include "math/acosh_tiling.h" +#include "math/asinh_tiling.h" +#include "math/sin_tiling.h" +#include "math/cos_tiling.h" +#include "math/atan_tiling.h" +#include "math/power_tiling.h" +#include "math/log_tiling.h" +#include "math/cosh_tiling.h" +#include "math/clamp_tiling.h" +#include "math/erf_tiling.h" +#include "math/erfc_tiling.h" +#include "math/round_tiling.h" +#include "math/sinh_tiling.h" +#include "activation/swiglu_tiling.h" +#include "math/tan_tiling.h" +#include "math/hypot_tiling.h" +#include "select/selectwithbytesmask_tiling.h" +#include "math/trunc_tiling.h" +#include "activation/geglu_tiling.h" +#include "math/lgamma_tiling.h" +#include "math/digamma_tiling.h" +#include "math/atanh_tiling.h" +#include "math/xor_tiling.h" +#include "math/sign_tiling.h" +#include "reduce/mean_tiling.h" +#include "math/exp_tiling.h" +#include "math/axpy_tiling.h" +#include "math/ceil_tiling.h" +#include "math/floor_tiling.h" +#include "activation/reglu_tiling.h" +#include "pad/broadcast_tiling.h" +#include "reduce/reduce_xor_sum_tiling.h" +#include "reduce/reduce_tiling.h" +#include "math/cumsum_tiling.h" +#include "math/fmod_tiling.h" +#include "normalization/groupnorm_tiling.h" +#include "transdata/transdata_tiling.h" +#include "hccl/hccl_tilingdata.h" +#include "hccl/hccl_tiling.h" +#endif // LIB_TILING_API_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6e91de48..133a36fd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,7 +153,95 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/sort/topk/test_operator_topk.cpp ${ASCENDC_TESTS_DIR}/normalization/welfordfinalize/test_operator_welfordfinalize.cpp 
${ASCENDC_TESTS_DIR}/utils/init_global_memory/test_operator_init_global_memory.cpp +<<<<<<< HEAD ${ASCENDC_TESTS_DIR}/normalization/layernormV2/test_operator_layernormV2.cpp +======= + ${ASCENDC_TESTS_DIR}/std/sequence/test_sequence.cpp + ${ASCENDC_TESTS_DIR}/std/tuple/*.cpp + ${ASCENDC_TESTS_DIR}/std/type_traits/*.cpp + ${ASCENDC_TESTS_DIR}/transdata/*cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/geglu/kernel_geglu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_gelu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_faster_gelu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_faster_geluv2_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/reglu/kernel_reglu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/sigmoid/kernel_sigmoid_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/silu/kernel_silu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/swiglu/kernel_swiglu_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/swish/kernel_swish_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/adjust_softmax_res/kernel_adjust_softmax_res_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/log_softmax/kernel_log_softmax_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/simple_softmax/kernel_simple_softmax_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax/kernel_softmax_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flash/kernel_softmax_flash_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flashv2/kernel_softmax_flashv2_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flashv3/kernel_softmax_flashv3_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_grad/kernel_softmax_grad_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_grad_front/kernel_softmax_grad_front_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/filter/droupout/kernel_droupout_check.cpp + 
${ASCENDC_TESTS_DIR}/api_check/index/arithprogression/kernel_arithprogression_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/batchnorm/kernel_batchnorm_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/deepnorm/kernel_deepnorm_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/groupnorm/kernel_groupnorm_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/layernorm/kernel_layernorm_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/layernormgrad/kernel_layernormgrad_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/layernormgradbeta/kernel_layernormgradbeta_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/normalize/kernel_normalize_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/rmsnorm/kernel_rmsnorm_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/welfordfinalize/kernel_welfordfinalize_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/normalization/welfordupdate/kernel_welfordupdate_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/pad/broadcast/kernel_broadcast_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/pad/pad/kernel_pad_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/pad/unpad/kernel_unpad_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/quantization/antiquant/kernel_antiquant_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/quantization/dequant/kernel_dequant_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/quantization/quant/kernel_quant_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/select/selectwithbytesmask/kernel_selectwithbytesmask_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/sort/topk/kernel_topk_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/transpose/confusion_transpose/kernel_confusion_transpose_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/utils/init_global_memory/kernel_init_global_memory_check.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/acos/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/acosh/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/asin/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/asinh/*.cpp + 
${ASCENDC_TESTS_DIR}/api_check/math/atan/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/atanh/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/axpy/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/ceil/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/clamp/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/cos/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/cosh/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/cumsum/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/digamma/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/erf/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/erfc/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/exp/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/floor/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/fmod/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/frac/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/lgamma/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/log/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/power/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/round/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/sign/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/sin/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/sinh/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/tan/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/tanh/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/trunc/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/math/xor/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/mean/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/sum/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_xor_sum/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_all/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_any/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_max/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_min/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_sum/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_prod/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_mean/*.cpp + ${ASCENDC_TESTS_DIR}/api_check/transdata/*.cpp +>>>>>>> 4d201cad (add transdata high api fractal_z_3d to ncdhw) ) # ascend910B1 aic test cases diff --git 
a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index cc0a7cec..09630754 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -13,7 +13,6 @@ #define private public #define protected public #include "lib/activation/softmax_tiling.h" -#include "lib/transdata/transdata_tiling.h" // temp for upload code #include "tiling_api.h" #include "platform_stub.h" #include "impl/matmul/tiling/math_util.h" @@ -6710,9 +6709,20 @@ TEST_F(TestTiling, testTransDataTilingUnalignedHw) platfrom_stub_set_chip_version("Ascend910B"); uint32_t maxSize; uint32_t minSize; - auto ncdhwShape = ge::Shape({ 16, 16, 3, 3, 3 }); - auto ndc1hwc0Shape = ge::Shape({ 16, 3, 1, 3, 3, 16}); - auto fractalzShape = ge::Shape({ 3, 1, 3, 3, 1, 16, 16}); + int32_t n = 16; + int32_t c = 16; + int32_t d = 3; + int32_t h = 3; + int32_t w = 3; + int32_t c0 = 16; + int32_t n0 = 16; + int32_t c1 = (c + c0 - 1) / c0; + int32_t n1 = (n + n0 - 1) / n0; + int32_t hw0 = 16; + int32_t hw1 = (h * w + hw0 - 1) / hw0; + auto ncdhwShape = ge::Shape({ n, c, d, h, w }); + auto ndc1hwc0Shape = ge::Shape({ n, d, c1, h, w, c0}); + auto fractalzShape = ge::Shape({ d, c1, h, w, n1, n0, c0}); fe::PlatFormInfos platform_info; auto plat = platform_ascendc::PlatformAscendC(&platform_info); TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0}; @@ -6735,6 +6745,13 @@ TEST_F(TestTiling, testTransDataTilingUnalignedHw) EXPECT_TRUE(ret); EXPECT_EQ(maxSize, 26112); EXPECT_EQ(minSize, 26112); + + config = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW}; + ret = GetTransDataMaxMinTmpSize(plat, fractalzShape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2); + EXPECT_EQ(minSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2); } TEST_F(TestTiling, testTransDataTilingAlignedHw) @@ -6742,9 +6759,20 @@ TEST_F(TestTiling, testTransDataTilingAlignedHw) platfrom_stub_set_chip_version("Ascend910B"); uint32_t 
maxSize; uint32_t minSize; - auto ncdhwShape = ge::Shape({ 5, 30, 2, 4, 8 }); - auto ndc1hwc0Shape = ge::Shape({ 5, 2, 2, 4, 8, 16}); - auto fractalzShape = ge::Shape({ 2, 2, 4, 8, 1, 16, 16}); + int32_t n = 5; + int32_t c = 30; + int32_t d = 2; + int32_t h = 4; + int32_t w = 8; + int32_t c0 = 16; + int32_t n0 = 16; + int32_t c1 = (c + c0 - 1) / c0; + int32_t n1 = (n + n0 - 1) / n0; + int32_t hw0 = 16; + int32_t hw1 = (h * w + hw0 - 1) / hw0; + auto ncdhwShape = ge::Shape({ n, c, d, h, w }); + auto ndc1hwc0Shape = ge::Shape({ n, d, c1, h, w, c0}); + auto fractalzShape = ge::Shape({ d, c1, h, w, n1, n0, c0}); fe::PlatFormInfos platform_info; auto plat = platform_ascendc::PlatformAscendC(&platform_info); TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0}; @@ -6767,6 +6795,13 @@ TEST_F(TestTiling, testTransDataTilingAlignedHw) EXPECT_TRUE(ret); EXPECT_EQ(maxSize, 69376); EXPECT_EQ(minSize, 69376); + + config = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW}; + ret = GetTransDataMaxMinTmpSize(plat, fractalzShape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize); + + EXPECT_TRUE(ret); + EXPECT_EQ(maxSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2 * 2); + EXPECT_EQ(minSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2 * 2); } TEST_F(TestTiling, TestReduceXorSumTilingInt16) diff --git a/tests/transdata/test_operator_transdata.cpp b/tests/transdata/test_operator_transdata.cpp new file mode 100644 index 00000000..d50408a9 --- /dev/null +++ b/tests/transdata/test_operator_transdata.cpp @@ -0,0 +1,267 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "kernel_operator.h" + +#include +#include + +namespace AscendC { + +namespace { + +constexpr uint32_t NCDHW_FractalZ3D = 1; +constexpr uint32_t FractalZ3D_NCDHW = 2; +constexpr uint32_t NCDHW_NDC1HWC0 = 3; +constexpr uint32_t NDC1HWC0_NCDHW = 4; + + +constexpr TransDataConfig config1 = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D}; +constexpr TransDataConfig config2 = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW}; +constexpr TransDataConfig config3 = {DataFormat::NCDHW, DataFormat::NDC1HWC0}; +constexpr TransDataConfig config4 = {DataFormat::NDC1HWC0, DataFormat::NCDHW}; + +} + +template +class KernelTransData { +public: +__aicore__ inline KernelTransData() {} +__aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, + int32_t n, int32_t c, int32_t d, int32_t h, int32_t w, TPipe *tpipe) +{ + this->d = d; + this->c = c; + this->h = h; + this->w = w; + this->n = n; + this->c1 = (c + c0 - 1) / c0; + this->n1 = (n + n0 - 1) / n0; + this->hw1 = (h*w + hw0 - 1) / hw0; + + if (mode == NDC1HWC0_NCDHW) { + this->srcShapeSize = n * c1 * c0 * d * h * w; + this->dstShapeSize = n * d * c * hw0; + this->tmpShapeSize = 512 + d * c1 * c0 * hw0 * hw1; + uint32_t dstGmSize = n * c * d * h * w; + srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), srcShapeSize * sizeof(T)); + dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dstGmSize * sizeof(T)); + } else { + if constexpr (mode == NCDHW_FractalZ3D) { + srcShapeSize = n * d * c * hw0 * hw1; + dstShapeSize = n1 * n0 * c1 * c0 * d * h * w; + if ((h*w) % 16 != 0 ) { + needPad = true; + dstShapeSize = n1 * n0 * c1 * c0 * d * hw0 * hw1; + } + tmpShapeSize = c * d * hw0 * hw1 + n1 * n0 * d * 
c1 * c0 * hw0 * hw1; + } else if constexpr (mode == FractalZ3D_NCDHW) { + this->srcShapeSize = d * c1 * h * w * n1 * n0 * c0; + this->dstShapeSize = n * c * d * (hw1 * hw0); + this->tmpShapeSize = d * c1 * (hw1 * hw0) * n1 * n0 * c0 * 2; + } else if constexpr (mode == NCDHW_NDC1HWC0) { + this->srcShapeSize = n * d * c * hw0; + this->dstShapeSize = n * c1 * c0 * d * h * w; + this->tmpShapeSize = d * hw0 * hw1 + d * c1 * c0 * hw0 * hw1; + } + srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), srcShapeSize * sizeof(T)); + dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dstShapeSize * sizeof(T)); + } + + + this->pipe = tpipe; + pipe->InitBuffer(inQueue, 1, srcShapeSize * sizeof(T)); + pipe->InitBuffer(outQueue, 1, dstShapeSize * sizeof(T)); + pipe->InitBuffer(tmpBuf, tmpShapeSize * sizeof(T)); + +} +__aicore__ inline void Process() +{ + CopyIn(); + Compute(); + CopyOut(); +} + +private: +__aicore__ inline void CopyIn() +{ + LocalTensor srcLocal = inQueue.AllocTensor(); + if constexpr (mode == NCDHW_FractalZ3D || mode == NCDHW_NDC1HWC0) { + DataCopyExtParams extParam = {static_cast(n * c * d), + static_cast(h * w * sizeof(T)), 0, 0, 0}; + DataCopyPadExtParams padParam = {true, 0, 0, 0}; + if (needPad) { + DataCopyPad(srcLocal, srcGlobal, extParam, padParam); + } else { + DataCopy(srcLocal, srcGlobal, srcShapeSize); + } + } else if constexpr (mode == FractalZ3D_NCDHW || mode == NDC1HWC0_NCDHW) { + DataCopy(srcLocal, srcGlobal, srcShapeSize); + } + + inQueue.EnQue(srcLocal); +} +__aicore__ inline void Compute() +{ + LocalTensor dstLocal = outQueue.AllocTensor(); + LocalTensor tmp = tmpBuf.Get(); + LocalTensor srcLocal = inQueue.DeQue(); + PipeBarrier(); + + Layout ncdhwLayout = MakeLayout(MakeShape(n, c, d, h, w), MakeStride()); + Layout ndc1hwc0Layout = MakeLayout(MakeShape(n, d, c1, h, w, c0), MakeStride()); + Layout fractalLayout = MakeLayout(MakeShape(d, c1, h, w, n1, n0, c0), MakeStride()); + + if constexpr (mode == NCDHW_FractalZ3D) { + 
TransDataParams params = {ncdhwLayout, fractalLayout}; + TransData(dstLocal, srcLocal, tmp, params); + } else if constexpr (mode == FractalZ3D_NCDHW) { + TransDataParams params = {fractalLayout, ncdhwLayout}; + TransData(dstLocal, srcLocal, tmp, params); + } else if constexpr (mode == NCDHW_NDC1HWC0) { + TransDataParams params = {ncdhwLayout, ndc1hwc0Layout}; + TransData(dstLocal, srcLocal, tmp, params); + } else if constexpr (mode == NDC1HWC0_NCDHW) { + TransDataParams params = {ndc1hwc0Layout, ncdhwLayout}; + TransData(dstLocal, srcLocal, tmp, params); + } + + outQueue.EnQue(dstLocal); + inQueue.FreeTensor(srcLocal); + +} +__aicore__ inline void CopyOut() +{ + LocalTensor dstLocal = outQueue.DeQue(); + DataCopyExtParams extParam {static_cast(n * c * d), static_cast(h*w*sizeof(T)), 0, 0, 0}; + if constexpr (mode == NCDHW_FractalZ3D) { + DataCopy(dstGlobal, dstLocal, n1 * n0 * c1); + } else if constexpr (mode == FractalZ3D_NCDHW) { + DataCopy(dstGlobal, dstLocal, dstShapeSize); + } else if constexpr (mode == NCDHW_NDC1HWC0) { + DataCopy(dstGlobal, dstLocal, dstShapeSize); + } else if constexpr (mode == NDC1HWC0_NCDHW) { + DataCopyPad(dstGlobal, dstLocal, extParam); + } + outQueue.FreeTensor(dstLocal); +} + +private: + GlobalTensor srcGlobal; + GlobalTensor dstGlobal; + TPipe *pipe; + TQue inQueue; + TQue outQueue; + TBuf tmpBuf; + bool needPad = false; + int32_t n = 0; + int32_t c = 0; + int32_t d = 0; + int32_t h = 0; + int32_t w = 0; + int32_t n1 = 0; + int32_t c1 = 0; + int32_t hw1 = 0; + int32_t c0 = 16; + int32_t n0 = 16; + int32_t hw0 = 16; + uint32_t srcShapeSize = 0; + uint32_t dstShapeSize = 0; + uint32_t tmpShapeSize = 0; +}; +} // namespace AscendC + +template +__global__ __aicore__ void MainTransdata( + __gm__ uint8_t* dstGm, __gm__ uint8_t* srcGm, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) +{ + if (g_coreType == AscendC::AIC || AscendC::GetBlockIdx() > 0) { + return; + } + AscendC::TPipe pipe; + AscendC::KernelTransData op; + 
op.Init(srcGm, dstGm, n, c, d, h, w, &pipe); + op.Process(); +} + +struct TransDataTestParams { + int32_t n; + int32_t c; + int32_t d; + int32_t h; + int32_t w; + uint32_t mode; + void (*cal_func)(uint8_t*, uint8_t*, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); +}; + +class TransDataTestsuite : public testing::Test, public testing::WithParamInterface { +protected: + void SetUp() + { + AscendC::SetGCoreType(2); + } + + void TearDown() + { + AscendC::SetGCoreType(0); + } +}; + +INSTANTIATE_TEST_CASE_P(TEST_OPERATTION_TRANSDATA, TransDataTestsuite, + ::testing::Values( + TransDataTestParams { 5, 32, 2, 1, 16, 1, MainTransdata }, + TransDataTestParams { 4, 31, 1, 6, 7, 2, MainTransdata }, + TransDataTestParams { 4, 20, 2, 3, 1, 3, MainTransdata }, + TransDataTestParams { 8, 14, 2, 1, 16, 4, MainTransdata }, + TransDataTestParams { 5, 32, 2, 1, 16, 1, MainTransdata }, + TransDataTestParams { 4, 31, 1, 6, 7, 2, MainTransdata }, + TransDataTestParams { 4, 20, 2, 3, 1, 3, MainTransdata }, + TransDataTestParams { 8, 14, 2, 1, 16, 4, MainTransdata } + + )); + +TEST_P(TransDataTestsuite, TransDataOpTestCase) +{ + auto params = GetParam(); + auto n = params.n; + auto c = params.c; + auto d = params.d; + auto h = params.h; + auto w = params.w; + auto mode = params.mode; + uint32_t srcShapeSize; + uint32_t dstShapeSize; + int32_t hw0 = 16; + int32_t hw1 = (h * w + hw0 - 1) / hw0; + int32_t c0 = 16; + int32_t n0 = 16; + int32_t c1 = (c + c0 - 1) / c0; + int32_t n1 = (n + n0 - 1) / n0; + if (mode == 1) { + srcShapeSize = n * d * c * hw0 * hw1; + dstShapeSize = n1 * n0 * c1 * c0 * d * h * w; + if ((h*w) % 16 != 0 ) { + dstShapeSize = n1 * n0 * c1 * c0 * d * hw0 * hw1; + } + } else if (mode == 2) { + srcShapeSize = d * c1 * h * w * n1 * n0 * c0; + dstShapeSize = n * c * d * (hw1 * hw0); + } else if (mode == 3) { + srcShapeSize = n * d * c * hw0; + dstShapeSize = n * c1 * c0 * d * h * w; + } else if (mode == 4) { + srcShapeSize = n * c1 * c0 * d * h * w; + dstShapeSize = n * d 
* c * hw0; + } + uint8_t srcGm[srcShapeSize * sizeof(half)] = {0}; // 外部保证inner是32B对齐 + uint8_t dstGm[dstShapeSize * sizeof(half)] = {0}; + params.cal_func(dstGm, srcGm, n, c, d, h, w); + EXPECT_EQ(dstGm[0], 0); +} diff --git a/tiling/tiling_api.h b/tiling/tiling_api.h new file mode 100644 index 00000000..015f68e4 --- /dev/null +++ b/tiling/tiling_api.h @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file tiling_api.h + * \brief + */ +#ifndef TILING_API_H +#define TILING_API_H +#include "../lib/matmul/matmul_tiling.h" +#include "../lib/matmul/bmm_tiling.h" +#include "../lib/activation/softmax_tiling.h" +#include "../lib/activation/logsoftmax_tiling.h" +#include "../lib/filter/dropout_tiling.h" +#include "../lib/sort/sort_tiling_intf.h" +#include "../lib/index/arithprogression_tiling.h" +#include "../lib/quantization/ascend_dequant_tiling.h" +#include "../lib/quantization/ascend_quant_tiling.h" +#include "../lib/quantization/ascend_antiquant_tiling.h" +#include "../lib/quantization/quantize_tiling.h" +#include "../lib/quantization/antiquantize_tiling.h" +#include "../lib/quantization/dequantize_tiling.h" +#include "../lib/reduce/sum_tiling.h" +#include "../lib/activation/silu_tiling.h" +#include "../lib/activation/swish_tiling.h" +#include "../lib/activation/gelu_tiling.h" +#include "../lib/pad/pad_tiling.h" +#include "../lib/normalization/rmsnorm_tiling.h" +#include 
"../lib/normalization/deepnorm_tiling.h" +#include "../lib/normalization/layernorm_tiling.h" +#include "../lib/normalization/normalize_tiling.h" +#include "../lib/normalization/groupnorm_tiling.h" +#include "../lib/normalization/batchnorm_tiling.h" +#include "../lib/normalization/layernorm_grad_tiling.h" +#include "../lib/normalization/layernorm_grad_beta_tiling.h" +#include "../lib/normalization/welfordfinalize_tiling.h" +#include "../lib/transpose/confusion_transpose_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "../lib/sort/topk_tiling.h" +#include "../lib/math/tanh_tiling.h" +#include "../lib/activation/sigmoid_tiling.h" +#include "../lib/math/frac_tiling.h" +#include "../lib/math/acos_tiling.h" +#include "../lib/math/asin_tiling.h" +#include "../lib/math/acosh_tiling.h" +#include "../lib/math/asinh_tiling.h" +#include "../lib/math/sin_tiling.h" +#include "../lib/math/cos_tiling.h" +#include "../lib/math/hypot_tiling.h" +#include "../lib/math/atan_tiling.h" +#include "../lib/math/power_tiling.h" +#include "../lib/math/log_tiling.h" +#include "../lib/math/cosh_tiling.h" +#include "../lib/math/clamp_tiling.h" +#include "../lib/math/erf_tiling.h" +#include "../lib/math/erfc_tiling.h" +#include "../lib/math/round_tiling.h" +#include "../lib/math/sinh_tiling.h" +#include "../lib/activation/swiglu_tiling.h" +#include "../lib/math/tan_tiling.h" +#include "../lib/select/selectwithbytesmask_tiling.h" +#include "../lib/math/trunc_tiling.h" +#include "../lib/math/fmod_tiling.h" +#include "../lib/activation/geglu_tiling.h" +#include "../lib/math/lgamma_tiling.h" +#include "../lib/math/digamma_tiling.h" +#include "../lib/math/atanh_tiling.h" +#include "../lib/math/xor_tiling.h" +#include "../lib/math/sign_tiling.h" +#include "../lib/reduce/mean_tiling.h" +#include "../lib/math/exp_tiling.h" +#include "../lib/math/axpy_tiling.h" +#include "../lib/math/ceil_tiling.h" +#include "../lib/math/floor_tiling.h" +#include "../lib/activation/reglu_tiling.h" 
+#include "../lib/pad/broadcast_tiling.h" +#include "../lib/reduce/reduce_xor_sum_tiling.h" +#include "../lib/reduce/reduce_tiling.h" +#include "../lib/transdata/transdata_tiling.h" +#include "../lib/math/cumsum_tiling.h" +#include "../lib/hccl/hccl_tilingdata.h" +#include "../lib/hccl/hccl_tiling.h" +#endif // TILING_API_H -- Gitee From 51e18c6dce8aa6b86f085dfd093cbf1bb630897a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=9F=E4=BF=8A=E6=88=90?= Date: Wed, 4 Jun 2025 09:14:43 +0800 Subject: [PATCH 4/5] add transdata shape check --- impl/reduce/reduce_tiling.cpp | 10 ++--- impl/transdata/transdata_impl.h | 45 ++++++++++++++----- impl/transdata/transdata_tiling.cpp | 13 +++++- lib/transdata/transdata.h | 1 + .../transdata/transdata_common.h | 6 +-- 5 files changed, 54 insertions(+), 21 deletions(-) rename impl/transdata/transdata_common_impl.h => lib/transdata/transdata_common.h (86%) diff --git a/impl/reduce/reduce_tiling.cpp b/impl/reduce/reduce_tiling.cpp index d6efe31f..71722928 100644 --- a/impl/reduce/reduce_tiling.cpp +++ b/impl/reduce/reduce_tiling.cpp @@ -102,7 +102,6 @@ void GetReduceCommonMaxMinTmpSize(const ge::Shape &srcShape, } inline void GetReduceSumMeanCommonTmpSize(const ge::Shape &srcShape, - const ge::DataType dataType, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName) { @@ -137,7 +136,6 @@ inline void GetReduceSumMeanCommonTmpSize(const ge::Shape &srcShape, } inline void GetReduceAnyAllCommonTmpSize(const ge::Shape &srcShape, - const ge::DataType dataType, ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource, uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName) { @@ -229,7 +227,7 @@ void GetReduceAnyMaxMinTmpSize(const ge::Shape &srcShape, return, "[ReduceAny][GetReduceAnyMaxMinTmpSize] it only supports float and uint8_t type on this platform."); if (dataType == ge::DT_UINT8) { - 
GetReduceAnyAllCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + GetReduceAnyAllCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, "ReduceAny", "GetReduceAnyMaxMinTmpSize"); } else { GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, @@ -245,7 +243,7 @@ void GetReduceAllMaxMinTmpSize(const ge::Shape &srcShape, ASCENDC_HOST_ASSERT((dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8), return, "[ReduceAll][GetReduceAllMaxMinTmpSize] it only supports float and uint8 type on this platform."); if (dataType == ge::DT_UINT8) { - GetReduceAnyAllCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + GetReduceAnyAllCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, "ReduceAll", "GetReduceAllMaxMinTmpSize"); } else { GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, @@ -260,7 +258,7 @@ void GetReduceSumMaxMinTmpSize(const ge::Shape &srcShape, { ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "[ReduceSum][GetReduceSumMaxMinTmpSize] it only supports float type on this platform."); - GetReduceSumMeanCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + GetReduceSumMeanCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, "ReduceSum", "GetReduceSumMaxMinTmpSize"); } @@ -271,7 +269,7 @@ void GetReduceMeanMaxMinTmpSize(const ge::Shape &srcShape, { ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "[ReduceMean][GetReduceMeanMaxMinTmpSize] it only supports float type on this platform."); - GetReduceSumMeanCommonTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, + GetReduceSumMeanCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, "ReduceMean", "GetReduceMeanMaxMinTmpSize"); } } 
// namespace AscendC diff --git a/impl/transdata/transdata_impl.h b/impl/transdata/transdata_impl.h index 33571fab..ad544da1 100644 --- a/impl/transdata/transdata_impl.h +++ b/impl/transdata/transdata_impl.h @@ -13,7 +13,6 @@ #include "kernel_tensor.h" #include "kernel_operator_intf.h" #include "kernel_tiling/kernel_tiling.h" -#include "transdata_common_impl.h" #include "../common/check.h" #include "../api_check/kernel_api_check.h" @@ -24,6 +23,9 @@ namespace { constexpr int32_t n0 = 16; constexpr int32_t c0 = 16; constexpr int32_t hw0 = 16; +constexpr int32_t ncdhwDims = 5; +constexpr int32_t fractalZ3DDims = 7; +constexpr int32_t ndc1hwc0Dims = 6; } struct TransDataTmpParams { @@ -458,20 +460,38 @@ __aicore__ inline void TransDataImplMode3(const LocalTensor& dst, const Local } } -template -__aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const LocalTensor& srcTensor, - const LocalTensor& sharedTmpBuffer, const TransDataParams& params) +template +__aicore__ inline void TransDataCheck(const TransDataParams& params) { - static_assert(SupportType(), "Currents only supports half/bfloat16_t types."); + static_assert(SupportType(), + "Currents only supports half/bfloat16_t/uint16_t/int16_t types."); static_assert(is_layout_v, "srcLayout must be a layout"); static_assert(is_layout_v, "dstLayout must be a layout"); using SrcShapeTuple = Std::remove_cvref_t; using DstShapeTuple = Std::remove_cvref_t; static_assert(Std::is_tuple_v, "it must be a shape."); static_assert(Std::is_tuple_v, "it must be a shape."); +} +template +__aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const LocalTensor& srcTensor, + const LocalTensor& sharedTmpBuffer, const TransDataParams& params) +{ + TransDataCheck(params); + auto srcShape = params.srcLayout.GetShape(); + auto dstShape = params.dstLayout.GetShape(); + constexpr uint32_t srcShapeSize = static_cast(Std::tuple_size::value); + constexpr uint32_t dstShapeSize = static_cast(Std::tuple_size::value); 
CHECK_FUNC_HIGHLEVEL_API(TransData, (config, T, U, S), (dstTensor, srcTensor, sharedTmpBuffer, params)); - auto ncdhwShape = config.srcFormat == DataFormat::NCDHW ? params.srcLayout.GetShape() : params.dstLayout.GetShape(); + using srcType = decltype(srcShape); + using dstType = decltype(dstShape); + using ncdhwType = Std::conditional_t; + ncdhwType ncdhwShape; + if constexpr (config.srcFormat == DataFormat::NCDHW) { + ncdhwShape = params.srcLayout.GetShape(); + } else { + ncdhwShape = params.dstLayout.GetShape(); + } int32_t n = Std::get<0>(ncdhwShape); int32_t c = Std::get<1>(ncdhwShape); int32_t d = Std::get<2>(ncdhwShape); @@ -481,17 +501,22 @@ __aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const Loca int32_t c1 = (c + c0 - 1) / c0; int32_t hw1 = (h * w + hw0 - 1) / hw0; int32_t padHw = hw1 * hw0; - TransDataTmpParams tmpParams = { - n, c, d, h, w, n1, c1, padHw, - }; - + TransDataTmpParams tmpParams = { n, c, d, h, w, n1, c1, padHw }; if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) { + static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!"); + static_assert(dstShapeSize == fractalZ3DDims, "dstLayout's shape dims must be equal to 7!"); TransDataImplMode1(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) { + static_assert(srcShapeSize == fractalZ3DDims, "srcLayout's shape dims must be equal to 7!"); + static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!"); TransDataFractalToNcdhw(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) { + static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!"); + static_assert(dstShapeSize == ndc1hwc0Dims, "dstLayout's shape dims must be equal to 6!"); 
TransDataImplMode2(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) { + static_assert(srcShapeSize == ndc1hwc0Dims, "srcLayout's shape dims must be equal to 6!"); + static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!"); TransDataImplMode3(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } } diff --git a/impl/transdata/transdata_tiling.cpp b/impl/transdata/transdata_tiling.cpp index a5de5de1..45d1bc71 100644 --- a/impl/transdata/transdata_tiling.cpp +++ b/impl/transdata/transdata_tiling.cpp @@ -171,8 +171,17 @@ bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform const TransDataConfig &config, uint32_t &maxValue, uint32_t &minValue) { - ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16, return false, - "[TransData][GetTransDataMaxMinTmpSize] it only supports DT_FLOAT16/DT_BF16 data type"); + ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16 || + dataType == ge::DataType::DT_UINT16 || dataType == ge::DataType::DT_INT16, return false, + "[TransData][GetTransDataMaxMinTmpSize] it only supports DT_FLOAT16/DT_BF16/DT_UINT16/DT_INT16 data type"); + + ASCENDC_HOST_ASSERT(((config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) || + (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) || + (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) || + (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW)), return false, + "[TransData][GetTransDataMaxMinTmpSize] The parameter config srcFormat/dstFormat only supports " + "(NCDHW, FRACTAL_Z_3D)/(FRACTAL_Z_3D, NCDHW)/(NCDHW, NDC1HWC0)/(NDC1HWC0, NCDHW)!"); + platform_ascendc::SocVersion socVersion = platform.GetSocVersion(); ASCENDC_HOST_ASSERT(socVersion 
== platform_ascendc::SocVersion::ASCEND910B, return false, "[TransData][GetTransDataMaxMinTmpSize] Unsupported SocVersion for TransData API."); diff --git a/lib/transdata/transdata.h b/lib/transdata/transdata.h index 795c9a03..755a0086 100644 --- a/lib/transdata/transdata.h +++ b/lib/transdata/transdata.h @@ -10,6 +10,7 @@ #ifndef LIB_TRANSDATA_TRANSDATA_H #define LIB_TRANSDATA_TRANSDATA_H #if __CCE_AICORE__ == 220 +#include "transdata_common.h" #include "kernel_tensor.h" #include "kernel_operator_intf.h" #include "kernel_pop_stack_buffer.h" diff --git a/impl/transdata/transdata_common_impl.h b/lib/transdata/transdata_common.h similarity index 86% rename from impl/transdata/transdata_common_impl.h rename to lib/transdata/transdata_common.h index b1289dc7..644d78ab 100644 --- a/impl/transdata/transdata_common_impl.h +++ b/lib/transdata/transdata_common.h @@ -7,8 +7,8 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H -#define IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H +#ifndef LIB_TRANSDATA_TRANSDATA_COMMON_H +#define LIB_TRANSDATA_TRANSDATA_COMMON_H namespace AscendC { template @@ -26,4 +26,4 @@ struct TransDataConfig { #endif // ASCC_PARAM_TRANSDATACONFIG } // namespace AscendC -#endif // IMPL_TRANSDATA_TRANSDATA_COMMON_IMPL_H \ No newline at end of file +#endif // LIB_TRANSDATA_TRANSDATA_COMMON_H \ No newline at end of file -- Gitee From ba10d800693e533d28c74a9153aba4361aedf4a8 Mon Sep 17 00:00:00 2001 From: chen-yiyuan Date: Mon, 9 Jun 2025 14:31:25 +0800 Subject: [PATCH 5/5] update --- docs/README.md | 6 +- impl/CMakeLists.txt | 32 +- .../transdata/transdata_impl.h | 26 +- .../transdata/transdata_tiling.cpp | 2 +- lib/kernel_api.h | 159 -- lib/tiling_api.h | 90 -- lib/{transdata => transpose}/transdata.h | 8 +- .../transdata_common.h | 6 +- .../transdata_tiling.h | 6 +- tests/CMakeLists.txt | 98 +- tests/tiling/test_tiling.cpp | 1304 ----------------- .../transdata/test_operator_transdata.cpp | 0 tiling/tiling_api.h | 90 -- 13 files changed, 32 insertions(+), 1795 deletions(-) rename impl/{ => transpose}/transdata/transdata_impl.h (95%) rename impl/{ => transpose}/transdata/transdata_tiling.cpp (99%) delete mode 100644 lib/kernel_api.h delete mode 100644 lib/tiling_api.h rename lib/{transdata => transpose}/transdata.h (91%) rename lib/{transdata => transpose}/transdata_common.h (88%) rename lib/{transdata => transpose}/transdata_tiling.h (94%) rename tests/{ => transpose}/transdata/test_operator_transdata.cpp (100%) delete mode 100644 tiling/tiling_api.h diff --git a/docs/README.md b/docs/README.md index b688a7f7..b90de102 100644 --- a/docs/README.md +++ b/docs/README.md @@ -334,10 +334,14 @@ 给定两个源操作数src0和src1,根据maskTensor相应位置的值(非bit位)选取元素,得到目的操作数dst。 - 变形 + 变形 ConfusionTranspose 对输入数据进行数据排布及Reshape操作。 + + TransData + 对输入数据排布格式转换为输出所需的数据排布格式 + 索引操作 ArithProgression diff --git a/impl/CMakeLists.txt b/impl/CMakeLists.txt index 
8de6e974..1ab2cc72 100644 --- a/impl/CMakeLists.txt +++ b/impl/CMakeLists.txt @@ -92,37 +92,7 @@ add_library(tiling_api STATIC ${CMAKE_CURRENT_SOURCE_DIR}/math/axpy/axpy_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/ceil/ceil_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/floor/floor_tiling_impl.cpp -<<<<<<< HEAD -======= - ${CMAKE_CURRENT_SOURCE_DIR}/activation/softmax/softmax_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/activation/softmax/logsoftmax_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/rmsnorm/rmsnorm_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/batchnorm/batchnorm_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sort/sort/sort_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sort/topk/topk_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/deepnorm/deepnorm_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/select/selectwithbytesmask/selectwithbytesmask_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernorm/layernorm_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/normalize/normalize_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernormgrad/layernorm_grad_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/layernormgrad/layernorm_grad_beta_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/groupnorm/groupnorm_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normalization/welfordfinalize/welfordfinalize_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad/pad_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/transpose/confusion_transpose/confusion_transpose_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/pad/broadcast/broadcast_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/pad/broadcast/broadcast_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/math/xor/xor_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/math/cumsum/cumsum_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/reduce/mean/mean_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/math/sign/sign_tiling.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/activation/reglu/reglu_tiling_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/reduce/sum/sum_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/transdata/transdata_tiling.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/index/arithprogression/arithprogression_tiling_impl.cpp ->>>>>>> 4a0a42bb (update) + ${CMAKE_CURRENT_SOURCE_DIR}/transpose/transdata/transdata_tiling.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/fmod/fmod_tiling_impl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/math/trunc/trunc_tiling_impl.cpp $<$:$> diff --git a/impl/transdata/transdata_impl.h b/impl/transpose/transdata/transdata_impl.h similarity index 95% rename from impl/transdata/transdata_impl.h rename to impl/transpose/transdata/transdata_impl.h index ad544da1..d8e5d41e 100644 --- a/impl/transdata/transdata_impl.h +++ b/impl/transpose/transdata/transdata_impl.h @@ -7,14 +7,14 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef IMPL_TRANSDATA_TRANSDATA_IMPL_H -#define IMPL_TRANSDATA_TRANSDATA_IMPL_H +#ifndef IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H +#define IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H #include "kernel_tensor.h" #include "kernel_operator_intf.h" #include "kernel_tiling/kernel_tiling.h" -#include "../common/check.h" -#include "../api_check/kernel_api_check.h" +#include "../../common/check.h" +#include "../../api_check/kernel_api_check.h" namespace AscendC { namespace Internal { @@ -220,7 +220,7 @@ __aicore__ inline void TransDataFractalToNcdhw(const LocalTensor& dst, const // Transdata NCDHW -> FRACTAL_Z_3D template -__aicore__ inline void TransDataImplMode1(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, +__aicore__ inline void TransDataImplNcdhwToFractal(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, const TransDataTmpParams& param) { constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T); @@ -320,7 +320,7 @@ __aicore__ inline void TransDataImplMode1(const LocalTensor& dst, const Local // Transdata NCDHW -> NDC1HWC0 template -__aicore__ inline void TransDataImplMode2(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, +__aicore__ inline void TransDataImplNcdhwTo6Hd(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, const TransDataTmpParams& param) { constexpr int32_t c0 = 16; @@ -393,7 +393,7 @@ __aicore__ inline void TransDataImplMode2(const LocalTensor& dst, const Local // Transdata NDC1HWC0 -> NCDHW template -__aicore__ inline void TransDataImplMode3(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, +__aicore__ inline void TransDataImpl6HdToNcdhw(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& tmpBuffer, const TransDataTmpParams& param) { const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w; @@ -469,8 +469,8 @@ __aicore__ inline void TransDataCheck(const TransDataParams& 
params) static_assert(is_layout_v, "dstLayout must be a layout"); using SrcShapeTuple = Std::remove_cvref_t; using DstShapeTuple = Std::remove_cvref_t; - static_assert(Std::is_tuple_v, "it must be a shape."); - static_assert(Std::is_tuple_v, "it must be a shape."); + static_assert(Std::is_tuple_v, "srcLayout.GetShape() must be a shape."); + static_assert(Std::is_tuple_v, "dstLayout.GetShape() must be a shape."); } template @@ -505,7 +505,7 @@ __aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const Loca if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) { static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!"); static_assert(dstShapeSize == fractalZ3DDims, "dstLayout's shape dims must be equal to 7!"); - TransDataImplMode1(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); + TransDataImplNcdhwToFractal(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) { static_assert(srcShapeSize == fractalZ3DDims, "srcLayout's shape dims must be equal to 7!"); static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!"); @@ -513,14 +513,14 @@ __aicore__ inline void TransDataImpl(const LocalTensor& dstTensor, const Loca } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) { static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!"); static_assert(dstShapeSize == ndc1hwc0Dims, "dstLayout's shape dims must be equal to 6!"); - TransDataImplMode2(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); + TransDataImplNcdhwTo6Hd(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } else if constexpr (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) { static_assert(srcShapeSize == ndc1hwc0Dims, "srcLayout's shape dims must be equal to 6!"); 
static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!"); - TransDataImplMode3(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); + TransDataImpl6HdToNcdhw(dstTensor, srcTensor, sharedTmpBuffer, tmpParams); } } } // namespace Internal } // namespace AscendC -#endif // IMPL_TRANSDATA_TRANSDATA_IMPL_H \ No newline at end of file +#endif // IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H \ No newline at end of file diff --git a/impl/transdata/transdata_tiling.cpp b/impl/transpose/transdata/transdata_tiling.cpp similarity index 99% rename from impl/transdata/transdata_tiling.cpp rename to impl/transpose/transdata/transdata_tiling.cpp index 45d1bc71..dbfc281e 100644 --- a/impl/transdata/transdata_tiling.cpp +++ b/impl/transpose/transdata/transdata_tiling.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#include "lib/transdata/transdata_tiling.h" +#include "lib/transpose/transdata_tiling.h" #include #include diff --git a/lib/kernel_api.h b/lib/kernel_api.h deleted file mode 100644 index b6f7a069..00000000 --- a/lib/kernel_api.h +++ /dev/null @@ -1,159 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file kernel_api.h - * \brief - */ -#ifndef LIB_KERNEL_API_H -#define LIB_KERNEL_API_H - -#if defined(__DAV_C310__) || defined(__DAV_310R6__) -#include "hccl/hccl.h" -#include "index/arithprogression.h" -#include "activation/sigmoid.h" -#include "activation/softmax.h" -#include "activation/simplesoftmax.h" -#include "activation/softmaxflashv2.h" -#include "activation/softmaxgrad.h" -#include "activation/gelu.h" -#include "filter/dropout.h" -#include "math/tan.h" -#include "math/tanh.h" -#include "math/floor.h" -#include "math/lgamma.h" -#include "math/log.h" -#include "math/sin.h" -#include "math/atanh.h" -#include "math/asinh.h" -#include "math/acosh.h" -#include "math/trunc.h" -#include "math/cos.h" -#include "math/fmod.h" -#include "math/hypot.h" -#include "math/power.h" -#include "math/frac.h" -#include "math/cumsum.h" -#include "math/erf.h" -#include "math/erfc.h" -#include "math/atan.h" -#include "math/is_finite.h" -#include "math/philox.h" -#include "math/sinh.h" -#include "math/cosh.h" -#include "math/sign.h" -#include "math/asin.h" -#include "math/acos.h" -#include "math/exp.h" -#include "math/xor.h" -#include "normalization/layernorm.h" -#include "normalization/welfordfinalize.h" -#include "normalization/normalize.h" -#include "pad/broadcast.h" -#include "quantization/ascend_quant.h" -#include "quantization/ascend_dequant.h" -#include "quantization/ascend_antiquant.h" -#include "quantization/quantize.h" -#include "quantization/dequantize.h" -#include "quantization/antiquantize.h" -#include "utils/init_global_memory.h" -#include "sort/sort.h" -#include "sort/topk.h" -#include "transpose/confusion_transpose.h" -#include "select/selectwithbytesmask.h" -#include "reduce/reduce.h" -#include "math/clamp.h" -#include "math/round.h" -#include "math/ceil.h" -#endif // __CCE_AICORE__ == 310 - -#if defined(__CCE_AICORE__) && (__CCE_AICORE__ != 310) -#include "filter/dropout.h" -#include "activation/sigmoid.h" -#include "activation/softmax.h" -#include 
"activation/simplesoftmax.h" -#include "activation/softmaxflashv2.h" -#include "activation/softmaxflashv3.h" -#include "activation/softmaxgrad.h" -#include "math/xor.h" -#include "math/floor.h" -#include "sort/sort.h" -#endif - -#include "std/tuple.h" -#include "std/type_traits.h" -#include "std/utility.h" -#include "std/algorithm.h" - -#if defined(__CCE_AICORE__) && (__CCE_AICORE__ < 300) -#include "index/arithprogression.h" -#include "normalization/layernormgrad.h" -#include "normalization/layernormgradbeta.h" -#include "pad/pad.h" -#include "hccl/hccl.h" -#include "math/frac.h" -#include "math/power.h" -#include "math/log.h" -#include "math/sin.h" -#include "math/cos.h" -#include "math/asin.h" -#include "math/acos.h" -#include "math/asinh.h" -#include "math/acosh.h" -#include "math/atan.h" -#include "math/cosh.h" -#include "math/erf.h" -#include "math/erfc.h" -#include "math/clamp.h" -#include "normalization/rmsnorm.h" -#include "normalization/batchnorm.h" -#include "math/tanh.h" -#include "math/atanh.h" -#include "normalization/deepnorm.h" -#include "math/exp.h" -#include "normalization/layernorm.h" -#include "normalization/welfordfinalize.h" -#include "normalization/normalize.h" -#include "reduce/sum.h" -#include "activation/silu.h" -#include "activation/gelu.h" -#include "quantization/ascend_quant.h" -#include "quantization/ascend_dequant.h" -#include "quantization/ascend_antiquant.h" -#include "activation/logsoftmax.h" -#include "activation/softmaxflash.h" -#include "transpose/confusion_transpose.h" -#include "select/selectwithbytesmask.h" -#include "math/sinh.h" -#include "activation/swiglu.h" -#include "activation/reglu.h" -#include "math/tan.h" -#include "math/round.h" -#include "math/trunc.h" -#include "activation/swish.h" -#include "sort/topk.h" -#include "activation/geglu.h" -#include "math/lgamma.h" -#include "math/digamma.h" -#include "math/sign.h" -#include "reduce/mean.h" -#include "math/axpy.h" -#include "math/ceil.h" -#include "pad/broadcast.h" 
-#include "reduce/reduce_xor_sum.h" -#include "reduce/reduce.h" -#include "transdata/transdata.h" -#include "math/cumsum.h" -#include "math/fmod.h" -#include "normalization/groupnorm.h" -#include "utils/init_global_memory.h" -#endif // __CCE_AICORE__ < 300 - -#endif // LIB_KERNEL_API_H diff --git a/lib/tiling_api.h b/lib/tiling_api.h deleted file mode 100644 index 1b83428d..00000000 --- a/lib/tiling_api.h +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file tiling_api.h - * \brief - */ -#ifndef LIB_TILING_API_H -#define LIB_TILING_API_H -#include "matmul/matmul_tiling.h" -#include "matmul/bmm_tiling.h" -#include "activation/softmax_tiling.h" -#include "activation/logsoftmax_tiling.h" -#include "filter/dropout_tiling.h" -#include "sort/sort_tiling_intf.h" -#include "index/arithprogression_tiling.h" -#include "quantization/ascend_dequant_tiling.h" -#include "quantization/ascend_quant_tiling.h" -#include "quantization/ascend_antiquant_tiling.h" -#include "quantization/quantize_tiling.h" -#include "quantization/antiquantize_tiling.h" -#include "quantization/dequantize_tiling.h" -#include "reduce/sum_tiling.h" -#include "activation/silu_tiling.h" -#include "activation/swish_tiling.h" -#include "activation/gelu_tiling.h" -#include "pad/pad_tiling.h" -#include "normalization/rmsnorm_tiling.h" -#include "normalization/deepnorm_tiling.h" -#include "normalization/layernorm_tiling.h" -#include "normalization/normalize_tiling.h" -#include "normalization/batchnorm_tiling.h" -#include "normalization/layernorm_grad_tiling.h" -#include "normalization/layernorm_grad_beta_tiling.h" -#include "normalization/welfordfinalize_tiling.h" -#include "transpose/confusion_transpose_tiling.h" -#include "tiling/platform/platform_ascendc.h" -#include "sort/topk_tiling.h" -#include "math/tanh_tiling.h" -#include "activation/sigmoid_tiling.h" -#include "math/frac_tiling.h" -#include "math/acos_tiling.h" -#include "math/asin_tiling.h" -#include "math/acosh_tiling.h" -#include "math/asinh_tiling.h" -#include "math/sin_tiling.h" -#include "math/cos_tiling.h" -#include "math/atan_tiling.h" -#include "math/power_tiling.h" -#include "math/log_tiling.h" -#include "math/cosh_tiling.h" -#include "math/clamp_tiling.h" -#include "math/erf_tiling.h" -#include "math/erfc_tiling.h" -#include "math/round_tiling.h" -#include "math/sinh_tiling.h" -#include "activation/swiglu_tiling.h" -#include "math/tan_tiling.h" -#include "math/hypot_tiling.h" -#include 
"select/selectwithbytesmask_tiling.h" -#include "math/trunc_tiling.h" -#include "activation/geglu_tiling.h" -#include "math/lgamma_tiling.h" -#include "math/digamma_tiling.h" -#include "math/atanh_tiling.h" -#include "math/xor_tiling.h" -#include "math/sign_tiling.h" -#include "reduce/mean_tiling.h" -#include "math/exp_tiling.h" -#include "math/axpy_tiling.h" -#include "math/ceil_tiling.h" -#include "math/floor_tiling.h" -#include "activation/reglu_tiling.h" -#include "pad/broadcast_tiling.h" -#include "reduce/reduce_xor_sum_tiling.h" -#include "reduce/reduce_tiling.h" -#include "math/cumsum_tiling.h" -#include "math/fmod_tiling.h" -#include "normalization/groupnorm_tiling.h" -#include "transdata/transdata_tiling.h" -#include "hccl/hccl_tilingdata.h" -#include "hccl/hccl_tiling.h" -#endif // LIB_TILING_API_H diff --git a/lib/transdata/transdata.h b/lib/transpose/transdata.h similarity index 91% rename from lib/transdata/transdata.h rename to lib/transpose/transdata.h index 755a0086..c0075cf5 100644 --- a/lib/transdata/transdata.h +++ b/lib/transpose/transdata.h @@ -7,14 +7,14 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef LIB_TRANSDATA_TRANSDATA_H -#define LIB_TRANSDATA_TRANSDATA_H +#ifndef LIB_TRANSPOSE_TRANSDATA_H +#define LIB_TRANSPOSE_TRANSDATA_H #if __CCE_AICORE__ == 220 #include "transdata_common.h" #include "kernel_tensor.h" #include "kernel_operator_intf.h" #include "kernel_pop_stack_buffer.h" -#include "../../impl/transdata/transdata_impl.h" +#include "../../impl/transpose/transdata/transdata_impl.h" #if ASCENDC_CPU_DEBUG #include "kernel_log.h" #include @@ -45,4 +45,4 @@ __aicore__ inline void TransData(const LocalTensor& dstTensor, const LocalTen } } // namespace AscendC #endif -#endif // LIB_TRANSDATA_TRANSDATA_H \ No newline at end of file +#endif // LIB_TRANSPOSE_TRANSDATA_H \ No newline at end of file diff --git a/lib/transdata/transdata_common.h b/lib/transpose/transdata_common.h similarity index 88% rename from lib/transdata/transdata_common.h rename to lib/transpose/transdata_common.h index 644d78ab..0421a3ca 100644 --- a/lib/transdata/transdata_common.h +++ b/lib/transpose/transdata_common.h @@ -7,8 +7,8 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef LIB_TRANSDATA_TRANSDATA_COMMON_H -#define LIB_TRANSDATA_TRANSDATA_COMMON_H +#ifndef LIB_TRANSPOSE_TRANSDATA_COMMON_H +#define LIB_TRANSPOSE_TRANSDATA_COMMON_H namespace AscendC { template @@ -26,4 +26,4 @@ struct TransDataConfig { #endif // ASCC_PARAM_TRANSDATACONFIG } // namespace AscendC -#endif // LIB_TRANSDATA_TRANSDATA_COMMON_H \ No newline at end of file +#endif // LIB_TRANSPOSE_TRANSDATA_COMMON_H \ No newline at end of file diff --git a/lib/transdata/transdata_tiling.h b/lib/transpose/transdata_tiling.h similarity index 94% rename from lib/transdata/transdata_tiling.h rename to lib/transpose/transdata_tiling.h index 87559a51..f2d72221 100644 --- a/lib/transdata/transdata_tiling.h +++ b/lib/transpose/transdata_tiling.h @@ -12,8 +12,8 @@ * \file transdata_tiling.h * \brief */ -#ifndef LIB_TRANSDATA_TRANSDATA_TILING_H -#define LIB_TRANSDATA_TRANSDATA_TILING_H +#ifndef LIB_TRANSPOSE_TRANSDATA_TILING_H +#define LIB_TRANSPOSE_TRANSDATA_TILING_H #include #include "graph/tensor.h" #include "tiling/platform/platform_ascendc.h" @@ -64,4 +64,4 @@ bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform const TransDataConfig &config, uint32_t &maxValue, uint32_t &minValue); } // AscendC -#endif // LIB_TRANSDATA_TRANSDATA_TILING_H \ No newline at end of file +#endif // LIB_TRANSPOSE_TRANSDATA_TILING_H \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 133a36fd..a7211593 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,95 +153,8 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/sort/topk/test_operator_topk.cpp ${ASCENDC_TESTS_DIR}/normalization/welfordfinalize/test_operator_welfordfinalize.cpp ${ASCENDC_TESTS_DIR}/utils/init_global_memory/test_operator_init_global_memory.cpp -<<<<<<< HEAD ${ASCENDC_TESTS_DIR}/normalization/layernormV2/test_operator_layernormV2.cpp -======= - ${ASCENDC_TESTS_DIR}/std/sequence/test_sequence.cpp - 
${ASCENDC_TESTS_DIR}/std/tuple/*.cpp - ${ASCENDC_TESTS_DIR}/std/type_traits/*.cpp - ${ASCENDC_TESTS_DIR}/transdata/*cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/geglu/kernel_geglu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_gelu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_faster_gelu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/gelu/kernel_faster_geluv2_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/reglu/kernel_reglu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/sigmoid/kernel_sigmoid_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/silu/kernel_silu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/swiglu/kernel_swiglu_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/swish/kernel_swish_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/adjust_softmax_res/kernel_adjust_softmax_res_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/log_softmax/kernel_log_softmax_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/simple_softmax/kernel_simple_softmax_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax/kernel_softmax_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flash/kernel_softmax_flash_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flashv2/kernel_softmax_flashv2_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_flashv3/kernel_softmax_flashv3_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_grad/kernel_softmax_grad_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/activation/softmax/softmax_grad_front/kernel_softmax_grad_front_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/filter/droupout/kernel_droupout_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/index/arithprogression/kernel_arithprogression_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/batchnorm/kernel_batchnorm_check.cpp - 
${ASCENDC_TESTS_DIR}/api_check/normalization/deepnorm/kernel_deepnorm_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/groupnorm/kernel_groupnorm_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/layernorm/kernel_layernorm_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/layernormgrad/kernel_layernormgrad_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/layernormgradbeta/kernel_layernormgradbeta_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/normalize/kernel_normalize_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/rmsnorm/kernel_rmsnorm_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/welfordfinalize/kernel_welfordfinalize_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/normalization/welfordupdate/kernel_welfordupdate_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/pad/broadcast/kernel_broadcast_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/pad/pad/kernel_pad_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/pad/unpad/kernel_unpad_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/quantization/antiquant/kernel_antiquant_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/quantization/dequant/kernel_dequant_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/quantization/quant/kernel_quant_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/select/selectwithbytesmask/kernel_selectwithbytesmask_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/sort/topk/kernel_topk_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/transpose/confusion_transpose/kernel_confusion_transpose_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/utils/init_global_memory/kernel_init_global_memory_check.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/acos/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/acosh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/asin/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/asinh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/atan/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/atanh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/axpy/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/ceil/*.cpp - 
${ASCENDC_TESTS_DIR}/api_check/math/clamp/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/cos/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/cosh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/cumsum/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/digamma/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/erf/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/erfc/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/exp/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/floor/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/fmod/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/frac/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/lgamma/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/log/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/power/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/round/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/sign/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/sin/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/sinh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/tan/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/tanh/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/trunc/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/math/xor/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/mean/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/sum/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_xor_sum/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_all/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_any/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_max/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_min/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_sum/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_prod/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/reduce/reduce_mean/*.cpp - ${ASCENDC_TESTS_DIR}/api_check/transdata/*.cpp ->>>>>>> 4d201cad (add transdata high api fractal_z_3d to ncdhw) + ${ASCENDC_TESTS_DIR}/transpose/transdata/*cpp ) # ascend910B1 aic test cases @@ -492,11 +405,8 @@ file(GLOB ASCENDC_TILING_SRC_FILES ${ASCENDC_API_DIR}/impl/quantization/quant/*.cpp ${ASCENDC_API_DIR}/impl/sort/topk/*.cpp 
${ASCENDC_API_DIR}/impl/reduce/reduce_tiling.cpp -<<<<<<< HEAD ${ASCENDC_API_DIR}/impl/normalization/layernormV2/*.cpp -======= - ${ASCENDC_API_DIR}/impl/transdata/transdata_tiling.cpp ->>>>>>> 4a0a42bb (update) + ${ASCENDC_API_DIR}/impl/transpose/transdata/transdata_tiling.cpp ) # ascendc_tiling_utest @@ -552,15 +462,11 @@ foreach(product_type ${PRODUCT_TYPE_LIST}) ${ASCENDC_API_DIR}/lib/reduce/ ${ASCENDC_API_DIR}/lib/select/ ${ASCENDC_API_DIR}/lib/transpose/ -<<<<<<< HEAD ${ASCENDC_API_DIR}/lib/matmul/ ${ASCENDC_API_DIR}/lib/math/ ${ASCENDC_API_DIR}/lib/normalization/ ${ASCENDC_API_DIR}/lib/quantization/ ${ASCENDC_API_DIR}/lib/sort/ -======= - ${ASCENDC_API_DIR}/lib/transdata/ ->>>>>>> 4a0a42bb (update) ${ASCENDC_TESTS_DIR}/common/ ) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 09630754..b2835619 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -32,1311 +32,7 @@ protected: void TearDown() {} }; - -<<<<<<< HEAD -======= -#if defined(__DAV_C310__) || defined(__DAV_310R6__) -TEST_F(TestTiling, TestSoftMaxTiling) -{ - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - std::vector shapeDims = { 128, 128 }; - optiling::SoftMaxTiling tilingData; - auto softmaxShape = ge::Shape(shapeDims); - uint32_t softmaxTmpSize = 100 * 1024 * 4; - uint32_t softmaxNeedMinSize = GetSoftMaxMinTmpSize(ascendcPlatform, softmaxShape, 2, true); - EXPECT_EQ(softmaxNeedMinSize, 128 * (16 + 128) * 4); - uint32_t softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 2, true, true); - EXPECT_EQ(softmaxFlashNeedMinSize, (16 * 4 + 128 * 2) * 4); - softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 4, true, true); - EXPECT_EQ(softmaxFlashNeedMinSize, (8 * 4 + 128 * 2) * 4); - softmaxFlashNeedMinSize = GetSoftMaxFlashMinTmpSize(softmaxShape, 4, false, true); - EXPECT_EQ(softmaxFlashNeedMinSize, (8 + 128 + 64) * 
4); - uint32_t softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 2, true, true); - EXPECT_EQ(softmaxGradNeedMinSize, (16 * 2 + 128 * 3 + 64) * 4); - softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 4, true, true); - EXPECT_EQ(softmaxGradNeedMinSize, (8 + 128 + 64) * 4); - softmaxGradNeedMinSize = GetSoftMaxGradMinTmpSize(softmaxShape, 0, true, true); - EXPECT_EQ(softmaxGradNeedMinSize, 0); - - uint32_t softmaxNeedMaxSize = GetSoftMaxMaxTmpSize(ascendcPlatform, softmaxShape, 2, true); - EXPECT_EQ(softmaxNeedMaxSize, 128 * (16 + 128 + 64) * 4); - softmaxNeedMaxSize = GetSoftMaxMaxTmpSize(ascendcPlatform, softmaxShape, 1, true); - EXPECT_EQ(softmaxNeedMaxSize, 0); - uint32_t softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 2, true, true); - EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (16 * 4 + 128 * 2) * 4); - softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 4, false, true); - EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (8 + 128 + 64) * 4); - softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 4, true, true); - EXPECT_EQ(softmaxFlashNeedMaxSize, 128 * (8 * 4 + 128 * 2) * 4); - softmaxFlashNeedMaxSize = GetSoftMaxFlashMaxTmpSize(softmaxShape, 1, true, true); - EXPECT_EQ(softmaxFlashNeedMaxSize, 0); - uint32_t softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 2, true, true); - EXPECT_EQ(softmaxGradNeedMaxSize, 128 * (16 * 2 + 128 * 3 + 64) * 4); - softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 4, true, true); - EXPECT_EQ(softmaxGradNeedMaxSize, 128 * (8 + 128 + 64) * 4); - softmaxGradNeedMaxSize = GetSoftMaxGradMaxTmpSize(softmaxShape, 1, true, true); - EXPECT_EQ(softmaxGradNeedMaxSize, 0); - SoftMaxTilingFunc(softmaxShape, 2, softmaxTmpSize, tilingData); - EXPECT_EQ(tilingData.get_reduceM(), 64); - bool flag = IsBasicBlockInSoftMax(tilingData); - EXPECT_EQ(flag, true); - SoftMaxFlashTilingFunc(softmaxShape, 2, 77952, tilingData, true); - 
EXPECT_EQ(tilingData.get_reduceM(), 32); - SoftMaxFlashTilingFunc(softmaxShape, 2, 77952, tilingData, false); - EXPECT_EQ(tilingData.get_reduceM(), 64); - SoftMaxGradTilingFunc(softmaxShape, 2, softmaxTmpSize, tilingData, false); - EXPECT_EQ(tilingData.get_reduceM(), 64); - SoftMaxGradTilingFunc(softmaxShape, 4, softmaxTmpSize, tilingData, false); - EXPECT_EQ(tilingData.get_reduceM(), 64); - SoftMaxGradTilingFunc(softmaxShape, 2, 133120, tilingData, true); - EXPECT_EQ(tilingData.get_reduceM(), 64); -} - -TEST_F(TestTiling, TestSoftMaxFlashV2TilingMaxMinTmpSize) -{ - uint32_t softmaxflashV2NeedMinLength = 0; - uint32_t softmaxflashV2NeedMaxLength = 0; - - std::vector shapeDims = { 3, 3, 448 }; - auto softmaxShape = ge::Shape(shapeDims); - uint32_t dataTypeSize1 = 2; - uint32_t dataTypeSize2 = 2; - uint32_t isUpdate = 0; - uint32_t isBasicBlock = 0; - uint32_t isFlashOutputBrc = 1; - - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 17504); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 19008); - - shapeDims = {7, 1072}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 31296); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, 
isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 32256); - - shapeDims = {1, 2, 3, 1, 2, 1, 16}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 2240); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 4608); - - shapeDims = {2, 6, 1, 16}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 2240); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 4608); - - shapeDims = {6, 1664}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 41184); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 41856); - - shapeDims = {2, 1760 }; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - 
isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 15200); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 15200); - - shapeDims = {1, 5536 }; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 23232); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 23232); - - shapeDims = {2, 2, 2352}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 38816); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 38912); - - shapeDims = {2, 2, 2, 480 }; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 0; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, 
isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 16672); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 17920); - - shapeDims = {2, 3632}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 1; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 29440); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 29824); - - shapeDims = {2, 4, 96}; - softmaxShape = ge::Shape(shapeDims); - dataTypeSize1 = 2; - dataTypeSize2 = 2; - isUpdate = 1; - isBasicBlock = 0; - isFlashOutputBrc = 1; - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 3840); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 6144); - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, 1, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 0); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, 1, dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 0); - - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, 1, 
dataTypeSize2, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMinLength, 0); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, dataTypeSize1, 1, isUpdate, isBasicBlock, isFlashOutputBrc); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 0); -} - -TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) -{ - std::vector shapeDims = { 128, 128 }; - optiling::SoftMaxTiling tilingData; - auto softmaxShape = ge::Shape(shapeDims); - uint32_t maxSumTypeSize = 2; - uint32_t inputTypeSize = 2; - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - uint32_t softmaxflashV2NeedMinLength = - GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, false); - EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (16 + 128) * 4); - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); - EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (128 + 16)) * 4); - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); - EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (128 + 16) * 4); - - uint32_t softmaxflashV2NeedMaxLength = - GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, false); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16) * 4); - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16 * 2) * 4); - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 16) * 4); - - maxSumTypeSize = 4; - softmaxflashV2NeedMinLength 
= GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); - EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (128 + 16 + 8)) * 4); - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); - EXPECT_EQ(softmaxflashV2NeedMinLength, 128 * (128 + 8) * 4); - - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, false); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 8 * 2) * 4); - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, false, true); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (128 + 64 + 8) * 4); - - uint32_t workLength = 100 * 1024; - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, false, false); - EXPECT_EQ(tilingData.get_reduceM(), 120); - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, false, true); - EXPECT_EQ(tilingData.get_reduceM(), 64); - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, false); - EXPECT_EQ(tilingData.get_reduceM(), 120); - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); - EXPECT_EQ(tilingData.get_reduceM(), 64); - - inputTypeSize = 4; - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMinLength, (128 * 2 + 128 * (16)) * 4); - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 128 * (64 + 8) * 4); - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); - 
EXPECT_EQ(tilingData.get_reduceM(), 64); -} - -TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) -{ - std::vector shapeDims = { 8, 1024 }; - optiling::SoftMaxTiling tilingData; - auto softmaxShape = ge::Shape(shapeDims); - uint32_t maxSumTypeSize = 4; - uint32_t inputTypeSize = 4; - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - uint32_t softmaxflashV2NeedMinLength = - GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMinLength, (64 + 8 * (16)) * 4); - uint32_t softmaxflashV2NeedMaxLength = - GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 8*(8 + 64) * 4); - - uint32_t workLength = 32 * 1024; - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); - EXPECT_EQ(tilingData.get_reduceM(), 8); - - inputTypeSize = 2; - workLength = 64 * 1024; - softmaxflashV2NeedMinLength = GetSoftMaxFlashV2MinTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMinLength, (64 + 8 * (16 + 1024 + 8)) * 4); - softmaxflashV2NeedMaxLength = GetSoftMaxFlashV2MaxTmpSize(ascendcPlatform, softmaxShape, inputTypeSize, maxSumTypeSize, true, true); - EXPECT_EQ(softmaxflashV2NeedMaxLength, 8 * (8 + 1024 + 64) * 4); - SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); - EXPECT_EQ(tilingData.get_reduceM(), 8); -} - -TEST_F(TestTiling, TestWelfordUpdateTiling) -{ - std::vector shapeDims1d = {1, 128}; - auto shape1d = ge::Shape(shapeDims1d); - uint32_t maxSize = 0; - uint32_t minSize = 0; - uint32_t dtypeTSize = sizeof(half); - uint32_t dtypeUSize = sizeof(float); - bool isReuseSource = false; - gert::TilingContext* context = fe::GetFakeTilingContext(); - 
auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - GetWelfordUpdateMaxMinTmpSize(shape1d, dtypeTSize, dtypeUSize, isReuseSource, false, ascendcPlatform, maxSize, minSize); - EXPECT_EQ(minSize, 0); - EXPECT_EQ(maxSize, 0); -} - -TEST_F(TestTiling, TestWelfordFinalizeTiling) -{ - std::vector shapeDims1d = {64}; - auto shape1d = ge::Shape(shapeDims1d); - uint32_t maxSize = 0; - uint32_t minSize = 0; - uint32_t dtypeSize = sizeof(float); - bool isReuseSource = false; - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - GetWelfordFinalizeMaxMinTmpSize(shape1d, dtypeSize, isReuseSource, ascendcPlatform, maxSize, minSize); - EXPECT_EQ(minSize, 768); - EXPECT_EQ(maxSize, 768); -} - -TEST_F(TestTiling, TestLayerNormRstdTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = sizeof(float); - std::vector shapeDims = {128, 88}; - auto layernormShape = ge::Shape(shapeDims); - bool isReuseSource = false; - bool isComputeRstd = true; - bool isOnlyOutput = false; - optiling::LayerNormSeparateTiling tiling; - gert::TilingContext* context = fe::GetFakeTilingContext(); - auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - uint32_t minValue = 0; - uint32_t maxValue = 0; - GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, isComputeRstd, isOnlyOutput, ascendcPlatform, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * typeSize + 128 * typeSize); - EXPECT_EQ(minValue, 128 * typeSize + 128 * typeSize); - GetNormalizeMaxMinTmpSize(layernormShape, typeSize, typeSize, isReuseSource, isComputeRstd, isOnlyOutput, ascendcPlatform, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - GetLayerNormNDTilingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, isComputeRstd, ascendcPlatform, tiling); - EXPECT_EQ(tiling.get_rLength(), 88); - 
EXPECT_EQ(tiling.get_rHeadLength(), 64); -} - -TEST_F(TestTiling, TestAntiquantTilingNoTransposeFP4) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1, 5120 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = false; - uint32_t maxValue; - uint32_t minValue; - GetAscendAntiQuantMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); -} - -TEST_F(TestTiling, TestAntiquantTilingTransposeFP4) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1, 5120 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = true; - uint32_t maxValue; - uint32_t minValue; - GetAscendAntiQuantMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); - EXPECT_EQ(minValue, 10240); - EXPECT_EQ(maxValue, 10240); -} - -TEST_F(TestTiling, TestAntiquantizeTilingNoTransposeFP4) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1, 5120 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = false; - uint32_t maxValue; - uint32_t minValue; - GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_FLOAT4_E2M1, ge::DT_FLOAT16, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); -} - -TEST_F(TestTiling, TestDequantizeTiling) -{ - // 2d input shape - std::vector shape_dims = {10, 32}; - auto shape = ge::Shape(shape_dims); - uint32_t maxValue; - uint32_t minValue; - - GetDequantizeMaxMinTmpSize(shape, 2, maxValue, minValue); - EXPECT_EQ(minValue, 4 * (64 + 32 + 40)); - EXPECT_EQ(maxValue, 4 * (64 + 32 * 10 + 40)); - - // 1d input shape - std::vector shape_dims_1d = {320}; - auto shape_1d = ge::Shape(shape_dims_1d); - - GetDequantizeMaxMinTmpSize(shape_1d, 2, maxValue, minValue); - EXPECT_EQ(minValue, 4 * 
(64 + 1 * 320 + 328)); - EXPECT_EQ(maxValue, 4 * (64 + 1 * 320 + 328)); -} - -TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerChannelHalf) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1, 5120 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = false; - uint32_t maxValue; - uint32_t minValue; - GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_FLOAT16, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); -} - -TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerChannel) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1, 5120 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = false; - uint32_t maxValue; - uint32_t minValue; - GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_BF16, maxValue, minValue); - uint32_t expectValue = 5120 * 2 * sizeof(float) + 64 * 640 * sizeof(float); - EXPECT_EQ(minValue, expectValue); - EXPECT_EQ(maxValue, expectValue); -} - -TEST_F(TestTiling, TestAntiquantizeTilingNoTransposePerTensor) -{ - std::vector srcDims = { 640, 5120 }; - auto srcShape = ge::Shape(srcDims); - std::vector offsetDSms = { 1 }; - auto offsetShape = ge::Shape(offsetDSms); - bool isTranspose = false; - uint32_t maxValue; - uint32_t minValue; - GetAntiQuantizeMaxMinTmpSize(srcShape, offsetShape, isTranspose, ge::DT_INT8, ge::DT_BF16, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 640 * 5120 * sizeof(float)); -} - -TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutput) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_INT16; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, 
isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 9728); - EXPECT_EQ(minValue, 9728); -} - -TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputB8) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_UINT8; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 7680); - EXPECT_EQ(minValue, 7680); -} - -TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputB64) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_INT64; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 15872); - EXPECT_EQ(minValue, 15872); -} - -TEST_F(TestTiling, testAdvanceSortTilingDescendOrder) -{ - std::vector shapeDims = { 1023 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_UINT32; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = true; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - 
uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 11776); - EXPECT_EQ(minValue, 11776); -} - -TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndex) -{ - std::vector shapeDims = { 300 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_FLOAT; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = true; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 5312); - EXPECT_EQ(minValue, 5312); -} - -TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndexForB8) -{ - std::vector shapeDims = { 300 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_UINT8; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = true; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 2112); - EXPECT_EQ(minValue, 2112); -} - -TEST_F(TestTiling, testAdvanceSortTilingWithBothSrcDstIndex) -{ - std::vector shapeDims = { 4096 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_UINT16; - ge::DataType indexType = ge::DT_UINT64; - bool isDescend = false; - bool hasSrcIndex = true; - bool hasDstIndex = true; - bool isReuseSource = false; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, 
valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 70144); - EXPECT_EQ(minValue, 70144); -} - -TEST_F(TestTiling, testAdvanceSortTilingOnlyDataOutputReuseSource) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_INT16; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = true; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 7680); - EXPECT_EQ(minValue, 7680); -} - -TEST_F(TestTiling, testAdvanceSortTilingDescendOrderReuseSource) -{ - std::vector shapeDims = { 1023 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_UINT32; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = true; - bool hasSrcIndex = false; - bool hasDstIndex = false; - bool isReuseSource = true; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 7680); - EXPECT_EQ(minValue, 7680); -} - -TEST_F(TestTiling, testAdvanceSortTilingWithExtraDstIndexReuseSource) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_INT32; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = false; - bool hasDstIndex = true; - bool isReuseSource = true; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, 
maxValue, minValue); - - EXPECT_EQ(maxValue, 11776); - EXPECT_EQ(minValue, 11776); -} - -TEST_F(TestTiling, testAdvanceSortTilingWithBothSrcDstIndexReuseSource) -{ - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - ge::DataType valueType = ge::DT_INT16; - ge::DataType indexType = ge::DT_UINT32; - bool isDescend = false; - bool hasSrcIndex = true; - bool hasDstIndex = true; - bool isReuseSource = true; - SortConfig config = { SortType::RADIX_SORT, isDescend, hasSrcIndex, hasDstIndex }; - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetSortMaxMinTmpSize(srcShape, valueType, indexType, isReuseSource, config, maxValue, minValue); - - EXPECT_EQ(maxValue, 7680); - EXPECT_EQ(minValue, 7680); -} - -extern void platfrom_stub_set_chip_version(const char *num); -TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - bool isInitIndex = true; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 256); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_innerDataSize(), 128); - EXPECT_EQ(tilingData.get_sortRepeat(), 2); - EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); - EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); - EXPECT_EQ(tilingData.get_maskOffset(), 16); - EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); - EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 1024); - EXPECT_EQ(minValue, 
1024); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse_Float_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 320); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_innerDataSize(), 128); - EXPECT_EQ(tilingData.get_sortRepeat(), 2); - EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); - EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); - EXPECT_EQ(tilingData.get_maskOffset(), 16); - EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); - EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); - EXPECT_EQ(tilingData.get_srcIndexOffset(), 256); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 1280); - EXPECT_EQ(minValue, 1280); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Half_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - bool isInitIndex = true; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 2; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 512); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_innerDataSize(), 256); - 
EXPECT_EQ(tilingData.get_sortRepeat(), 2); - EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); - EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); - EXPECT_EQ(tilingData.get_maskOffset(), 16); - EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); - EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 1024); - EXPECT_EQ(minValue, 1024); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse_Half_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 2; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 640); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_innerDataSize(), 256); - EXPECT_EQ(tilingData.get_sortRepeat(), 2); - EXPECT_EQ(tilingData.get_kAlignFourBytes(), 16); - EXPECT_EQ(tilingData.get_kAlignTwoBytes(), 16); - EXPECT_EQ(tilingData.get_maskOffset(), 16); - EXPECT_EQ(tilingData.get_maskVreducev2FourBytes(), 20); - EXPECT_EQ(tilingData.get_maskVreducev2TwoBytes(), 40); - EXPECT_EQ(tilingData.get_srcIndexOffset(), 512); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 1280); - EXPECT_EQ(minValue, 1280); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexTrue_Float_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = true; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t 
dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, false, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 128); - EXPECT_EQ(tilingData.get_maskOffset(), 10); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 512); - EXPECT_EQ(minValue, 512); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Float_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_maskOffset(), 10); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 192); - - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 768); - EXPECT_EQ(minValue, 768); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexTrue_Half_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = true; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 2; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = 
platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 256); - EXPECT_EQ(tilingData.get_maskOffset(), 10); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 512); - EXPECT_EQ(minValue, 512); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Half_Inner64) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 2; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, false, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_maskOffset(), 10); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 384); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 768); - EXPECT_EQ(minValue, 768); -} - -TEST_F(TestTiling, TestTopkTiling_DataTypeSize0_FAILED) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 0; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - auto res = TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, false, tilingData); - EXPECT_EQ(res, false); -} - -TEST_F(TestTiling, 
TestTopkTiling_TopKModeSmall_isInitIndexFalse_Half_k) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - int32_t k = 13; - uint32_t dataTypeSize = 2; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_maskOffset(), 13); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 384); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 768); - EXPECT_EQ(minValue, 768); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeSmall_isInitIndexFalse_Float_k32) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 32; - uint32_t dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - TopKTilingFunc(plat, inner, outter, k, dataTypeSize, isInitIndex, topkMode, true, tilingData); - EXPECT_EQ(tilingData.get_allDataSize(), 64); - EXPECT_EQ(tilingData.get_maskOffset(), 32); - EXPECT_EQ(tilingData.get_tmpLocalSize(), 192); - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, true, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 768); - EXPECT_EQ(minValue, 768); -} - -TEST_F(TestTiling, TestTopkTiling_RadixTopKModeSmall_isInitIndexFalse) -{ - enum TopKMode topkMode = TopKMode::TOPK_NSMALL; - const int32_t outter = 1; - const int32_t inner = 32; - const int32_t k = 10; - ge::DataType valueType = ge::DT_INT16; 
- bool isReuseSource = false; - bool isInitIndex = false; - TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; - - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, - true, valueType, config, maxValue, minValue); - EXPECT_EQ(maxValue, 1696); - EXPECT_EQ(minValue, 1696); -} - -TEST_F(TestTiling, TestTopkTiling_RadixTopKModeNormal_isInitIndexFalse) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - const int32_t outter = 1; - const int32_t inner = 32; - const int32_t k = 10; - ge::DataType valueType = ge::DT_INT16; - bool isReuseSource = false; - bool isInitIndex = false; - TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; - - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, - true, valueType, config, maxValue, minValue); - EXPECT_EQ(maxValue, 1696); - EXPECT_EQ(minValue, 1696); -} - -TEST_F(TestTiling, TestTopkTiling_RadixTopKModeNormal_isInitIndexTrue) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - const int32_t outter = 1; - const int32_t inner = 32; - const int32_t k = 10; - ge::DataType valueType = ge::DT_INT16; - bool isReuseSource = true; - bool isInitIndex = true; - TopKConfig config = { TopKAlgo::RADIX_SELECT, TopKOrder::UNSET, true}; - - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTopKMaxMinTmpSize(inner, outter, k, isReuseSource, isInitIndex, topkMode, - true, valueType, config, maxValue, minValue); - EXPECT_EQ(maxValue, 1504); - EXPECT_EQ(minValue, 1504); -} - -TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexFalse) -{ - enum TopKMode topkMode = TopKMode::TOPK_NORMAL; - bool isInitIndex = false; - const int32_t outter = 1; - const int32_t inner = 64; - const int32_t k = 10; - uint32_t dataTypeSize = 4; - bool isReuseSource = true; - uint32_t maxValue = 0; - uint32_t minValue = 0; - optiling::TopkTiling tilingData; - 
fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - - GetTopKMaxMinTmpSize(plat, inner, outter, isReuseSource, isInitIndex, topkMode, false, 4, maxValue, minValue); - EXPECT_EQ(maxValue, 1280); - EXPECT_EQ(minValue, 1280); -} - -TEST_F(TestTiling, TestPowerTiling) -{ - std::vector shapeDims = { 1, 512 }; - auto powerShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(maxVal, 512 * 4 * 2); - GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(minVal, 512 * 4 * 2); - - std::vector scalar_shape = { 1 }; - auto scalarShape = ge::Shape(scalar_shape); - GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(maxVal, 512 * 4 * 2); - GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetPowerMaxMinTmpSize(scalarShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(minVal, 512 * 4 * 2); -} - -TEST_F(TestTiling, TestPowerTilingFactorSize) -{ - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - - GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); - - GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); - - GetPowerTmpBufferFactorSize(false, true, true, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestPowerTilingWithConfig) -{ - std::vector shapeDims = 
{ 1, 512 }; - auto powerShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - - AscendC::PowerConfig intrinsicConfig = { AscendC::PowerAlgo::INTRINSIC }; - AscendC::PowerConfig doubleFloatTechConfig = { AscendC::PowerAlgo::DOUBLE_FLOAT_TECH }; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - GetPowerMaxMinTmpSize(plat, intrinsicConfig, powerShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(maxVal, 0); - GetPowerMaxMinTmpSize(plat, intrinsicConfig, powerShape, powerShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - - std::vector scalar_shape = { 1 }; - auto scalarShape = ge::Shape(scalar_shape); - GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, powerShape, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(maxVal, 512 * 4 * 2); - GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, powerShape, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetPowerMaxMinTmpSize(plat, doubleFloatTechConfig, scalarShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 2); - EXPECT_EQ(minVal, 512 * 4 * 2); -} - -TEST_F(TestTiling, TestPowerTilingFactorSizeWithConfig) -{ - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - - AscendC::PowerConfig intrinsicConfig = { AscendC::PowerAlgo::INTRINSIC }; - AscendC::PowerConfig doubleFloatTechConfig = { AscendC::PowerAlgo::DOUBLE_FLOAT_TECH }; - fe::PlatFormInfos platformInfo; - auto plat = platform_ascendc::PlatformAscendC(&platformInfo); - - GetPowerTmpBufferFactorSize(plat, intrinsicConfig, false, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - - GetPowerTmpBufferFactorSize(plat, intrinsicConfig, false, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - - 
GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); - - GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); - - GetPowerTmpBufferFactorSize(plat, doubleFloatTechConfig, false, true, true, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCosTilingFloatWithConfig) -{ - std::vector shapeDims = { 128, 128 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - AscendC::CosConfig polyConfig = { AscendC::CosAlgo::POLYNOMIAL_APPROXIMATION }; - AscendC::CosConfig radinConfig = { AscendC::CosAlgo::RADIAN_REDUCTION }; - - AscendC::GetCosMaxMinTmpSize(polyConfig, cosShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - AscendC::GetCosMaxMinTmpSize(radinConfig, cosShape, 4, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4 + 32); - - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(polyConfig, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetCosTmpBufferFactorSize(radinConfig, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 32); -} - -TEST_F(TestTiling, TestCosTilingHalfWithConfig) -{ - std::vector shapeDims = { 512 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - AscendC::CosConfig polyConfig = { AscendC::CosAlgo::POLYNOMIAL_APPROXIMATION }; - AscendC::CosConfig radinConfig = { AscendC::CosAlgo::RADIAN_REDUCTION }; - - AscendC::GetCosMaxMinTmpSize(polyConfig, cosShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - - AscendC::GetCosMaxMinTmpSize(radinConfig, cosShape, 2, false, maxValue, minValue); 
- EXPECT_EQ(maxValue, 512 * 2 * 4 + 32); - EXPECT_EQ(minValue, 512 * 2 * 4 + 32); - - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(polyConfig, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetCosTmpBufferFactorSize(radinConfig, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestHypotTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetHypotMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetHypotTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSinTilingFloatWithConfig) -{ - std::vector shapeDims = { 128, 128 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - AscendC::SinConfig polyConfig = { AscendC::SinAlgo::POLYNOMIAL_APPROXIMATION }; - AscendC::SinConfig radinConfig = { AscendC::SinAlgo::RADIAN_REDUCTION }; - - AscendC::GetSinMaxMinTmpSize(polyConfig, sinShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - AscendC::GetSinMaxMinTmpSize(radinConfig, sinShape, 4, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4 + 32); - - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetSinTmpBufferFactorSize(polyConfig, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetSinTmpBufferFactorSize(radinConfig, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 32); -} - -TEST_F(TestTiling, TestSinTilingHalfWithConfig) -{ - std::vector shapeDims = { 512 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - 
AscendC::SinConfig polyConfig = { AscendC::SinAlgo::POLYNOMIAL_APPROXIMATION }; - AscendC::SinConfig radinConfig = { AscendC::SinAlgo::RADIAN_REDUCTION }; - - AscendC::GetSinMaxMinTmpSize(polyConfig, sinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - - AscendC::GetSinMaxMinTmpSize(radinConfig, sinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 4 + 32); - EXPECT_EQ(minValue, 512 * 2 * 4 + 32); - - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetSinTmpBufferFactorSize(polyConfig, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetSinTmpBufferFactorSize(radinConfig, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestConfusionTransposeTiling) -{ - const uint32_t stackBufferSize = 0; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 32, 64, 128 }; - auto srcShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 13, maxValue, minValue); - AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 14, maxValue, minValue); - AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, typeSize, 15, maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); - - optiling::ConfusionTransposeTiling tiling; - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 13, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 14, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 15, tiling); -} - -#else extern void platfrom_stub_set_chip_version(const char *num); ->>>>>>> 4a0a42bb (update) TEST_F(TestTiling, MultiCoreSmallMN) { matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4,rnnMatmul5; diff --git a/tests/transdata/test_operator_transdata.cpp 
b/tests/transpose/transdata/test_operator_transdata.cpp similarity index 100% rename from tests/transdata/test_operator_transdata.cpp rename to tests/transpose/transdata/test_operator_transdata.cpp diff --git a/tiling/tiling_api.h b/tiling/tiling_api.h deleted file mode 100644 index 015f68e4..00000000 --- a/tiling/tiling_api.h +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file tiling_api.h - * \brief - */ -#ifndef TILING_API_H -#define TILING_API_H -#include "../lib/matmul/matmul_tiling.h" -#include "../lib/matmul/bmm_tiling.h" -#include "../lib/activation/softmax_tiling.h" -#include "../lib/activation/logsoftmax_tiling.h" -#include "../lib/filter/dropout_tiling.h" -#include "../lib/sort/sort_tiling_intf.h" -#include "../lib/index/arithprogression_tiling.h" -#include "../lib/quantization/ascend_dequant_tiling.h" -#include "../lib/quantization/ascend_quant_tiling.h" -#include "../lib/quantization/ascend_antiquant_tiling.h" -#include "../lib/quantization/quantize_tiling.h" -#include "../lib/quantization/antiquantize_tiling.h" -#include "../lib/quantization/dequantize_tiling.h" -#include "../lib/reduce/sum_tiling.h" -#include "../lib/activation/silu_tiling.h" -#include "../lib/activation/swish_tiling.h" -#include "../lib/activation/gelu_tiling.h" -#include "../lib/pad/pad_tiling.h" -#include "../lib/normalization/rmsnorm_tiling.h" -#include 
"../lib/normalization/deepnorm_tiling.h" -#include "../lib/normalization/layernorm_tiling.h" -#include "../lib/normalization/normalize_tiling.h" -#include "../lib/normalization/groupnorm_tiling.h" -#include "../lib/normalization/batchnorm_tiling.h" -#include "../lib/normalization/layernorm_grad_tiling.h" -#include "../lib/normalization/layernorm_grad_beta_tiling.h" -#include "../lib/normalization/welfordfinalize_tiling.h" -#include "../lib/transpose/confusion_transpose_tiling.h" -#include "tiling/platform/platform_ascendc.h" -#include "../lib/sort/topk_tiling.h" -#include "../lib/math/tanh_tiling.h" -#include "../lib/activation/sigmoid_tiling.h" -#include "../lib/math/frac_tiling.h" -#include "../lib/math/acos_tiling.h" -#include "../lib/math/asin_tiling.h" -#include "../lib/math/acosh_tiling.h" -#include "../lib/math/asinh_tiling.h" -#include "../lib/math/sin_tiling.h" -#include "../lib/math/cos_tiling.h" -#include "../lib/math/hypot_tiling.h" -#include "../lib/math/atan_tiling.h" -#include "../lib/math/power_tiling.h" -#include "../lib/math/log_tiling.h" -#include "../lib/math/cosh_tiling.h" -#include "../lib/math/clamp_tiling.h" -#include "../lib/math/erf_tiling.h" -#include "../lib/math/erfc_tiling.h" -#include "../lib/math/round_tiling.h" -#include "../lib/math/sinh_tiling.h" -#include "../lib/activation/swiglu_tiling.h" -#include "../lib/math/tan_tiling.h" -#include "../lib/select/selectwithbytesmask_tiling.h" -#include "../lib/math/trunc_tiling.h" -#include "../lib/math/fmod_tiling.h" -#include "../lib/activation/geglu_tiling.h" -#include "../lib/math/lgamma_tiling.h" -#include "../lib/math/digamma_tiling.h" -#include "../lib/math/atanh_tiling.h" -#include "../lib/math/xor_tiling.h" -#include "../lib/math/sign_tiling.h" -#include "../lib/reduce/mean_tiling.h" -#include "../lib/math/exp_tiling.h" -#include "../lib/math/axpy_tiling.h" -#include "../lib/math/ceil_tiling.h" -#include "../lib/math/floor_tiling.h" -#include "../lib/activation/reglu_tiling.h" 
-#include "../lib/pad/broadcast_tiling.h" -#include "../lib/reduce/reduce_xor_sum_tiling.h" -#include "../lib/reduce/reduce_tiling.h" -#include "../lib/transdata/transdata_tiling.h" -#include "../lib/math/cumsum_tiling.h" -#include "../lib/hccl/hccl_tilingdata.h" -#include "../lib/hccl/hccl_tiling.h" -#endif // TILING_API_H -- Gitee