diff --git a/docs/README.md b/docs/README.md
index b688a7f7960ad7bec14e33c5eeb99b51da3e196b..b90de102a7bd4bd32cad0d34380f6b855122a0ba 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -334,10 +334,14 @@
         <td> 给定两个源操作数src0和src1，根据maskTensor相应位置的值(非bit位)选取元素，得到目的操作数dst。 </td>
     </tr>
     <tr>
-        <th rowspan="1"> 变形 </th>
+        <th rowspan="2"> 变形 </th>
         <td> ConfusionTranspose </td>
         <td> 对输入数据进行数据排布及Reshape操作。 </td>
     </tr>
+    <tr>
+        <td> TransData </td>
+        <td> 对输入数据排布格式转换为输出所需的数据排布格式 </td>
+    </tr>
     <tr>
         <th rowspan="1"> 索引操作 </th>
         <td> ArithProgression </td>
diff --git a/impl/CMakeLists.txt b/impl/CMakeLists.txt
index c29d95d33e3a958ae66a96848fc92ab7cc2df526..1ab2cc72b6dc11b43344c7529011edcaa3002e4e 100644
--- a/impl/CMakeLists.txt
+++ b/impl/CMakeLists.txt
@@ -92,6 +92,7 @@ add_library(tiling_api STATIC
     ${CMAKE_CURRENT_SOURCE_DIR}/math/axpy/axpy_tiling_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/math/ceil/ceil_tiling_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/math/floor/floor_tiling_impl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/transpose/transdata/transdata_tiling.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/math/fmod/fmod_tiling_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/math/trunc/trunc_tiling_impl.cpp
     $<$<BOOL:${BUILD_OPEN_PROJECT}>:$<TARGET_OBJECTS:${ASCENDC_API_ADV_OBJ}>>
diff --git a/impl/reduce/mean/mean_tiling.cpp b/impl/reduce/mean/mean_tiling.cpp
index c22cacec074b9811d9809985aa87a5ac86eaa054..6838fb099b180eeb5f515a295fef925b59c40232 100644
--- a/impl/reduce/mean/mean_tiling.cpp
+++ b/impl/reduce/mean/mean_tiling.cpp
@@ -10,16 +10,37 @@
 
 #include "lib/reduce/mean_tiling.h"
 #include "register/tilingdata_base.h"
+#include "impl/host_log.h"
+
 namespace AscendC {
 constexpr uint32_t MEAN_CALC_PROC = 1;
 const uint32_t MEAN_ONE_BLK_SIZE = 32;
 const uint32_t MEAN_ONE_REPEAT_BYTE_SIZE = 256;
 const uint32_t HALF_TYPE_SIZE = 2;
 const uint32_t FLOAT_TYPE_SIZE = 4;
+
+inline void CheckMeanHostParams(const uint32_t n, const uint32_t srcTypeSize,
+    const uint32_t accTypeSize, const bool isReuseSource)
+{
+    ASCENDC_HOST_ASSERT(
+        ((srcTypeSize == 2U && accTypeSize == 2U) ||
+         (srcTypeSize == 4U && accTypeSize == 4U) ||
+         (srcTypeSize == 2U && srcTypeSize == 4U)),
+         return,
+         "[Mean][GetMeanMaxMinTmpSize] The parameter (srcTypeSize, accTypeSize) is (%u, %u), expected is (2, 2)/(4, 4)/(2, 4).",
+         srcTypeSize, accTypeSize
+        );
+    ASCENDC_HOST_ASSERT(n > 0,
+         return, "[Mean][GetMeanMaxMinTmpSize] The parameter n is %u, expected is greater than 0!", n);
+    if (isReuseSource) {
+        TILING_LOG_WARNING("[Mean][GetMeanMaxMinTmpSize] The parameter isReuseSource is true, which is not effective!");
+    }
+}
+
 void GetMeanMaxMinTmpSize(const uint32_t n, const uint32_t srcTypeSize,
     const uint32_t accTypeSize, const bool isReuseSource, uint32_t& maxSize, uint32_t& minSize)
 {
-    (void)isReuseSource;
+    CheckMeanHostParams(n, srcTypeSize, accTypeSize, isReuseSource);
     if (srcTypeSize == 0) {
         return;
     }
diff --git a/impl/reduce/reduce_all/reduce_all_v220_impl.h b/impl/reduce/reduce_all/reduce_all_v220_impl.h
index 223639d81a0d045b1dc23027e43adaff296f44a3..aa5386bb32996f9e556a8303391f5ae897ed8105 100644
--- a/impl/reduce/reduce_all/reduce_all_v220_impl.h
+++ b/impl/reduce/reduce_all/reduce_all_v220_impl.h
@@ -64,9 +64,9 @@ __aicore__ inline void ReduceAllImpl(const LocalTensor<T>& dstTensor, const Loca
             BinaryReduceByFirstAxis<T, isReuseSource, Min<T, false>>(
                 dstTensor, srcTensor, tmpTensor, first, last, padLast);
         }
-        SetMaskNorm();
-        ResetMask();
     }
+    SetMaskNorm();
+    ResetMask();
 }
 } // namespace Internal
 } // namespace AscendC
diff --git a/impl/reduce/reduce_any/reduce_any_v220_impl.h b/impl/reduce/reduce_any/reduce_any_v220_impl.h
index b8d7f59dcc5501bd58eb617bf13897fac93125ed..5c3bd25208cde6324fd2b49892198051fca00d50 100644
--- a/impl/reduce/reduce_any/reduce_any_v220_impl.h
+++ b/impl/reduce/reduce_any/reduce_any_v220_impl.h
@@ -64,9 +64,9 @@ __aicore__ inline void ReduceAnyImpl(const LocalTensor<T>& dstTensor, const Loca
             BinaryReduceByFirstAxis<T, isReuseSource, Max<T, false>>(
                 dstTensor, srcTensor, tmpTensor, first, last, padLast);
         }
-        SetMaskNorm();
-        ResetMask();
     }
+    SetMaskNorm();
+    ResetMask();
 }
 } // namespace Internal
 } // namespace AscendC
diff --git a/impl/reduce/reduce_sum/reduce_sum_v220_impl.h b/impl/reduce/reduce_sum/reduce_sum_v220_impl.h
index 75890b67bd6874eeb1f04ae4e7d606382513d9fa..f29211873c3b887f94091092e91715171f6261c6 100644
--- a/impl/reduce/reduce_sum/reduce_sum_v220_impl.h
+++ b/impl/reduce/reduce_sum/reduce_sum_v220_impl.h
@@ -268,6 +268,8 @@ __aicore__ inline void ReduceSumImpl(const LocalTensor<T>& dstTensor, const Loca
         BinaryReduceByFirstAxis<T, isReuseSource, Add<T, false>>(
                 dstTensor, srcTensor, tmpBuf, first, last, padLast);
     }
+    SetMaskNorm();
+    ResetMask();
 }
 } // namespace Internal
 } // namespace AscendC
diff --git a/impl/reduce/reduce_tiling.cpp b/impl/reduce/reduce_tiling.cpp
index b427d5b41220b56ffda6cfda5cafb9d81b8e4344..71722928c3a4c898614e4c5f2ba0528dcbd113c0 100644
--- a/impl/reduce/reduce_tiling.cpp
+++ b/impl/reduce/reduce_tiling.cpp
@@ -10,6 +10,7 @@
 
 #include "lib/reduce/reduce_tiling.h"
 
+#include <string>
 #include <cstdint>
 #include <algorithm>
 
@@ -44,27 +45,31 @@ uint32_t FindK(uint32_t n) {
 }
 
 inline void CheckParams(std::vector<int64_t> shapeDims, bool isSrcInnerPad, ReducePattern pattern,
-    uint32_t first, uint32_t last)
+    uint32_t first, uint32_t last, std::string apiName, std::string funcName)
 {
-    ASCENDC_HOST_ASSERT(shapeDims.size() == ALLOWED_SHAPE_DIM, return, "srcShape dims must be 2.");
-    ASCENDC_HOST_ASSERT(isSrcInnerPad, return, "isSrcInnerPad must be true on this platform.");
+    ASCENDC_HOST_ASSERT(shapeDims.size() == ALLOWED_SHAPE_DIM, return,
+        "[%s][%s] srcShape dims must be 2.", apiName.c_str(), funcName.c_str());
+    ASCENDC_HOST_ASSERT(isSrcInnerPad, return,
+        "[%s][%s] isSrcInnerPad must be true on this platform.", apiName.c_str(), funcName.c_str());
     ASCENDC_HOST_ASSERT(pattern == ReducePattern::AR || pattern == ReducePattern::RA,
         return,
-        "Currently only support AR and RA pattern.");
-    ASCENDC_HOST_ASSERT(first > 0 && last > 0, return, "both first and last axis must be greater than 0.");
+        "[%s][%s] Currently only support AR and RA pattern.", apiName.c_str(), funcName.c_str());
+    ASCENDC_HOST_ASSERT(first > 0 && last > 0, return,
+        "[%s][%s] both first and last axis must be greater than 0.", apiName.c_str(), funcName.c_str());
 }
 } // namespace
 
 void GetReduceCommonMaxMinTmpSize(const ge::Shape &srcShape,
                                 const ge::DataType dataType,
                                 ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
-                                uint32_t &maxValue, uint32_t &minValue, bool isBinaryAdd) 
+                                uint32_t &maxValue, uint32_t &minValue, bool isBinaryAdd,
+                                std::string apiName, std::string funcName)
 {
     std::vector<int64_t> shapeDims = srcShape.GetDims();
 
     const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
     const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
-    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last);
+    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName);
     if (isReuseSource) {
         maxValue = minValue = 0U;
         return;
@@ -96,18 +101,77 @@ void GetReduceCommonMaxMinTmpSize(const ge::Shape &srcShape,
     maxValue = minValue = k * ((last * GetTypeSize(dataType) + ONE_BLK_SIZE - 1u) / ONE_BLK_SIZE * ONE_BLK_SIZE);
 }
 
+inline void GetReduceSumMeanCommonTmpSize(const ge::Shape &srcShape,
+                               ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
+                               uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName)
+{
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
+    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName);
+    if (isReuseSource) {
+        maxValue = minValue = 0U;
+        return;
+    }
+    uint32_t elePerBlk = ONE_BLK_SIZE / FLOAT_TYPE_SIZE;
+    if (pattern == ReducePattern::AR) {
+        uint32_t k = FindK(last);
+        if (k == last && first > 1U) {
+            k >>= 1U;
+        }
+        if (last <= B32_ELEM_NUM_PER_REPEAT) {
+            maxValue = minValue = 0U;
+        } else {
+            maxValue = minValue = (first * k) * FLOAT_TYPE_SIZE;
+        }
+    } else {
+        uint32_t k = FindK(first);
+        uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk;
+        if (first == k && first > 1U) {
+            k >>= 1U;
+        }
+        maxValue = minValue = (k * padLast) * FLOAT_TYPE_SIZE;
+    }
+    return;
+}
+
+inline void GetReduceAnyAllCommonTmpSize(const ge::Shape &srcShape,
+                                ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
+                                uint32_t &maxValue, uint32_t &minValue, std::string apiName, std::string funcName)
+{
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
+    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, apiName, funcName);
+    if (pattern == ReducePattern::AR) {
+        uint32_t elePerBlk = static_cast<uint32_t>(ONE_BLK_SIZE / sizeof(uint8_t));
+        uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk;
+        minValue = maxValue = static_cast<uint32_t>(padLast * sizeof(uint16_t)) + (first * elePerBlk);
+    } else {
+        if (isReuseSource) {
+            maxValue = minValue = 0U;
+            return;
+        }
+        uint32_t k = FindK(first);
+        if (k == first && first > 1U) {
+            k >>= 1U;
+        }
+        maxValue = minValue = k * ((last + ONE_BLK_SIZE - 1U) / ONE_BLK_SIZE * ONE_BLK_SIZE);
+    }
+    return;
+}
+
 void GetReduceProdMaxMinTmpSize(const ge::Shape &srcShape,
                                 const ge::DataType dataType,
                                 ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
                                 uint32_t &maxValue, uint32_t &minValue) 
 {
-    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "it only supports float type on this platform.");
-
+    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return,
+        "[ReduceProd][GetReduceProdMaxMinTmpSize] it only supports float type on this platform.");
     std::vector<int64_t> shapeDims = srcShape.GetDims();
-
     const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
     const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
-    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last);
+    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last, "ReduceProd", "GetReduceProdMaxMinTmpSize");
     if (isReuseSource) {
         minValue = pattern == ReducePattern::AR ? ONE_REPEAT_BYTE_SIZE : 0U;
         maxValue = minValue;
@@ -137,8 +201,9 @@ void GetReduceMaxMaxMinTmpSize(const ge::Shape &srcShape,
 {
     ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_FLOAT16,
         return,
-        "it only supports float and half type on this platform.");
-    GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false);
+        "[ReduceMax][GetReduceMaxMaxMinTmpSize] it only supports float and half type on this platform.");
+    GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false,
+        "ReduceMax", "GetReduceMaxMaxMinTmpSize");
 }
 
 void GetReduceMinMaxMinTmpSize(const ge::Shape &srcShape,
@@ -148,8 +213,9 @@ void GetReduceMinMaxMinTmpSize(const ge::Shape &srcShape,
 {
     ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_FLOAT16,
         return,
-        "it only supports float and half type on this platform.");
-    GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false);
+        "[ReduceMin][GetReduceMinMaxMinTmpSize] it only supports float and half type on this platform.");
+    GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false,
+        "ReduceMin", "GetReduceMinMaxMinTmpSize");
 }
 
 void GetReduceAnyMaxMinTmpSize(const ge::Shape &srcShape,
@@ -157,13 +223,15 @@ void GetReduceAnyMaxMinTmpSize(const ge::Shape &srcShape,
                                 ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
                                 uint32_t &maxValue, uint32_t &minValue) 
 {
+    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8,
+        return,
+        "[ReduceAny][GetReduceAnyMaxMinTmpSize] it only supports float and uint8_t type on this platform.");
     if (dataType == ge::DT_UINT8) {
-        GetReduceAllMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue);
+        GetReduceAnyAllCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+            "ReduceAny", "GetReduceAnyMaxMinTmpSize");
     } else {
-        ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8,
-            return,
-            "it only supports float and uint8_t type on this platform.");
-        GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false);
+        GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+            false, "ReduceAny", "GetReduceAnyMaxMinTmpSize");
     }
 }
 
@@ -173,31 +241,13 @@ void GetReduceAllMaxMinTmpSize(const ge::Shape &srcShape,
                                uint32_t &maxValue, uint32_t &minValue)
 {
     ASCENDC_HOST_ASSERT((dataType == ge::DT_FLOAT || dataType == ge::DT_UINT8), return,
-        "it only supports float uint8 type on this platform.");
-    std::vector<int64_t> shapeDims = srcShape.GetDims();
-    const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
-    const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
-    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last);
+        "[ReduceAll][GetReduceAllMaxMinTmpSize] it only supports float and uint8 type on this platform.");
     if (dataType == ge::DT_UINT8) {
-        if (pattern == ReducePattern::AR) {
-            uint32_t elePerBlk = static_cast<uint32_t>(ONE_BLK_SIZE / sizeof(uint8_t));
-            uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk;
-            minValue = maxValue = static_cast<uint32_t>(padLast * sizeof(uint16_t)) + (first * elePerBlk);
-        } else {
-            if (isReuseSource) {
-                maxValue = minValue = 0U;
-                return;
-            }
-            uint32_t k = FindK(first);
-            if (k == first && first > 1U) {
-                k >>= 1U;
-            }
-            maxValue = minValue = k * ((last + ONE_BLK_SIZE - 1U) / ONE_BLK_SIZE * ONE_BLK_SIZE);
-        }
-        return;
+        GetReduceAnyAllCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+            "ReduceAll", "GetReduceAllMaxMinTmpSize");
     } else {
-        GetReduceCommonMaxMinTmpSize(
-            srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue, false);
+        GetReduceCommonMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+            false, "ReduceAll", "GetReduceAllMaxMinTmpSize");
     }
 }
 
@@ -206,35 +256,10 @@ void GetReduceSumMaxMinTmpSize(const ge::Shape &srcShape,
                                ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
                                uint32_t &maxValue, uint32_t &minValue)
 {
-    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return, "it only supports float type on this platform.");
-    std::vector<int64_t> shapeDims = srcShape.GetDims();
-    const uint32_t first = static_cast<uint32_t>(shapeDims[0]);
-    const uint32_t last = static_cast<uint32_t>(shapeDims[1]);
-    CheckParams(shapeDims, isSrcInnerPad, pattern, first, last);
-    if (isReuseSource) {
-        maxValue = minValue = 0U;
-        return;
-    }
-    uint32_t elePerBlk = ONE_BLK_SIZE / FLOAT_TYPE_SIZE;
-    if (pattern == ReducePattern::AR) {
-        uint32_t k = FindK(last);
-        if (k == last && first > 1U) {
-            k >>= 1U;
-        }
-        if (last <= B32_ELEM_NUM_PER_REPEAT) {
-            maxValue = minValue = 0U;
-        } else {
-            maxValue = minValue = (first * k) * FLOAT_TYPE_SIZE;
-        }
-    } else {
-        uint32_t k = FindK(first);
-        uint32_t padLast = (last + elePerBlk - 1U) / elePerBlk * elePerBlk;
-        if (first == k && first > 1U) {
-            k >>= 1U;
-        }
-        maxValue = minValue = (k * padLast) * FLOAT_TYPE_SIZE;
-    }
-    return;
+    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return,
+        "[ReduceSum][GetReduceSumMaxMinTmpSize] it only supports float type on this platform.");
+    GetReduceSumMeanCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+        "ReduceSum", "GetReduceSumMaxMinTmpSize");
 }
 
 void GetReduceMeanMaxMinTmpSize(const ge::Shape &srcShape,
@@ -242,6 +267,9 @@ void GetReduceMeanMaxMinTmpSize(const ge::Shape &srcShape,
                                 ReducePattern pattern, bool isSrcInnerPad, bool isReuseSource,
                                 uint32_t &maxValue, uint32_t &minValue)
 {
-    GetReduceSumMaxMinTmpSize(srcShape, dataType, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue);
+    ASCENDC_HOST_ASSERT(dataType == ge::DT_FLOAT, return,
+        "[ReduceMean][GetReduceMeanMaxMinTmpSize] it only supports float type on this platform.");
+    GetReduceSumMeanCommonTmpSize(srcShape, pattern, isSrcInnerPad, isReuseSource, maxValue, minValue,
+        "ReduceMean", "GetReduceMeanMaxMinTmpSize");
 }
 }  // namespace AscendC
diff --git a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp
index d705e532a8b02a9c55e29143bb5a2a346e1abbdd..244456092f3ea848a0fcbd09f77ffaa73af861e3 100644
--- a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp
+++ b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp
@@ -32,7 +32,16 @@ void GetReduceXorSumMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t type
                                   uint32_t &maxValue, uint32_t &minValue)
 {
     const uint32_t inputSize = srcShape.GetShapeSize();
-    ASCENDC_HOST_ASSERT(inputSize > 0, return, "ReduceXorSum input Shape size must be greater than 0.");
+
+    ASCENDC_HOST_ASSERT(inputSize > 0U, return,
+        "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape size is %u, expected is greater than 0!",
+         inputSize);
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    ASCENDC_HOST_ASSERT(shapeDims.size() > 0UL, return,
+        "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %lli, expected is greater than 0!",
+         shapeDims.size());
+    ASCENDC_HOST_ASSERT(typeSize == 2U, return,
+        "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2!", typeSize);
 
     maxValue = GetTmpSize(inputSize, typeSize, isReuseSource);
     minValue = maxValue;
diff --git a/impl/reduce/sum/sum_tiling.cpp b/impl/reduce/sum/sum_tiling.cpp
index 95e087acdf5e2e067197672e247c91a44c231b60..817031cd1025239c84280ad2f680ab1e1de51afa 100644
--- a/impl/reduce/sum/sum_tiling.cpp
+++ b/impl/reduce/sum/sum_tiling.cpp
@@ -19,8 +19,14 @@ namespace AscendC {
 void GetSumMaxMinTmpSize(
     const uint32_t n, const uint32_t typeSize, const bool isReuseSource, uint32_t &maxSize, uint32_t &minSize)
 {
-    (void)isReuseSource;
-    ASCENDC_HOST_ASSERT(typeSize > 0, return, "typeSize must be greater than 0.");
+    if (isReuseSource) {
+        TILING_LOG_WARNING("[Sum][GetSumMaxMinTmpSize] The parameter isReuseSource is true, which is not effective!");
+    }
+    ASCENDC_HOST_ASSERT(typeSize > 0, return,
+        "[Sum][GetSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2 or 4!", typeSize);
+    ASCENDC_HOST_ASSERT(n > 0,
+         return, "[Sum][GetSumMaxMinTmpSize] The parameter n is %u, expected is greater than 0!", n);
+
     constexpr uint32_t sumOneBlkSize = 32;
     constexpr uint32_t sumOneRepeatByteSize = 256;
 
diff --git a/impl/transpose/transdata/transdata_impl.h b/impl/transpose/transdata/transdata_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8e5d41e273cfaaa40b5f88191409bd51e873c2f
--- /dev/null
+++ b/impl/transpose/transdata/transdata_impl.h
@@ -0,0 +1,526 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H
+#define IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H
+
+#include "kernel_tensor.h"
+#include "kernel_operator_intf.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "../../common/check.h"
+#include "../../api_check/kernel_api_check.h"
+
+namespace AscendC {
+namespace Internal {
+
+namespace {
+constexpr int32_t n0 = 16;
+constexpr int32_t c0 = 16;
+constexpr int32_t hw0 = 16;
+constexpr int32_t ncdhwDims = 5;
+constexpr int32_t fractalZ3DDims = 7;
+constexpr int32_t ndc1hwc0Dims = 6;
+}
+
+struct TransDataTmpParams {
+    int32_t n;
+    int32_t c;
+    int32_t d;
+    int32_t h;
+    int32_t w;
+    int32_t n1;
+    int32_t c1;
+    int32_t padHw;
+};
+
+constexpr int32_t DEFAULT_TRANSDATA_5HD_LIST = 16;
+
+template <typename T>
+__aicore__ inline void DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign(const LocalTensor<T>& dst, const LocalTensor<T>& src,
+    const TransDataTmpParams& params)
+{
+    // d, c1, h w n1 n0 c0 -> c1, d, hw1*hw0 n1 n0 c0
+    int32_t d = params.d;
+    int32_t h = params.h;
+    int32_t w = params.w;
+    int32_t n1 = params.n1;
+    int32_t c1 = params.c1;
+    int32_t padHw = params.padHw;
+
+    uint32_t dim0 = d;
+    uint32_t dim1 = c1;
+    uint32_t lastDim = h * w * n1 * n0 * c0;
+
+    // dim0, dim1, lastDim -> dim1, dim0, lastDim
+    int32_t n1n0c0DimElems = n1 * n0 * c0;
+    int32_t hwAlignElems = padHw * n1n0c0DimElems;
+    int32_t hwPadElems = (padHw - h * w) * n1n0c0DimElems;
+
+    uint16_t blockCount = dim1;
+    uint16_t blockLen = lastDim * sizeof(T) / ONE_BLK_SIZE;
+    uint16_t srcGap = 0;
+    uint16_t dstGap = ((dim0 - 1) * hwAlignElems + hwPadElems) * sizeof(T) / ONE_BLK_SIZE;
+
+    uint32_t dstSize = c1 * d * padHw * n1 * n0 * c0;
+    Duplicate<T>(dst, static_cast<T>(0), dstSize);
+    PipeBarrier<PIPE_V>();
+
+    DataCopyParams dataCopyParams = { blockCount, blockLen, srcGap, dstGap };
+    for (uint32_t d0 = 0; d0 < dim0; d0++) {
+        DataCopy(dst[d0 * hwAlignElems], src[d0 * dim1 * lastDim], dataCopyParams);
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+template <typename T>
+__aicore__ inline void C1Dhwn1n0c0ToC1C0Dhwn1n0(const LocalTensor<T>& dst, const LocalTensor<T>& src,
+    const TransDataTmpParams& params)
+{
+    // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0
+    int32_t d = params.d;
+    int32_t n1 = params.n1;
+    int32_t c1 = params.c1;
+    int32_t padHw = params.padHw;
+
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = d * padHw * n1;
+    if (transDataParams.repeatTimes == 1) {
+        transDataParams.srcRepStride = 0;
+        transDataParams.dstRepStride = 0;
+    } else {
+        transDataParams.srcRepStride = DEFAULT_TRANSDATA_5HD_LIST * c0 * sizeof(T) / ONE_BLK_SIZE;
+        transDataParams.dstRepStride = n0 * sizeof(T) / ONE_BLK_SIZE;
+    }
+
+    uint64_t srcOffsetArr[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t dstOffsetArr[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcAddr = (uint64_t)src.GetPhyAddr();
+    uint64_t dstAddr = (uint64_t)dst.GetPhyAddr();
+    for (uint32_t j = 0; j < c1; j++) {
+        uint32_t outOffset = j * d * padHw * n1 * n0 * c0;
+        for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+            srcOffsetArr[i] = (uint64_t)(srcAddr + (outOffset + i * n0) * sizeof(T));
+            dstOffsetArr[i] = (uint64_t)(dstAddr + (outOffset + i * d * padHw * n1 * n0) * sizeof(T));
+        }
+        TransDataTo5HD<T>(dstOffsetArr, srcOffsetArr, transDataParams);
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+template <typename T>
+__aicore__ inline void C1c0dhwN1n0ToNcdhw(const LocalTensor<T>& dst, const LocalTensor<T>& src,
+    const LocalTensor<T>& tmp, const TransDataTmpParams& params)
+{
+    // C1C0DHW N1N0 -> N CDHW
+    int32_t d = params.d;
+    int32_t n1 = params.n1;
+    int32_t padHw = params.padHw;
+    int32_t currN = params.n;
+    int32_t c = params.c;
+
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = c * d * padHw / n0;
+    if (transDataParams.repeatTimes == 1) {
+        transDataParams.srcRepStride = 0;
+        transDataParams.dstRepStride = 0;
+    } else {
+        transDataParams.srcRepStride = DEFAULT_TRANSDATA_5HD_LIST * n1 * n0 * sizeof(T) / ONE_BLK_SIZE;
+        transDataParams.dstRepStride = c0 * sizeof(T) / ONE_BLK_SIZE;
+    }
+
+    uint64_t srcOffsetArr[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t dstOffsetArr[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcAddr = (uint64_t)src.GetPhyAddr();
+    uint64_t dstAddr = (uint64_t)dst.GetPhyAddr();
+    uint64_t tmpAddr = (uint64_t)tmp.GetPhyAddr();
+    for (uint32_t j = 0; j < n1; j++) {
+        if (n0 - currN > 0) {
+            for (uint8_t i = 0; i < currN; i++) {
+                dstOffsetArr[i] = (uint64_t)(dstAddr + (j * d * c * padHw * n0 + i * c * d * padHw) * sizeof(T));
+            }
+            for (uint8_t i = currN; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                dstOffsetArr[i] = (uint64_t)(tmpAddr + i * ONE_BLK_SIZE * sizeof(T));
+            }
+        } else {
+            for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                dstOffsetArr[i] = (uint64_t)(dstAddr + (j * d * c * padHw * n0 + i * c * d * padHw) * sizeof(T));
+            }
+        }
+        for (uint8_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+            srcOffsetArr[i] = (uint64_t)(srcAddr + (j * n0 + i * n0 * n1) * sizeof(T));
+        }
+        TransDataTo5HD<T>(dstOffsetArr, srcOffsetArr, transDataParams);
+        currN -= n0;
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+template <typename T>
+__aicore__ inline void N1n0C1c0DHWToNCDHW(const LocalTensor<T>& dst, const LocalTensor<T>& src,
+    const TransDataTmpParams& params)
+{
+    // N1N0 C1C0 D H W -> N C D H W
+    int32_t n = params.n;
+    int32_t c = params.c;
+    int32_t d = params.d;
+    int32_t c1 = params.c1;
+    int32_t padHw = params.padHw;
+
+    uint16_t blockCount = n;
+    uint16_t blockLen = (c * (d * padHw)) * sizeof(T) / ONE_BLK_SIZE;
+    uint16_t srcGap = ((c1 * c0 - c) * (d * padHw)) * sizeof(T) /ONE_BLK_SIZE;
+    uint16_t dstGap = 0;
+    DataCopyParams dataCopyParams = { blockCount, blockLen, srcGap, dstGap };
+    DataCopy(dst, src, dataCopyParams);
+    PipeBarrier<PIPE_V>();
+}
+
+template <typename T>
+__aicore__ inline void TransDataFractalToNcdhw(const LocalTensor<T>& dst, const LocalTensor<T>& src,
+    const LocalTensor<uint8_t>& tmpBuffer, const TransDataTmpParams& params)
+{
+    int32_t d = params.d;
+    int32_t n1 = params.n1;
+    int32_t c1 = params.c1;
+    int32_t padHw = params.padHw;
+    int32_t n = params.n;
+    int32_t c = params.c;
+
+    LocalTensor<half> tmp = tmpBuffer.template ReinterpretCast<half>();
+    LocalTensor<half> srcTmp = src.template ReinterpretCast<half>();
+    if (c == c1 * c0 && n == n1 * n0) {
+        LocalTensor<half> dstTmp = dst.template ReinterpretCast<half>();
+        // D C1 HWN1N0C0 -> C1 D HWN1N0C0 (H*W 32B ALIGN -> HW1*HW0)
+        DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign<half>(dstTmp, srcTmp, params);
+        // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0
+        C1Dhwn1n0c0ToC1C0Dhwn1n0<half>(tmp, dstTmp, params);
+        // C1C0DHW N1N0 -> N CDHW
+        C1c0dhwN1n0ToNcdhw<half>(dstTmp, tmp, tmp, params);
+    } else {
+        LocalTensor<half> transDataTmp = tmp[n1 * n0 * c1 * c0 * d * padHw];
+        LocalTensor<half> dstTmp = dst.template ReinterpretCast<half>();
+        // D C1 HWN1N0C0 -> C1 D HWN1N0C0 (H*W 32B ALIGN -> HW1*HW0)
+        DC1Hwn1n0c0ToC1DHwn1n0c0HWAlign<half>(tmp, srcTmp, params);
+        // C1 DHWN1N0 C0 -> C1 C0 DHWN1N0
+        C1Dhwn1n0c0ToC1C0Dhwn1n0<half>(transDataTmp, tmp, params);
+        // C1C0DHW N1N0 -> N CDHW
+        C1c0dhwN1n0ToNcdhw<half>(dstTmp, transDataTmp, tmp, params);
+    }
+}
+
+// Transdata NCDHW -> FRACTAL_Z_3D
+template <typename T>
+__aicore__ inline void TransDataImplNcdhwToFractal(const LocalTensor<T>& dst, const LocalTensor<T>& src, const LocalTensor<uint8_t>& tmpBuffer,
+    const TransDataTmpParams& param)
+{
+    constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T);
+    const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w;
+    constexpr int32_t c0 = 16;
+    constexpr int32_t n0 = 16;
+    const int32_t c1 = DivCeil(c, c0);
+    const int32_t n1 = DivCeil(n, n0);
+    int32_t padHw = AlignUp(h * w, elePerBlk);
+    int32_t currAxis = c * d * padHw;
+    Duplicate<T>(tmpBuffer.ReinterpretCast<T>(), static_cast<T>(0), currAxis);
+    PipeBarrier<PIPE_V>();
+    auto tmpDstTensor = tmpBuffer[currAxis * sizeof(T)].ReinterpretCast<T>();
+    uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+
+    uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr();
+    uint64_t srcTensorAddr = (uint64_t)src.GetPhyAddr();
+    uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr();
+    uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr();
+    // step1, NCDHW -> CDHW, N1, N0
+    // Do n1 times Transpose to split axis N, and fill with 0 on padding data.
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = currAxis / elePerBlk;
+    // if repeat = 1, start offset is auto incremental by stride.
+    transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : n1 * n0;
+    transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1;
+
+    bool isPadded = padHw != h * w;
+    // dst tensor is unable to fill all padded data.
+    auto tmpIfPadAddr = isPadded ? tmpDstTensorAddr : dstTensorAddr;
+    for (int j = 0; j < n1; j++) {
+        uint64_t currDstAddr = tmpIfPadAddr + j * n0 * sizeof(T);
+        uint64_t currSrcAddr = srcTensorAddr + j * currAxis * n0 * sizeof(T);
+        // handle the last axis if N is not even splited by n0.
+        int remain = j == n1 - 1 ? n - j * n0 : n0;
+        for (int32_t i = 0; i < n0; i++) {
+            dstLocalList[i] = currDstAddr +  (i * n1 * n0) * sizeof(T);
+        }
+        for (int32_t i = 0; i < remain; i++) {
+            srcLocalList[i] = currSrcAddr + i * currAxis * sizeof(T);
+        }
+        for (int32_t i = remain; i < n0; i++) {
+            srcLocalList[i] = tmpBufferAddr;
+        }
+        TransDataTo5HD<half>(dstLocalList, srcLocalList, transDataParams);
+    }
+    PipeBarrier<PIPE_V>();
+    // step1.5 collapse padded H,W axis for CDHW, N1N0
+    DataCopyParams copyParams;
+    if (isPadded) {
+        currAxis = h * w * n1 * n0;
+        copyParams.blockCount = c * d;
+        copyParams.blockLen = currAxis / elePerBlk;
+        // Merge axis by skiping padded H,W.
+        copyParams.srcStride = (padHw - h * w) * n1 * n0 / elePerBlk;
+        copyParams.dstStride = 0;
+        DataCopy(dst, tmpDstTensor, copyParams);
+    }
+    PipeBarrier<PIPE_V>();
+
+    // step2, CDHWN1N0 -> C1DHW, N1N0, C0
+    currAxis = d * h * w * n1 * n0;
+    transDataParams.repeatTimes = currAxis / elePerBlk;
+    transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : c0;
+    transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1;
+    for (int32_t j = 0; j < c1; j++) {
+        uint64_t currDstAddr = tmpDstTensorAddr + j * currAxis * c0 * sizeof(T);
+        uint64_t currSrcAddr = dstTensorAddr + j * currAxis * c0 * sizeof(T);
+        int remain = j == c1 - 1 ? c - j * c0 : c0;
+        for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+            dstLocalList[i] = currDstAddr + i * c0 * sizeof(T);
+        }
+        for (int32_t i = 0; i < remain; i++) {
+            srcLocalList[i] = currSrcAddr + i * currAxis * sizeof(T);
+        }
+        for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+            srcLocalList[i] = tmpBufferAddr;
+        }
+        TransDataTo5HD<half>(dstLocalList, srcLocalList, transDataParams);
+    }
+    PipeBarrier<PIPE_V>();
+    // steo3 C1DHW, N1N0, C0 -> DC1HW, N1N0, C0
+    currAxis = c0 * h * w * n1 * n0;
+    copyParams.blockCount = d;
+    copyParams.blockLen = currAxis / elePerBlk;
+    // Merge axis by skiping padding padHW -> h, w
+    copyParams.srcStride = 0;
+    copyParams.dstStride = (c1 - 1) * currAxis / elePerBlk;
+    for (int32_t i = 0; i < c1; i++) {
+        DataCopy(dst[i * currAxis], tmpDstTensor[i * d * currAxis], copyParams);
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+// Transdata NCDHW -> NDC1HWC0
+template <typename T>
+__aicore__ inline void TransDataImplNcdhwTo6Hd(const LocalTensor<T>& dst, const LocalTensor<T>& src, const LocalTensor<uint8_t>& tmpBuffer,
+    const TransDataTmpParams& param)
+{
+    constexpr int32_t c0 = 16;
+    constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T);
+    const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w;
+    const int32_t c1 = DivCeil(c, c0);
+    const int32_t padHw = AlignUp(h * w, elePerBlk);
+    int32_t currAxis = d * padHw;
+
+    int32_t axisHwd = h * w * d;
+    int32_t axisHwc0 = h * w * c0;
+    int32_t axisC1hwc0 = axisHwc0 * c1;
+    int32_t axisC1hwdc0 = axisC1hwc0 * d;
+    int32_t axisPadHwd = padHw * d;
+    int32_t axisPadHwc0 = padHw * c0;
+    int32_t axisPadHwdc0 = padHw * c0 * d;
+    Duplicate<T>(tmpBuffer.ReinterpretCast<T>(), static_cast<T>(0), axisPadHwd);
+    PipeBarrier<PIPE_V>();
+
+    // reserve for padded 0 on additional axis c.
+    auto tmpDstTensor = tmpBuffer[axisPadHwd * sizeof(T)].ReinterpretCast<T>();
+
+    uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr();
+    uint64_t srcTensorAddr = (uint64_t)src.GetPhyAddr();
+    uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr();
+    uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr();
+    uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = axisPadHwd / elePerBlk;
+    transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : c0;
+    transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : 1;
+
+    DataCopyParams copyParams;
+    copyParams.blockCount = d;
+    copyParams.blockLen = axisHwc0 / elePerBlk;
+    copyParams.srcStride = (padHw - h * w) * c0 / elePerBlk;
+    copyParams.dstStride = (c1 - 1) * axisHwc0 / elePerBlk;
+    // iterates N times CDHW -> C1DHWC0
+    for (int32_t k = 0; k < n; k++) {
+        int32_t currSrcStart = k * axisPadHwd * c;
+        int32_t currDstStart = k * axisC1hwdc0;
+        // it's impossible to have calculation size exceed max 255 repeats due to the total memory size.
+        // step1, CDHW -> C1DHWC0 with pad data
+        for (int32_t j = 0; j < c1; j++) {
+            uint64_t currDstAddr = tmpDstTensorAddr + j * axisPadHwdc0 * sizeof(T);
+            uint64_t currSrcAddr = srcTensorAddr + (currSrcStart + j * axisPadHwdc0) * sizeof(T);
+            int remain = j == c1 - 1 ? c - j * c0 : c0;
+            for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                dstLocalList[i] = currDstAddr + i * c0 * sizeof(T);
+            }
+            for (int32_t i = 0; i < remain; i++) {
+                srcLocalList[i] = currSrcAddr + i * axisPadHwd * sizeof(T);
+            }
+            for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                srcLocalList[i] = tmpBufferAddr;
+            }
+            TransDataTo5HD<half>(dstLocalList, srcLocalList, transDataParams);
+        }
+        PipeBarrier<PIPE_V>();
+        // step2, C1DHWC0 -> DC1HWC0
+        for (int32_t i = 0; i < c1; i++) {
+            DataCopy(dst[currDstStart + i * axisHwc0], tmpDstTensor[i * axisPadHwdc0], copyParams);
+        }
+        PipeBarrier<PIPE_V>();
+    }
+}
+
+// Transdata NDC1HWC0 -> NCDHW
+template <typename T>
+__aicore__ inline void TransDataImpl6HdToNcdhw(const LocalTensor<T>& dst, const LocalTensor<T>& src, const LocalTensor<uint8_t>& tmpBuffer,
+    const TransDataTmpParams& param)
+{
+    const int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w;
+    constexpr int32_t c0 = 16;
+    constexpr int32_t elePerBlk = ONE_BLK_SIZE / sizeof(T);
+    const int32_t c1 = DivCeil(c, c0);
+    const int32_t padHw = AlignUp(h * w, elePerBlk);
+    constexpr int32_t reservedDummy = 512;
+    auto tmpDstTensor = tmpBuffer[reservedDummy].template ReinterpretCast<T>();
+    uint64_t dstLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+    uint64_t srcLocalList[DEFAULT_TRANSDATA_5HD_LIST];
+
+    uint64_t dstTensorAddr = (uint64_t)dst.GetPhyAddr();
+    uint64_t tmpDstTensorAddr = (uint64_t)tmpDstTensor.GetPhyAddr();
+    uint64_t tmpBufferAddr = (uint64_t)tmpBuffer.GetPhyAddr();
+
+    int32_t axisHwd = h * w * d;
+    int32_t axisHwc0 = h * w * c0;
+    int32_t axisC1hwc0 = axisHwc0 * c1;
+    int32_t axisC1hwdc0 = axisC1hwc0 * d;
+    int32_t axisPadHwd = padHw * d;
+    int32_t axisPadHwc0 = padHw * c0;
+    int32_t axisPadHwdc0 = padHw * c0 * d;
+    TransDataTo5HDParams transDataParams;
+    transDataParams.dstHighHalf = false;
+    transDataParams.srcHighHalf = false;
+    transDataParams.repeatTimes = padHw * d / elePerBlk;
+    transDataParams.srcRepStride = transDataParams.repeatTimes == 1 ? 0 : c0;
+    transDataParams.dstRepStride = transDataParams.repeatTimes == 1 ? 0 : 1;
+
+    DataCopyParams copyParams;
+    copyParams.blockCount = c1;
+    copyParams.blockLen = h * w * c0 / elePerBlk;
+    copyParams.srcStride = 0;
+    copyParams.dstStride = (d * padHw - h * w) * c0 / elePerBlk;
+    // iterates N times C1DHWC0 -> CDHW
+    for (int32_t k = 0; k < n; k++) {
+        // step1 DC1HWC0 -> C1DHWC0
+        int32_t currSrcStart = k * axisC1hwdc0;
+        int32_t currDstStart = k * axisPadHwd * c;
+        for (int32_t i = 0; i < d; i++) {
+            DataCopy(tmpDstTensor[i * axisPadHwc0], src[currSrcStart + i * axisC1hwc0], copyParams);
+        }
+        PipeBarrier<PIPE_V>();
+        // step2, C1DHWC0 -> C1C0DHW
+        // it's impossible to have calculation size exceed max 255 repeats due to the total memory size.
+        for (int32_t j = 0; j < c1; j++) {
+            int32_t remain = j == c1 - 1 ? c - j * c0 : c0;
+            uint64_t currDstAddr = dstTensorAddr + (currDstStart + j * axisPadHwdc0) * sizeof(T);
+            uint64_t currSrcAddr = tmpDstTensorAddr + j * axisPadHwdc0 * sizeof(T);
+            for (int32_t i = 0; i < remain; i++) {
+                dstLocalList[i] = currDstAddr + i * axisPadHwd * sizeof(T);
+            }
+            for (int32_t i = remain; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                // temp for reserve redundant data.
+                dstLocalList[i] = tmpBufferAddr + i * ONE_BLK_SIZE;
+            }
+            for (int32_t i = 0; i < DEFAULT_TRANSDATA_5HD_LIST; i++) {
+                srcLocalList[i] = currSrcAddr + i * c0 * sizeof(T);
+            }
+            TransDataTo5HD<half>(dstLocalList, srcLocalList, transDataParams);
+        }
+        PipeBarrier<PIPE_V>();
+    }
+}
+
+template <typename T, typename U, typename S>
+__aicore__ inline void TransDataCheck(const TransDataParams<U, S>& params)
+{
+    static_assert(SupportType<T, half, bfloat16_t, uint16_t, int16_t>(),
+        "Currents only supports half/bfloat16_t/uint16_t/int16_t types.");
+    static_assert(is_layout_v<U>, "srcLayout must be a layout");
+    static_assert(is_layout_v<S>, "dstLayout must be a layout");
+    using SrcShapeTuple = Std::remove_cvref_t<decltype(params.srcLayout.GetShape())>;
+    using DstShapeTuple = Std::remove_cvref_t<decltype(params.dstLayout.GetShape())>;
+    static_assert(Std::is_tuple_v<SrcShapeTuple>, "srcLayout.GetShape() must be a shape.");
+    static_assert(Std::is_tuple_v<DstShapeTuple>, "dstLayout.GetShape() must be a shape.");
+}
+
+template <const TransDataConfig& config, typename T, typename U, typename S>
+__aicore__ inline void TransDataImpl(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
+    const LocalTensor<uint8_t>& sharedTmpBuffer, const TransDataParams<U, S>& params)
+{
+    TransDataCheck<T, U, S>(params);
+    auto srcShape = params.srcLayout.GetShape();
+    auto dstShape = params.dstLayout.GetShape();
+    constexpr uint32_t srcShapeSize = static_cast<uint32_t>(Std::tuple_size<decltype(srcShape)>::value);
+    constexpr uint32_t dstShapeSize = static_cast<uint32_t>(Std::tuple_size<decltype(dstShape)>::value);
+    CHECK_FUNC_HIGHLEVEL_API(TransData, (config, T, U, S), (dstTensor, srcTensor, sharedTmpBuffer, params));
+    using srcType = decltype(srcShape);
+    using dstType = decltype(dstShape);
+    using ncdhwType = Std::conditional_t<config.srcFormat == DataFormat::NCDHW, srcType, dstType>;
+    ncdhwType ncdhwShape;
+    if constexpr (config.srcFormat == DataFormat::NCDHW) {
+        ncdhwShape = params.srcLayout.GetShape();
+    } else {
+        ncdhwShape = params.dstLayout.GetShape();
+    }
+    int32_t n = Std::get<0>(ncdhwShape);
+    int32_t c = Std::get<1>(ncdhwShape);
+    int32_t d = Std::get<2>(ncdhwShape);
+    int32_t h = Std::get<3>(ncdhwShape);
+    int32_t w = Std::get<4>(ncdhwShape);
+    int32_t n1 = (n + n0 - 1) / n0;
+    int32_t c1 = (c + c0 - 1) / c0;
+    int32_t hw1 = (h * w + hw0 - 1) / hw0;
+    int32_t padHw = hw1 * hw0;
+    TransDataTmpParams tmpParams = { n, c, d, h, w, n1, c1, padHw };
+    if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) {
+        static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!");
+        static_assert(dstShapeSize == fractalZ3DDims, "dstLayout's shape dims must be equal to 7!");
+        TransDataImplNcdhwToFractal(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } else if constexpr (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) {
+        static_assert(srcShapeSize == fractalZ3DDims, "srcLayout's shape dims must be equal to 7!");
+        static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!");
+        TransDataFractalToNcdhw<T>(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } else if constexpr (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) {
+        static_assert(srcShapeSize == ncdhwDims, "srcLayout's shape dims must be equal to 5!");
+        static_assert(dstShapeSize == ndc1hwc0Dims, "dstLayout's shape dims must be equal to 6!");
+        TransDataImplNcdhwTo6Hd(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    } else if constexpr (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) {
+        static_assert(srcShapeSize == ndc1hwc0Dims, "srcLayout's shape dims must be equal to 6!");
+        static_assert(dstShapeSize == ncdhwDims, "dstLayout's shape dims must be equal to 5!");
+        TransDataImpl6HdToNcdhw(dstTensor, srcTensor, sharedTmpBuffer, tmpParams);
+    }
+}
+
+} // namespace Internal
+} // namespace AscendC
+#endif // IMPL_TRANSPOSE_TRANSDATA_TRANSDATA_IMPL_H
\ No newline at end of file
diff --git a/impl/transpose/transdata/transdata_tiling.cpp b/impl/transpose/transdata/transdata_tiling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbfc281e48b3c8e9770a2f55bb2bea36d545ae38
--- /dev/null
+++ b/impl/transpose/transdata/transdata_tiling.cpp
@@ -0,0 +1,197 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "lib/transpose/transdata_tiling.h"
+
+#include <cstdint>
+#include <algorithm>
+
+#include "graph/tensor.h"
+#include "impl/host_log.h"
+#include "tiling/platform/platform_ascendc.h"
+namespace AscendC {
+namespace {
+constexpr int32_t PAD_ELE_FOR_HALF = 16;
+constexpr int32_t N_INDEX = 0;
+constexpr int32_t C_INDEX = 1;
+constexpr int32_t D_INDEX = 2;
+constexpr int32_t H_INDEX = 3;
+constexpr int32_t W_INDEX = 4;
+
+struct TmpTransDataParams {
+    int32_t n = 0;
+    int32_t c = 0;
+    int32_t d = 0;
+    int32_t h = 0;
+    int32_t w = 0;
+};
+
+int32_t DivCeil(int32_t a, int32_t b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+int32_t AlignUp(int32_t a, int32_t b)
+{
+    return DivCeil(a, b) * b;
+}
+
+bool GenerateFractalZ3DToNcdhwShapeInfo(const std::vector<int64_t>& dstDims, const std::vector<int64_t>& srcDims,
+   TmpTransDataParams &param, const int32_t c0, const int32_t n0)
+{
+    ASCENDC_HOST_ASSERT(srcDims.size() == 7 && dstDims.size() == 5, return false,
+        "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat.");
+    param.n = dstDims[N_INDEX];
+    param.c = dstDims[C_INDEX];
+    param.d = dstDims[D_INDEX];
+    param.h = dstDims[H_INDEX];
+    param.w = dstDims[W_INDEX];
+    // validate d, h, w
+    ASCENDC_HOST_ASSERT(param.d == srcDims[0] && param.h == srcDims[2] && param.w == srcDims[3], return false,
+        "[TransData][GetTransDataMaxMinTmpSize] shapeInfo d,h,w is not matched.");
+    ASCENDC_HOST_ASSERT(srcDims[6] == c0 && srcDims[1] * c0 == AlignUp(param.c, c0), return false,
+        "[TransData][GetTransDataMaxMinTmpSize] src c0, c1 is not able to be converted to c.");
+    ASCENDC_HOST_ASSERT(srcDims[5] == n0 && srcDims[4] * n0 == AlignUp(param.n, n0), return false,
+        "[TransData][GetTransDataMaxMinTmpSize] src n0, n1 is not able to be converted to n.");
+    return true;
+}
+
+bool GenerateNcdhwToFractalZ3DShapeInfo(const std::vector<int64_t>& dstDims, const std::vector<int64_t>& srcDims,
+   TmpTransDataParams &param, const int32_t c0, const int32_t n0)
+{
+    ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 7, return false,
+        "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat.");
+    param.n = srcDims[N_INDEX];
+    param.c = srcDims[C_INDEX];
+    param.d = srcDims[D_INDEX];
+    param.h = srcDims[H_INDEX];
+    param.w = srcDims[W_INDEX];
+    // validate d, h, w
+    ASCENDC_HOST_ASSERT(param.d == dstDims[0] && param.h == dstDims[2] && param.w == dstDims[3], return false,
+        "[TransData][GetTransDataMaxMinTmpSize] shapeInfo d,h,w is not matched.");
+    ASCENDC_HOST_ASSERT(dstDims[6] == c0 && dstDims[1] * c0 == AlignUp(param.c, c0), return false,
+        "[TransData][GetTransDataMaxMinTmpSize] dst c0, c1 is not able to be converted to c.");
+    ASCENDC_HOST_ASSERT(dstDims[5] == n0 && dstDims[4] * n0 == AlignUp(param.n, n0), return false,
+        "[TransData][GetTransDataMaxMinTmpSize] dst n0, n1 is not able to be converted to n.");
+    return true;
+}
+
+bool GenerateShapeInfo(const TransDataConfig &config, const ge::Shape &srcShape, const ge::Shape &dstShape, ge::DataType type,
+    TmpTransDataParams &param)
+{
+    (void)type;
+    constexpr int32_t c0 = 16, n0 = 16;
+    std::vector<int64_t> srcDims = srcShape.GetDims();
+    std::vector<int64_t> dstDims = dstShape.GetDims();
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) {
+        ASCENDC_HOST_ASSERT(srcDims.size() == 5 && dstDims.size() == 6, return false,
+            "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat.");
+        param.n = srcDims[N_INDEX];
+        param.c = srcDims[C_INDEX];
+        param.d = srcDims[D_INDEX];
+        param.h = srcDims[H_INDEX];
+        param.w = srcDims[W_INDEX];
+        // validate n, d, h, w
+        ASCENDC_HOST_ASSERT(param.n == dstDims[0] && param.d == dstDims[1] && param.h == dstDims[3] && param.w == dstDims[4],
+            return false, "[TransData][GetTransDataMaxMinTmpSize] shapeInfo n,d,h,w is not matched.");
+        ASCENDC_HOST_ASSERT(dstDims[5] == c0 && dstDims[2] * c0 == AlignUp(param.c, c0), return false,
+            "[TransData][GetTransDataMaxMinTmpSize] dst c0, c1 is not able to be converted to c.");
+        return true;
+    }
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) {
+        return GenerateNcdhwToFractalZ3DShapeInfo(dstDims, srcDims, param, c0, n0);
+    }
+    if (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) {
+        return GenerateFractalZ3DToNcdhwShapeInfo(dstDims, srcDims, param, c0, n0);
+    }
+    if (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW) {
+        ASCENDC_HOST_ASSERT(srcDims.size() == 6 && dstDims.size() == 5, return false,
+            "[TransData][GetTransDataMaxMinTmpSize] input shapes are not matched with DataFormat.");
+        param.n = dstDims[N_INDEX];
+        param.c = dstDims[C_INDEX];
+        param.d = dstDims[D_INDEX];
+        param.h = dstDims[H_INDEX];
+        param.w = dstDims[W_INDEX];
+        // validate n, d, h, w
+        ASCENDC_HOST_ASSERT(param.n == srcDims[0] && param.d == srcDims[1] && param.h == srcDims[3] && param.w == srcDims[4],
+            return false, "[TransData][GetTransDataMaxMinTmpSize] shapeInfo n,d,h,w is not matched.");
+        ASCENDC_HOST_ASSERT(srcDims[5] == c0 && srcDims[2] * c0 == AlignUp(param.c, c0), return false,
+            "[TransData][GetTransDataMaxMinTmpSize]  src c0, c1 is not able to be converted to c.");
+        return true;
+    }
+    return false;
+}
+
+int32_t GetTmpBufferSize(const TransDataConfig &config, const TmpTransDataParams &param)
+{
+    constexpr int32_t dataSize = 2;
+    int32_t n = param.n, c = param.c, d = param.d, h = param.h, w = param.w;
+    constexpr int32_t c0 = 16, n0 = 16;
+    int32_t c1 = DivCeil(c, c0), n1 = DivCeil(n, n0);
+    int32_t padHw = AlignUp(h * w, 16);
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0)
+    {
+        return d * padHw * dataSize + d * c1 * c0 * padHw * dataSize;
+    }
+    if (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW)
+    {
+        constexpr int32_t redundantDataBuffer = 512;
+        return d * c1 * c0 * padHw * dataSize + redundantDataBuffer;
+    }
+    if (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D)
+    {
+        return c * d * padHw * dataSize + n1 * n0 * d * c1 * c0 * padHw * dataSize;
+    }
+    if (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW)
+    {
+        constexpr int32_t doubleTmpSize = 2;
+        if (n == n0 * n1 && c == c0 * c1) {
+            return n1 * n0 * c1 * c0 * d * padHw * dataSize;
+        }
+        return n1 * n0 * c1 * c0 * d * padHw * dataSize * doubleTmpSize;
+    }
+    return 0;
+}
+} // namespace
+
+bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform,
+                                const ge::Shape &srcShape,
+                                const ge::Shape &dstShape,
+                                const ge::DataType dataType,
+                                const TransDataConfig &config,
+                                uint32_t &maxValue, uint32_t &minValue)
+{
+    ASCENDC_HOST_ASSERT(dataType == ge::DataType::DT_FLOAT16 || dataType == ge::DataType::DT_BF16 ||
+        dataType == ge::DataType::DT_UINT16 || dataType == ge::DataType::DT_INT16, return false,
+        "[TransData][GetTransDataMaxMinTmpSize] it only supports DT_FLOAT16/DT_BF16/DT_UINT16/DT_INT16 data type");
+
+    ASCENDC_HOST_ASSERT(((config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::FRACTAL_Z_3D) ||
+        (config.srcFormat == DataFormat::FRACTAL_Z_3D && config.dstFormat == DataFormat::NCDHW) ||
+        (config.srcFormat == DataFormat::NCDHW && config.dstFormat == DataFormat::NDC1HWC0) ||
+        (config.srcFormat == DataFormat::NDC1HWC0 && config.dstFormat == DataFormat::NCDHW)), return false,
+         "[TransData][GetTransDataMaxMinTmpSize] The parameter config srcFormat/dstFormat only supports "
+         "(NCDHW, FRACTAL_Z_3D)/(FRACTAL_Z_3D, NCDHW)/(NCDHW, NDC1HWC0)/(NDC1HWC0, NCDHW)!");
+
+    platform_ascendc::SocVersion socVersion = platform.GetSocVersion();
+    ASCENDC_HOST_ASSERT(socVersion == platform_ascendc::SocVersion::ASCEND910B, return false,
+                        "[TransData][GetTransDataMaxMinTmpSize] Unsupported SocVersion for TransData API.");
+
+    TmpTransDataParams tmpParam;
+
+    ASCENDC_HOST_ASSERT(GenerateShapeInfo(config, srcShape, dstShape, dataType, tmpParam), return false,
+        "[TransData][GetTransDataMaxMinTmpSize] failed to validate inputs informations.");
+    maxValue = GetTmpBufferSize(config, tmpParam);
+    minValue = maxValue;
+    return true;
+}
+} // namespace AscendC
diff --git a/lib/reduce/reduce_tiling.h b/lib/reduce/reduce_tiling.h
index a43f6f8330bc2f7d1e5199e28fca4ce6eed5ca16..c27591e4a087be179fc7a2a5b3d2fc409b582493 100644
--- a/lib/reduce/reduce_tiling.h
+++ b/lib/reduce/reduce_tiling.h
@@ -15,6 +15,7 @@
 #ifndef LIB_REDUCE_REDUCE_TILING_H
 #define LIB_REDUCE_REDUCE_TILING_H
 #include <cstdint>
+#include "graph/types.h"
 #include "graph/tensor.h"
 
 namespace AscendC {
diff --git a/lib/sort/topk_tiling.h b/lib/sort/topk_tiling.h
index 07e36dbbfde70517691d0f9b7a7b41a3f483d62e..e08d1af42d502deb8543af313f56bdaafe053328 100644
--- a/lib/sort/topk_tiling.h
+++ b/lib/sort/topk_tiling.h
@@ -11,6 +11,7 @@
 #define LIB_SORT_TOPK_TILING_H
 #include "topk_tilingdata.h"
 #include "tiling/platform/platform_ascendc.h"
+#include "graph/types.h"
 
 namespace AscendC {
 
diff --git a/lib/transpose/transdata.h b/lib/transpose/transdata.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0075cf5276392e5ca5399c7e122203059d393ca
--- /dev/null
+++ b/lib/transpose/transdata.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LIB_TRANSPOSE_TRANSDATA_H
+#define LIB_TRANSPOSE_TRANSDATA_H
+#if __CCE_AICORE__ == 220
+#include "transdata_common.h"
+#include "kernel_tensor.h"
+#include "kernel_operator_intf.h"
+#include "kernel_pop_stack_buffer.h"
+#include "../../impl/transpose/transdata/transdata_impl.h"
+#if ASCENDC_CPU_DEBUG
+#include "kernel_log.h"
+#include <type_traits>
+#endif
+
+namespace AscendC {
+
+template <const TransDataConfig& config, typename T, typename U, typename S>
+__aicore__ inline void TransData(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
+    const LocalTensor<uint8_t>& sharedTmpBuffer, const TransDataParams<U, S>& params)
+{
+    Internal::TransDataImpl<config, T, U, S>(dstTensor, srcTensor, sharedTmpBuffer, params);
+}
+
+template <const TransDataConfig& config, typename T, typename U, typename S>
+__aicore__ inline void TransData(const LocalTensor<T>& dstTensor, const LocalTensor<T>& srcTensor,
+    const TransDataParams<U, S>& params)
+{
+    // Only for AI Vector Core.
+    if ASCEND_IS_AIC {
+        return;
+    }
+    LocalTensor<uint8_t> tmp;
+    const bool ret = PopStackBuffer<uint8_t, TPosition::LCM>(tmp);
+    ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });
+
+    TransData<config, T, U, S>(dstTensor, srcTensor, tmp, params);
+}
+} // namespace AscendC
+#endif
+#endif // LIB_TRANSPOSE_TRANSDATA_H
\ No newline at end of file
diff --git a/lib/transpose/transdata_common.h b/lib/transpose/transdata_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..0421a3ca724cf4f6feaae1ab14e7f8fe3e89692d
--- /dev/null
+++ b/lib/transpose/transdata_common.h
@@ -0,0 +1,29 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LIB_TRANSPOSE_TRANSDATA_COMMON_H
+#define LIB_TRANSPOSE_TRANSDATA_COMMON_H
+
+namespace AscendC {
+template <typename T, typename U>
+struct TransDataParams {
+    T srcLayout;
+    U dstLayout;
+};
+
+#ifndef ASCC_PARAM_TRANSDATACONFIG
+#define ASCC_PARAM_TRANSDATACONFIG
+struct TransDataConfig {
+    DataFormat srcFormat;
+    DataFormat dstFormat;
+};
+#endif // ASCC_PARAM_TRANSDATACONFIG
+} // namespace AscendC
+
+#endif // LIB_TRANSPOSE_TRANSDATA_COMMON_H
\ No newline at end of file
diff --git a/lib/transpose/transdata_tiling.h b/lib/transpose/transdata_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2d722218a7c5556d30910c293a1f808c54b3d7e
--- /dev/null
+++ b/lib/transpose/transdata_tiling.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file transdata_tiling.h
+ * \brief
+ */
+#ifndef LIB_TRANSPOSE_TRANSDATA_TILING_H
+#define LIB_TRANSPOSE_TRANSDATA_TILING_H
+#include <cstdint>
+#include "graph/tensor.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace AscendC {
+/*
+ * @brief DataFormat
+*/
+#ifndef ASCC_ENUM_DATAFORMAT
+#define ASCC_ENUM_DATAFORMAT
+enum class DataFormat : uint8_t {
+    ND = 0,
+    NZ,
+    NCHW,
+    NC1HWC0,
+    NHWC,
+    NCDHW,
+    NDC1HWC0,
+    FRACTAL_Z_3D,
+};
+#endif // ASCC_ENUM_DATAFORMAT
+
+#ifndef ASCC_PARAM_TRANSDATACONFIG
+#define ASCC_PARAM_TRANSDATACONFIG
+struct TransDataConfig {
+    DataFormat srcFormat;
+    DataFormat dstFormat;
+};
+#endif // ASCC_PARAM_TRANSDATACONFIG
+
+/*!
+ * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied.
+ * The developer selects a proper space size based on this range as the tiling parameter.
+ *
+ * \param [in] platform, targeted platform information
+ * \param [in] srcShape, src tensor shape
+ * \param [in] dstShape, src tensor shape
+ * \param [in] dataType, actual data type of the input
+ * \param [in] config, transdata config
+ * \param [out] maxValue, maximum temporary space required
+ * \param [out] minValue, minimum temporary space required
+ * \return whether get the max/min value successfully
+ */
+bool GetTransDataMaxMinTmpSize(const platform_ascendc::PlatformAscendC &platform,
+                               const ge::Shape &srcShape,
+                               const ge::Shape &dstShape,
+                               const ge::DataType dataType,
+                               const TransDataConfig &config,
+                               uint32_t &maxValue, uint32_t &minValue);
+} // AscendC
+#endif // LIB_TRANSPOSE_TRANSDATA_TILING_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e1f9cd3763fd4813a21ef92eb4d2b6ca7de408c2..a721159336fcde0e830a02746b01f0956ad97038 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -154,6 +154,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES
     ${ASCENDC_TESTS_DIR}/normalization/welfordfinalize/test_operator_welfordfinalize.cpp
     ${ASCENDC_TESTS_DIR}/utils/init_global_memory/test_operator_init_global_memory.cpp
     ${ASCENDC_TESTS_DIR}/normalization/layernormV2/test_operator_layernormV2.cpp
+    ${ASCENDC_TESTS_DIR}/transpose/transdata/*cpp
 )
 
 # ascend910B1 aic test cases
@@ -405,6 +406,7 @@ file(GLOB ASCENDC_TILING_SRC_FILES
     ${ASCENDC_API_DIR}/impl/sort/topk/*.cpp
     ${ASCENDC_API_DIR}/impl/reduce/reduce_tiling.cpp
     ${ASCENDC_API_DIR}/impl/normalization/layernormV2/*.cpp
+    ${ASCENDC_API_DIR}/impl/transpose/transdata/transdata_tiling.cpp
 )
 
 # ascendc_tiling_utest
diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp
index 50d4faa0fc247d615512ec2c2502a7f6967fa3e5..b28356198233bce88a10eacf6c8e1f4b75c44a52 100644
--- a/tests/tiling/test_tiling.cpp
+++ b/tests/tiling/test_tiling.cpp
@@ -32,7 +32,7 @@ protected:
     void TearDown() {}
 };
 
-
+extern void platfrom_stub_set_chip_version(const char *num);
 TEST_F(TestTiling, MultiCoreSmallMN)
 {
     matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4,rnnMatmul5;
@@ -5400,6 +5400,106 @@ TEST_F(TestTiling, TestOneElementBroadCast200)
 }
 #endif
 
+TEST_F(TestTiling, testTransDataTilingUnalignedHw)
+{
+    platfrom_stub_set_chip_version("Ascend910B");
+    uint32_t maxSize;
+    uint32_t minSize;
+    int32_t n = 16;
+    int32_t c = 16;
+    int32_t d = 3;
+    int32_t h = 3;
+    int32_t w = 3;
+    int32_t c0 = 16;
+    int32_t n0 = 16;
+    int32_t c1 = (c + c0 - 1) / c0;
+    int32_t n1 = (n + n0 - 1) / n0;
+    int32_t hw0 = 16;
+    int32_t hw1 = (h * w + hw0 - 1) / hw0;
+    auto ncdhwShape = ge::Shape({ n, c, d, h, w });
+    auto ndc1hwc0Shape = ge::Shape({ n, d, c1, h, w, c0});
+    auto fractalzShape = ge::Shape({ d, c1, h, w, n1, n0, c0});
+    fe::PlatFormInfos platform_info;
+    auto plat = platform_ascendc::PlatformAscendC(&platform_info);
+    TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0};
+    bool ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, ndc1hwc0Shape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 1632);
+    EXPECT_EQ(minSize, 1632);
+
+    config = {DataFormat::NDC1HWC0, DataFormat::NCDHW};
+    ret = GetTransDataMaxMinTmpSize(plat, ndc1hwc0Shape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 2048);
+    EXPECT_EQ(minSize, 2048);
+
+    config = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D};
+    ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, fractalzShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 26112);
+    EXPECT_EQ(minSize, 26112);
+
+    config = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW};
+    ret = GetTransDataMaxMinTmpSize(plat, fractalzShape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2);
+    EXPECT_EQ(minSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2);
+}
+
+TEST_F(TestTiling, testTransDataTilingAlignedHw)
+{
+    platfrom_stub_set_chip_version("Ascend910B");
+    uint32_t maxSize;
+    uint32_t minSize;
+    int32_t n = 5;
+    int32_t c = 30;
+    int32_t d = 2;
+    int32_t h = 4;
+    int32_t w = 8;
+    int32_t c0 = 16;
+    int32_t n0 = 16;
+    int32_t c1 = (c + c0 - 1) / c0;
+    int32_t n1 = (n + n0 - 1) / n0;
+    int32_t hw0 = 16;
+    int32_t hw1 = (h * w + hw0 - 1) / hw0;
+    auto ncdhwShape = ge::Shape({ n, c, d, h, w });
+    auto ndc1hwc0Shape = ge::Shape({ n, d, c1, h, w, c0});
+    auto fractalzShape = ge::Shape({ d, c1, h, w, n1, n0, c0});
+    fe::PlatFormInfos platform_info;
+    auto plat = platform_ascendc::PlatformAscendC(&platform_info);
+    TransDataConfig config = {DataFormat::NCDHW, DataFormat::NDC1HWC0};
+    bool ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, ndc1hwc0Shape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 4224);
+    EXPECT_EQ(minSize, 4224);
+
+    config = {DataFormat::NDC1HWC0, DataFormat::NCDHW};
+    ret = GetTransDataMaxMinTmpSize(plat, ndc1hwc0Shape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 4608);
+    EXPECT_EQ(minSize, 4608);
+
+    config = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D};
+    ret = GetTransDataMaxMinTmpSize(plat, ncdhwShape, fractalzShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, 69376);
+    EXPECT_EQ(minSize, 69376);
+
+    config = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW};
+    ret = GetTransDataMaxMinTmpSize(plat, fractalzShape, ncdhwShape, ge::DataType::DT_FLOAT16, config, maxSize, minSize);
+
+    EXPECT_TRUE(ret);
+    EXPECT_EQ(maxSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2 * 2);
+    EXPECT_EQ(minSize, n1 * n0 * c1 * c0 * d * hw0 * hw1 * 2 * 2);
+}
+
 TEST_F(TestTiling, TestReduceXorSumTilingInt16)
 {
     std::vector<int64_t> shapeDims = { 128, 128 };
diff --git a/tests/transpose/transdata/test_operator_transdata.cpp b/tests/transpose/transdata/test_operator_transdata.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d50408a97e8ab49666de9d50fb3fb65d5d74e982
--- /dev/null
+++ b/tests/transpose/transdata/test_operator_transdata.cpp
@@ -0,0 +1,267 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <gtest/gtest.h>
+#include "kernel_operator.h"
+
+#include <iostream>
+#include <fstream>
+
+namespace AscendC {
+
+namespace {
+
+constexpr uint32_t NCDHW_FractalZ3D = 1;
+constexpr uint32_t FractalZ3D_NCDHW = 2;
+constexpr uint32_t NCDHW_NDC1HWC0 = 3;
+constexpr uint32_t NDC1HWC0_NCDHW = 4;
+
+
+constexpr TransDataConfig config1 = {DataFormat::NCDHW, DataFormat::FRACTAL_Z_3D};
+constexpr TransDataConfig config2 = {DataFormat::FRACTAL_Z_3D, DataFormat::NCDHW};
+constexpr TransDataConfig config3 = {DataFormat::NCDHW, DataFormat::NDC1HWC0};
+constexpr TransDataConfig config4 = {DataFormat::NDC1HWC0, DataFormat::NCDHW};
+
+}
+
+template <typename T, uint32_t mode>
+class KernelTransData {
+public:
+__aicore__ inline KernelTransData() {}
+__aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm,
+    int32_t n, int32_t c, int32_t d, int32_t h, int32_t w, TPipe *tpipe)
+{
+    this->d = d;
+    this->c = c;
+    this->h = h;
+    this->w = w;
+    this->n = n;
+    this->c1 = (c + c0 - 1) / c0;
+    this->n1 = (n + n0 - 1) / n0;
+    this->hw1 = (h*w + hw0 - 1) / hw0;
+
+    if (mode == NDC1HWC0_NCDHW) {
+        this->srcShapeSize = n * c1 * c0 * d * h * w;
+        this->dstShapeSize = n * d * c * hw0;
+        this->tmpShapeSize = 512 + d * c1 * c0 * hw0 * hw1;
+        uint32_t dstGmSize = n * c * d * h * w;
+        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), srcShapeSize * sizeof(T));
+        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dstGmSize * sizeof(T));
+    } else {
+        if constexpr (mode == NCDHW_FractalZ3D) {
+            srcShapeSize = n * d * c * hw0 * hw1;
+            dstShapeSize = n1 * n0 * c1 * c0 * d * h * w;
+            if ((h*w) % 16 != 0 ) {
+                needPad = true;
+                dstShapeSize = n1 * n0 * c1 * c0 * d * hw0 * hw1;
+            }
+            tmpShapeSize = c * d * hw0 * hw1 + n1 * n0 * d * c1 * c0 * hw0 * hw1;
+        } else if constexpr (mode == FractalZ3D_NCDHW) {
+            this->srcShapeSize = d * c1 * h * w * n1 * n0 * c0;
+            this->dstShapeSize = n * c * d * (hw1 * hw0);
+            this->tmpShapeSize = d * c1 * (hw1 * hw0) * n1 * n0 * c0 * 2;
+        } else if constexpr (mode == NCDHW_NDC1HWC0) {
+            this->srcShapeSize = n * d * c * hw0;
+            this->dstShapeSize = n * c1 * c0 * d * h * w;
+            this->tmpShapeSize = d * hw0 * hw1 + d * c1 * c0 * hw0 * hw1;
+        }
+        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(srcGm), srcShapeSize * sizeof(T));
+        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(dstGm), dstShapeSize * sizeof(T));
+    }
+
+
+    this->pipe = tpipe;
+    pipe->InitBuffer(inQueue, 1, srcShapeSize * sizeof(T));
+    pipe->InitBuffer(outQueue, 1, dstShapeSize * sizeof(T));
+    pipe->InitBuffer(tmpBuf, tmpShapeSize * sizeof(T));
+
+}
+__aicore__ inline void Process()
+{
+    CopyIn();
+    Compute();
+    CopyOut();
+}
+
+private:
+__aicore__ inline void CopyIn()
+{
+    LocalTensor<T> srcLocal = inQueue.AllocTensor<T>();
+    if constexpr (mode == NCDHW_FractalZ3D || mode == NCDHW_NDC1HWC0) {
+        DataCopyExtParams extParam = {static_cast<uint16_t>(n * c * d),
+            static_cast<uint32_t>(h * w * sizeof(T)), 0, 0, 0};
+        DataCopyPadExtParams<T> padParam = {true, 0, 0, 0};
+        if (needPad) {
+            DataCopyPad(srcLocal, srcGlobal, extParam, padParam);
+        } else {
+            DataCopy(srcLocal, srcGlobal, srcShapeSize);
+        }
+    } else if constexpr (mode == FractalZ3D_NCDHW || mode == NDC1HWC0_NCDHW) {
+        DataCopy(srcLocal, srcGlobal, srcShapeSize);
+    }
+
+    inQueue.EnQue(srcLocal);
+}
+__aicore__ inline void Compute()
+{
+    LocalTensor<T> dstLocal = outQueue.AllocTensor<T>();
+    LocalTensor<uint8_t> tmp = tmpBuf.Get<uint8_t>();
+    LocalTensor<T> srcLocal = inQueue.DeQue<T>();
+    PipeBarrier<PIPE_V>();
+
+    Layout ncdhwLayout = MakeLayout(MakeShape(n, c, d, h, w), MakeStride());
+    Layout ndc1hwc0Layout = MakeLayout(MakeShape(n, d, c1, h, w, c0), MakeStride());
+    Layout fractalLayout = MakeLayout(MakeShape(d, c1, h, w, n1, n0, c0), MakeStride());
+
+    if constexpr (mode == NCDHW_FractalZ3D) {
+        TransDataParams<decltype(ncdhwLayout), decltype(fractalLayout)> params = {ncdhwLayout, fractalLayout};
+        TransData<config1>(dstLocal, srcLocal, tmp, params);
+    } else if constexpr (mode == FractalZ3D_NCDHW) {
+        TransDataParams<decltype(fractalLayout), decltype(ncdhwLayout)> params = {fractalLayout, ncdhwLayout};
+        TransData<config2, T, decltype(fractalLayout), decltype(ncdhwLayout)>(dstLocal, srcLocal, tmp, params);
+    } else if constexpr (mode == NCDHW_NDC1HWC0) {
+        TransDataParams<decltype(ncdhwLayout), decltype(ndc1hwc0Layout)> params = {ncdhwLayout, ndc1hwc0Layout};
+        TransData<config3>(dstLocal, srcLocal, tmp, params);
+    } else if constexpr (mode == NDC1HWC0_NCDHW) {
+        TransDataParams<decltype(ndc1hwc0Layout), decltype(ncdhwLayout)> params = {ndc1hwc0Layout, ncdhwLayout};
+        TransData<config4>(dstLocal, srcLocal, tmp, params);
+    }
+
+    outQueue.EnQue<T>(dstLocal);
+    inQueue.FreeTensor(srcLocal);
+
+}
+__aicore__ inline void CopyOut()
+{
+    LocalTensor<T> dstLocal = outQueue.DeQue<T>();
+    DataCopyExtParams extParam {static_cast<uint16_t>(n * c * d), static_cast<uint32_t>(h*w*sizeof(T)), 0, 0, 0};
+    if constexpr (mode == NCDHW_FractalZ3D) {
+        DataCopy(dstGlobal, dstLocal, n1 * n0 * c1);
+    } else if constexpr (mode == FractalZ3D_NCDHW) {
+        DataCopy(dstGlobal, dstLocal, dstShapeSize);
+    } else if constexpr (mode == NCDHW_NDC1HWC0) {
+        DataCopy(dstGlobal, dstLocal, dstShapeSize);
+    } else if constexpr (mode == NDC1HWC0_NCDHW) {
+        DataCopyPad(dstGlobal, dstLocal, extParam);
+    }
+    outQueue.FreeTensor(dstLocal);
+}
+
+private:
+    GlobalTensor<T> srcGlobal;
+    GlobalTensor<T> dstGlobal;
+    TPipe *pipe;
+    TQue<TPosition::VECIN, 1> inQueue;
+    TQue<TPosition::VECOUT, 1> outQueue;
+    TBuf<QuePosition::VECIN> tmpBuf;
+    bool needPad = false;
+    int32_t n = 0;
+    int32_t c = 0;
+    int32_t d = 0;
+    int32_t h = 0;
+    int32_t w = 0;
+    int32_t n1 = 0;
+    int32_t c1 = 0;
+    int32_t hw1 = 0;
+    int32_t c0 = 16;
+    int32_t n0 = 16;
+    int32_t hw0 = 16;
+    uint32_t srcShapeSize = 0;
+    uint32_t dstShapeSize = 0;
+    uint32_t tmpShapeSize = 0;
+};
+} // namespace AscendC
+
+template <typename T, uint32_t mode>
+__global__ __aicore__ void MainTransdata(
+    __gm__ uint8_t* dstGm, __gm__ uint8_t* srcGm, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w)
+{
+    if (g_coreType == AscendC::AIC || AscendC::GetBlockIdx() > 0) {
+        return;
+    }
+    AscendC::TPipe pipe;
+    AscendC::KernelTransData<T, mode> op;
+    op.Init(srcGm, dstGm, n, c, d, h, w, &pipe);
+    op.Process();
+}
+
+struct TransDataTestParams {
+    int32_t n;
+    int32_t c;
+    int32_t d;
+    int32_t h;
+    int32_t w;
+    uint32_t mode;
+    void (*cal_func)(uint8_t*, uint8_t*, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
+};
+
+class TransDataTestsuite : public testing::Test, public testing::WithParamInterface<TransDataTestParams> {
+protected:
+    void SetUp()
+    {
+        AscendC::SetGCoreType(2);
+    }
+
+    void TearDown()
+    {
+        AscendC::SetGCoreType(0);
+    }
+};
+
+INSTANTIATE_TEST_CASE_P(TEST_OPERATTION_TRANSDATA, TransDataTestsuite,
+    ::testing::Values(
+        TransDataTestParams { 5, 32, 2, 1, 16, 1, MainTransdata<half, 1> },
+        TransDataTestParams { 4, 31, 1, 6, 7, 2, MainTransdata<half, 2> },
+        TransDataTestParams { 4, 20, 2, 3, 1, 3, MainTransdata<half, 3> },
+        TransDataTestParams { 8, 14, 2, 1, 16, 4, MainTransdata<half, 4> },
+        TransDataTestParams { 5, 32, 2, 1, 16, 1, MainTransdata<bfloat16_t, 1> },
+        TransDataTestParams { 4, 31, 1, 6, 7, 2, MainTransdata<bfloat16_t, 2> },
+        TransDataTestParams { 4, 20, 2, 3, 1, 3, MainTransdata<bfloat16_t, 3> },
+        TransDataTestParams { 8, 14, 2, 1, 16, 4, MainTransdata<bfloat16_t, 4> }
+
+        ));
+
+TEST_P(TransDataTestsuite, TransDataOpTestCase)
+{
+    auto params = GetParam();
+    auto n = params.n;
+    auto c = params.c;
+    auto d = params.d;
+    auto h = params.h;
+    auto w = params.w;
+    auto mode = params.mode;
+    uint32_t srcShapeSize;
+    uint32_t dstShapeSize;
+    int32_t hw0 = 16;
+    int32_t hw1 = (h * w + hw0 - 1) / hw0;
+    int32_t c0 = 16;
+    int32_t n0 = 16;
+    int32_t c1 = (c + c0 - 1) / c0;
+    int32_t n1 = (n + n0 - 1) / n0;
+    if (mode == 1) {
+        srcShapeSize = n * d * c * hw0 * hw1;
+        dstShapeSize = n1 * n0 * c1 * c0 * d * h * w;
+        if ((h*w) % 16 != 0 ) {
+            dstShapeSize = n1 * n0 * c1 * c0 * d * hw0 * hw1;
+        }
+    } else if (mode == 2) {
+        srcShapeSize = d * c1 * h * w * n1 * n0 * c0;
+        dstShapeSize = n * c * d * (hw1 * hw0);
+    } else if (mode == 3) {
+        srcShapeSize = n * d * c * hw0;
+        dstShapeSize = n * c1 * c0 * d * h * w;
+    } else if (mode == 4) {
+        srcShapeSize = n * c1 * c0 * d * h * w;
+        dstShapeSize = n * d * c * hw0;
+    }
+    uint8_t srcGm[srcShapeSize * sizeof(half)] = {0}; // 外部保证inner是32B对齐
+    uint8_t dstGm[dstShapeSize * sizeof(half)] = {0};
+    params.cal_func(dstGm, srcGm, n, c, d, h, w);
+    EXPECT_EQ(dstGm[0], 0);
+}