From dee3a83c1422ce9a7b940bcade7b437c1690dfd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B3=BD=E4=B8=9C?= Date: Wed, 13 Nov 2024 02:06:33 +0000 Subject: [PATCH 1/2] copy cube in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 姜泽东 --- impl/matmul/matmul_impl.h | 78 ++++++------ impl/matmul/modules/matmul_private_modules.h | 8 ++ impl/matmul/modules/matmul_subblock_info.h | 36 ++++++ impl/matmul/modules/matmul_tensor_info.h | 118 ++++++++++++++++++ impl/matmul/modules/matmul_var.h | 106 ---------------- .../copy_cube_in/batch/batch_copy_cube_in.h | 30 +++-- .../batch/batch_copy_cube_in_intf.h | 89 +++++++++++++ .../batch/batch_copy_cube_in_v200.h | 30 +++-- .../stage/copy_cube_in/copy_cube_in_from_l1.h | 7 +- .../stage/copy_cube_in/copy_cube_in_mdl.h | 5 +- .../stage/copy_cube_in/copy_cube_in_norm.h | 16 +-- .../copy_cube_in/copy_cube_in_using_ub.h | 19 +-- .../stage/copy_cube_in/data_copy_wrapper.h | 31 ++--- lib/matmul/matmul.h | 23 ++++ .../copy_cube_in/test_copy_cube_in_mdl.cpp | 11 +- .../test_copy_cube_in_mdl_310p.cpp | 11 +- .../copy_cube_in/test_copy_cube_in_norm.cpp | 11 +- .../test_copy_cube_in_norm_310p.cpp | 11 +- tests/matmul/test_matmul_var.cpp | 7 +- 19 files changed, 435 insertions(+), 212 deletions(-) create mode 100644 impl/matmul/modules/matmul_subblock_info.h create mode 100644 impl/matmul/modules/matmul_tensor_info.h create mode 100644 impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index bf3552e9..0274e182 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -526,8 +526,8 @@ __aicore__ inline void MatmulImplInit(); - MATMUL_MODULE(CopyCubeInB)->Init(); + MATMUL_MODULE(BatchCopyCubeInA)->Init(); + MATMUL_MODULE(BatchCopyCubeInB)->Init(); uint32_t lenFactor = 1; #if __CCE_AICORE__ >= 220 @@ -630,8 +630,8 @@ __aicore__ inline void MatmulImplInit(); - MATMUL_MODULE(CopyCubeInB)->Init(); + MATMUL_MODULE(BatchCopyCubeInA)->Init(); + MATMUL_MODULE(BatchCopyCubeInB)->Init(); uint32_t lenFactor = 1; #if __CCE_AICORE__ >= 220 @@ -1078,8 +1078,8 @@ template __aicore__ inline void MatmulImpl::EndNorm() { - MATMUL_MODULE(CopyCubeInA)->Destroy(); - MATMUL_MODULE(CopyCubeInB)->Destroy(); + MATMUL_MODULE(ChosenCopyCubeInA)->Destroy(); + MATMUL_MODULE(ChosenCopyCubeInB)->Destroy(); #if __CCE_AICORE__ == 220 if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { @@ -1251,7 +1251,7 @@ __aicore__ inline void MatmulImplSetInput(gm.address_, isTransposeA); + MATMUL_MODULE(ChosenCopyCubeInA)->SetInput(gm.address_, isTransposeA); if constexpr (DoMatmulNorm(MM_CFG)) { IterateController::Reset(); } else { @@ -1365,7 +1365,7 @@ __aicore__ inline void MatmulImplSetInput(gm.address_, isTransposeB); + MATMUL_MODULE(ChosenCopyCubeInB)->SetInput(gm.address_, isTransposeB); if constexpr (DoMatmulNorm(MM_CFG)) { IterateController::Reset(); } else { @@ -2345,12 +2345,12 @@ __aicore__ inline void MatmulImplLoadData(0, 0, var.tailM_, var.tailK_); + a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(0, 0, var.tailM_, var.tailK_); if constexpr (!ToMatmulConfig(MM_CFG).intraBlockPartSum) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_); } else { if (intraBlockMatmul.fakeMsg) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tailK_, var.tailN_); } } var.baseUseK_ = var.tailK_; @@ -2358,12 +2358,12 @@ __aicore__ inline void MatmulImplLoadData(0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseK()); + a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(0, 0, var.tiling_.GetBaseM(), var.tiling_.GetBaseK()); if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN()); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN()); } else if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { if (intraBlockMatmul.fakeMsg) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN()); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, 0, var.tiling_.GetBaseK(), var.tiling_.GetBaseN()); } } } @@ -2450,9 +2450,9 @@ __aicore__ inline void MatmulImplClearLoadData(a1); + MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1); if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) { - MATMUL_MODULE(CopyCubeInB)->ClearLoadData(b1); + MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1); } } else { // not basic for (int k = 0; k < var.kIter_; k++) { // start reduce K axis @@ -2466,12 +2466,12 @@ __aicore__ inline void MatmulImplLoadData(var.curM_, k, var.baseUseM_, var.baseUseK_); + a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_, k, var.baseUseM_, var.baseUseK_); if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_); } else if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { if (intraBlockMatmul.fakeMsg) { - b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_); + b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(k, var.curN_, var.baseUseK_, var.baseUseN_); } } if constexpr (!IsStaticPaddingEnable(MM_CFG)) { @@ -2571,9 +2571,9 @@ __aicore__ inline void MatmulImplClearLoadData(a1, var.curM_, k); + MATMUL_MODULE(ChosenCopyCubeInA)->ClearLoadData(a1, var.curM_, k); if constexpr(!ToMatmulConfig(MM_CFG).intraBlockPartSum) { - MATMUL_MODULE(CopyCubeInB)->ClearLoadData(b1, k, var.curN_); + MATMUL_MODULE(ChosenCopyCubeInB)->ClearLoadData(b1, k, var.curN_); } } } @@ -2678,8 +2678,8 @@ __aicore__ inline void MatmulImpl bias; var.baseUseK_ = var.tailK_; var.blockUseK_ = Ceil(var.baseUseK_, c0Size_); - auto a1 = MATMUL_MODULE(CopyCubeInA)->LoadData(var.curM_, 0, var.baseUseM_, var.baseUseK_); - auto b1 = MATMUL_MODULE(CopyCubeInB)->LoadData(0, var.curN_, var.baseUseK_, var.baseUseN_); + auto a1 = MATMUL_MODULE(ChosenCopyCubeInA)->LoadData(var.curM_, 0, var.baseUseM_, var.baseUseK_); + auto b1 = MATMUL_MODULE(ChosenCopyCubeInB)->LoadData(0, var.curN_, var.baseUseK_, var.baseUseN_); // set addr MatmulInstr::sAL1K_ = var.blockUseK_ * c0Size_; MatmulInstr::sBL1K_ = var.blockUseK_ * c0Size_; @@ -5002,7 +5002,7 @@ __aicore__ inline void MatmulImpl batchB_ ? batchA_ : batchB_; int32_t splitSize = (batchNum >= 2) && (batchA_ % 2 == 0) && (batchB_ % 2 == 0)? 2 : 1; int32_t splitBatchNum = batchNum / splitSize; - auto matrixA = MATMUL_MODULE(CopyCubeInA)->AllocTensor(); - auto matrixB = MATMUL_MODULE(CopyCubeInB)->AllocTensor(); + auto matrixA = MATMUL_MODULE(BatchCopyCubeInA)->AllocTensor(); + auto matrixB = MATMUL_MODULE(BatchCopyCubeInB)->AllocTensor(); #if __CCE_AICORE__ >= 220 // Transfer the batchNum Bias matrix to L1 at one time. if constexpr (ToMatmulConfig(MM_CFG).enableSetBias && @@ -5189,8 +5188,8 @@ __aicore__ inline void MatmulImpl(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1)); event_t eventIDMToMte1 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::M_MTE1)); for (int32_t outer = 0; outer < splitSize; ++outer) { - MATMUL_MODULE(CopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize); - MATMUL_MODULE(CopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize); + MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize); + MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize); SetFlag(eventIDMte2ToMte1); WaitFlag(eventIDMte2ToMte1); ASSERT(batchA_ > 0 && batchB_ > 0 && (batchA_ % batchB_ == 0 || batchB_ % batchA_ == 0)); @@ -5245,8 +5244,8 @@ __aicore__ inline void MatmulImplBatchDestroy(); - MATMUL_MODULE(CopyCubeInB)->BatchDestroy(); + MATMUL_MODULE(BatchCopyCubeInA)->BatchDestroy(); + MATMUL_MODULE(BatchCopyCubeInB)->BatchDestroy(); } } @@ -5268,8 +5267,8 @@ __aicore__ inline void MatmulImpl batchB_ ? batchA_ : batchB_; int32_t splitSize = (batchNum >= 2) && (batchA_ % 2 == 0) && (batchB_ % 2 == 0)? 2 : 1; int32_t splitBatchNum = batchNum / splitSize; - auto matrixA = MATMUL_MODULE(CopyCubeInA)->AllocTensor(); - auto matrixB = MATMUL_MODULE(CopyCubeInB)->AllocTensor(); + auto matrixA = MATMUL_MODULE(BatchCopyCubeInA)->AllocTensor(); + auto matrixB = MATMUL_MODULE(BatchCopyCubeInB)->AllocTensor(); #if __CCE_AICORE__ >= 220 // Transfer the batchNum Bias matrix to L1 at one time. if constexpr (ToMatmulConfig(MM_CFG).enableSetBias && @@ -5281,8 +5280,8 @@ __aicore__ inline void MatmulImpl(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1)); event_t eventIDMToMte1 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::M_MTE1)); for (int32_t outer = 0; outer < splitSize; ++outer) { - MATMUL_MODULE(CopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize); - MATMUL_MODULE(CopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize); + MATMUL_MODULE(BatchCopyCubeInA)->BatchLoad(matrixA, matrixStrideA, batchOuterIdx, outer, splitSize); + MATMUL_MODULE(BatchCopyCubeInB)->BatchLoad(matrixB, matrixStrideB, batchOuterIdx, outer, splitSize); SetFlag(eventIDMte2ToMte1); WaitFlag(eventIDMte2ToMte1); ASSERT(batchA_ > 0 && batchB_ > 0 && (batchA_ % batchB_ == 0 || batchB_ % batchA_ == 0)); @@ -5328,8 +5327,8 @@ __aicore__ inline void MatmulImplBatchDestroy(); - MATMUL_MODULE(CopyCubeInB)->BatchDestroy(); + MATMUL_MODULE(BatchCopyCubeInA)->BatchDestroy(); + MATMUL_MODULE(BatchCopyCubeInB)->BatchDestroy(); } } @@ -6796,8 +6795,7 @@ __aicore__ inline void MatmulImpl>; using DataCopyUtilsA = DataCopyWrapper>; using DataCopyUtilsB = DataCopyWrapper>; + using MatmulTensorInfoA = MatmulTensorInfo>; + using MatmulTensorInfoB = MatmulTensorInfo>; + using MatmulSubBlockInfo = matmul::MatmulSubBlockInfo; using BatchLayoutA = BatchLayout>; using BatchLayoutB = BatchLayout>; using DataWarp = matmul::DataWarp; using QuantProcessor = matmul::QuantProcessor::GetCubeOutType()>; + using BatchCopyCubeInA = BatchCopyCubeIn, MM_CFG>; + using BatchCopyCubeInB = BatchCopyCubeIn, MM_CFG>; }; } #endif // _MATMUL_PRIVATE_MODULES_H_ \ No newline at end of file diff --git a/impl/matmul/modules/matmul_subblock_info.h b/impl/matmul/modules/matmul_subblock_info.h new file mode 100644 index 00000000..4922616b --- /dev/null +++ b/impl/matmul/modules/matmul_subblock_info.h @@ -0,0 +1,36 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_subblock_info.h + * \brief matmul variable manager + */ + +#ifndef IMPL_MATMUL_MODULES_MATMUL_SUBBLOCK_INFO_H +#define IMPL_MATMUL_MODULES_MATMUL_SUBBLOCK_INFO_H + +#include "matmul_module.h" + +namespace matmul { +template +class MatmulSubBlockInfo { +public: + __aicore__ inline uint8_t GetSubBlockIdx() const + { + return MATMUL_CONST_PARAM_VAR.subBlockIdx_; + } + + __aicore__ inline bool IsFakeIntraBlock() const + { + return MATMUL_CONST_INTRA_BLOCK.fakeMsg || MATMUL_CONST_PARAM_VAR.subBlockIdx_ == 0; + } +}; +} // namespace matmul +#endif // IMPL_MATMUL_MODULES_MATMUL_SUBBLOCK_INFO_H \ No newline at end of file diff --git a/impl/matmul/modules/matmul_tensor_info.h b/impl/matmul/modules/matmul_tensor_info.h new file mode 100644 index 00000000..487de9d2 --- /dev/null +++ b/impl/matmul/modules/matmul_tensor_info.h @@ -0,0 +1,118 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_tensor_info.h + * \brief matmul variable manager + */ + +#ifndef IMPL_MATMUL_MODULES_MATMUL_TENSOR_INFO_H +#define IMPL_MATMUL_MODULES_MATMUL_TENSOR_INFO_H + +#include "matmul_module.h" + +namespace matmul { +template +class MatmulTensorInfo { + using SrcT = typename INPUT_TYPE::T; +public: + template + __aicore__ inline __gm__ SrcT* GetGlobalAddr() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.aGlobal; + } else { + return MATMUL_CONST_PARAM_VAR.aGlobal_; + } + } + + __aicore__ inline TBuffAddr GetLocalAddr() const + { + return MATMUL_CONST_PARAM_VAR.leftMatrix_; + } + + template + __aicore__ inline void SetGlobalAddr(__gm__ SrcT* srcGlobalAddr, bool isTranspose) + { + if constexpr (IS_INTRA_BLOCK) { + MATMUL_INTRA_BLOCK.aGlobal = srcGlobalAddr; + MATMUL_INTRA_BLOCK.isTransposeA = isTranspose; + } else { + MATMUL_PARAM_VAR.aGlobal_ = srcGlobalAddr; + MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; + } + } + + __aicore__ inline void SetLocalAddr(const TBuffAddr& address, bool isTranspose) + { + MATMUL_PARAM_VAR.leftMatrix_ = address; + MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; + } + + __aicore__ inline uint64_t GetSelfDefineData() const + { + return MATMUL_CONST_PARAM_VAR.dataPtr_; + } + + __aicore__ inline uint64_t GetUserDefineInfo() const + { + return MATMUL_CONST_PARAM_VAR.tilingPtr_; + } +}; + +template +class MatmulTensorInfo> { + using SrcT = typename INPUT_TYPE::T; +public: + template + __aicore__ inline __gm__ SrcT* GetGlobalAddr() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.bGlobal; + } else { + return MATMUL_CONST_PARAM_VAR.bGlobal_; + } + } + + __aicore__ inline TBuffAddr GetLocalAddr() const + { + return MATMUL_CONST_PARAM_VAR.rightMatrix_; + } + + template + __aicore__ inline void SetGlobalAddr(__gm__ SrcT* srcGlobalAddr, bool isTranspose) + { + if constexpr (IS_INTRA_BLOCK) { + MATMUL_INTRA_BLOCK.bGlobal = srcGlobalAddr; + MATMUL_INTRA_BLOCK.isTransposeB = isTranspose; + } else { + MATMUL_PARAM_VAR.bGlobal_ = srcGlobalAddr; + MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; + } + } + + __aicore__ inline void SetLocalAddr(const TBuffAddr& address, bool isTranspose) + { + MATMUL_PARAM_VAR.rightMatrix_ = address; + MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; + } + + __aicore__ inline uint64_t GetSelfDefineData() const + { + return MATMUL_CONST_PARAM_VAR.dataPtr_; + } + + __aicore__ inline uint64_t GetUserDefineInfo() const + { + return MATMUL_CONST_PARAM_VAR.tilingPtr_; + } +}; +} // namespace matmul +#endif // IMPL_MATMUL_MODULES_MATMUL_TENSOR_INFO_H \ No newline at end of file diff --git a/impl/matmul/modules/matmul_var.h b/impl/matmul/modules/matmul_var.h index 36181b4c..bdc2c359 100644 --- a/impl/matmul/modules/matmul_var.h +++ b/impl/matmul/modules/matmul_var.h @@ -29,39 +29,6 @@ public: return MATMUL_CONST_PARAM_VAR.tiling_; } - template - __aicore__ inline __gm__ SrcT* GetGlobalAddr() const - { - if constexpr (IS_INTRA_BLOCK) { - return MATMUL_CONST_INTRA_BLOCK.aGlobal; - } else { - return MATMUL_CONST_PARAM_VAR.aGlobal_; - } - } - - __aicore__ inline TBuffAddr GetLocalAddr() const - { - return MATMUL_CONST_PARAM_VAR.leftMatrix_; - } - - template - __aicore__ inline void SetGlobalAddr(__gm__ SrcT* srcGlobalAddr, bool isTranspose) - { - if constexpr (IS_INTRA_BLOCK) { - MATMUL_INTRA_BLOCK.aGlobal = srcGlobalAddr; - MATMUL_INTRA_BLOCK.isTransposeA = isTranspose; - } else { - MATMUL_PARAM_VAR.aGlobal_ = srcGlobalAddr; - MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; - } - } - - __aicore__ inline void SetLocalAddr(const TBuffAddr& address, bool isTranspose) - { - MATMUL_PARAM_VAR.leftMatrix_ = address; - MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; - } - template __aicore__ inline int32_t GetOrgHeight() { @@ -311,26 +278,6 @@ public: return IsTranspose(); } - __aicore__ inline uint64_t GetSelfDefineData() const - { - return MATMUL_CONST_PARAM_VAR.dataPtr_; - } - - __aicore__ inline uint64_t GetUserDefineInfo() const - { - return MATMUL_CONST_PARAM_VAR.tilingPtr_; - } - - __aicore__ inline uint8_t GetSubBlockIdx() const - { - return MATMUL_CONST_PARAM_VAR.subBlockIdx_; - } - - __aicore__ inline bool IsFakeIntraBlock() const - { - return MATMUL_CONST_INTRA_BLOCK.fakeMsg || MATMUL_CONST_PARAM_VAR.subBlockIdx_ == 0; - } - #if __CCE_AICORE__ < 220 __aicore__ inline LocalTensor GetLocalWorkspace(int offset) const { @@ -384,39 +331,6 @@ public: return MATMUL_CONST_PARAM_VAR.tiling_; } - template - __aicore__ inline __gm__ SrcT* GetGlobalAddr() const - { - if constexpr (IS_INTRA_BLOCK) { - return MATMUL_CONST_INTRA_BLOCK.bGlobal; - } else { - return MATMUL_CONST_PARAM_VAR.bGlobal_; - } - } - - __aicore__ inline TBuffAddr GetLocalAddr() const - { - return MATMUL_CONST_PARAM_VAR.rightMatrix_; - } - - template - __aicore__ inline void SetGlobalAddr(__gm__ SrcT* srcGlobalAddr, bool isTranspose) - { - if constexpr (IS_INTRA_BLOCK) { - MATMUL_INTRA_BLOCK.bGlobal = srcGlobalAddr; - MATMUL_INTRA_BLOCK.isTransposeB = isTranspose; - } else { - MATMUL_PARAM_VAR.bGlobal_ = srcGlobalAddr; - MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; - } - } - - __aicore__ inline void SetLocalAddr(const TBuffAddr& address, bool isTranspose) - { - MATMUL_PARAM_VAR.rightMatrix_ = address; - MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; - } - template __aicore__ inline int32_t GetOrgHeight() { @@ -666,26 +580,6 @@ public: return !IsTranspose(); } - __aicore__ inline uint64_t GetSelfDefineData() const - { - return MATMUL_CONST_PARAM_VAR.dataPtr_; - } - - __aicore__ inline uint64_t GetUserDefineInfo() const - { - return MATMUL_CONST_PARAM_VAR.tilingPtr_; - } - - __aicore__ inline uint8_t GetSubBlockIdx() const - { - return MATMUL_CONST_PARAM_VAR.subBlockIdx_; - } - - __aicore__ inline bool IsFakeIntraBlock() const - { - return MATMUL_CONST_INTRA_BLOCK.fakeMsg || MATMUL_CONST_PARAM_VAR.subBlockIdx_ == 0; - } - #if __CCE_AICORE__ < 220 __aicore__ inline LocalTensor GetLocalWorkspace(int offset) const { diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h index 24593841..39d8a24c 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -15,14 +15,18 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H -#include "../copy_cube_in_intf.h" +#include "batch_copy_cube_in_intf.h" #include "batch_layout.h" +#include "../data_copy_wrapper.h" +#include "../../../resource/cube_in_buffer/cube_in_buffer.h" +#include "../copy_cube_in_params.h" +#include "../../../matmul_var.h" namespace matmul { // Specialized Template Class of Batch Matmul CopyIn // Batch Matmul ND Format Data CopyIn From GM/UB template -class CopyCubeIn::IsNeedUB() && GetCopyCubeInType() == CopyCubeInType::BMM && INPUT_TYPE::format == CubeFormat::ND>> @@ -36,8 +40,8 @@ class CopyCubeInSetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } @@ -82,7 +86,7 @@ public: __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); return localTensor; } @@ -130,7 +134,7 @@ private: for (int32_t idx = 0; idx < iterNum; ++idx) { dstOffset += iterOffset; GlobalTensor srcGlobal; - srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); srcGlobal.SetAddr(iterOffset + batchOffset); if (srcStride >= UINT16_MAX) { for (int i = 0; i < batchNum; ++i) { @@ -169,13 +173,13 @@ class CopyCubeInSetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } @@ -215,7 +219,7 @@ public: __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); return localTensor; } @@ -257,7 +261,7 @@ private: bool iskRowDirec = IS_KROW && IsSameTypeV; GlobalTensor srcGlobal; - srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); srcGlobal.SetAddr(batchOffset); int32_t alignHeight = CeilAlign(height, BLOCK_CUBE); diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h new file mode 100644 index 00000000..396b4bd5 --- /dev/null +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h @@ -0,0 +1,89 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! +* \file batch_copy_cube_in_intf.h +* \brief +*/ + +#ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ +#define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ + +namespace matmul { +using namespace AscendC; + +template +class BatchCopyCubeIn +{ + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; +public: + __aicore__ inline BatchCopyCubeIn() = default; + __aicore__ inline ~BatchCopyCubeIn() = default; + /** + * @description: Init of BatchCopyCubeIn + * @return: void + */ + __aicore__ inline void Init() {} + + /** + * @description: Set input global address + * @param: address: Global address input through SetTensorA or SetTensorB + * @param: srcGlobalAddr: true if input tensor is transposed + * @return: void + */ + __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) {} + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) {} + + /** + * @description: Load input data to L1 + * @param: curRow: The row index of the matrixA/B to be loaded at current iterate + * @param: curCol: The column index of the matrixA/B to be loaded at current iterate + * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate + * @param: tileWidth: The width of the matrixA/B tiles to be loaded at current iterate + * @return: Tensor on L1 + */ + __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { + ASCENDC_ASSERT((false), { + KERNEL_LOG(KERNEL_ERROR, "Matching error. This is an empty implementation."); + }); + return NULL_TENSOR; + } + + /** + * @description: Release tensor on l1 at one compute end + * @param: tensor: The tensor on l1 need to be released + * @param: curRow: The row index of the matrixA/B at current iterate + * @param: curCol: The column index of the matrixA/B at current iterate + * @return: void + */ + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) {} + + /* + * @description: Reset buffer status used in copy in + * @return: void + */ + __aicore__ inline void Reset() {} + + /** + * @description: Destory tensor on l1 at iterate end + * @return: void + */ + __aicore__ inline void Destroy() {} + + __aicore__ inline void BatchDestroy() {} +}; + +} +#endif // _BATCH_COPY_CUBE_IN_INTF_H_ \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_v200.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_v200.h index 2d96842b..e7ebbea1 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_v200.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_v200.h @@ -15,14 +15,18 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_V200_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_V200_H -#include "../copy_cube_in_intf.h" +#include "batch_copy_cube_in_intf.h" #include "batch_layout.h" +#include "../data_copy_wrapper.h" +#include "../../../resource/cube_in_buffer/cube_in_buffer.h" +#include "../copy_cube_in_params.h" +#include "../../../matmul_var.h" namespace matmul { // Specialized Template Class of Batch Matmul CopyIn // Batch Matmul ND Format Data CopyIn From GM/UB template -class CopyCubeIn::IsNeedUB()) && GetCopyCubeInType() == CopyCubeInType::BMM && (INPUT_TYPE::format == CubeFormat::ND)>> @@ -33,6 +37,7 @@ private: MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); using TransT = typename INPUT_TYPE::TRANS_T; using SrcT = typename INPUT_TYPE::T; @@ -57,8 +62,8 @@ private: } public: - inline __aicore__ CopyCubeIn() = default; - inline __aicore__ ~CopyCubeIn() = default; + inline __aicore__ BatchCopyCubeIn() = default; + inline __aicore__ ~BatchCopyCubeIn() = default; __aicore__ inline void Init() { @@ -70,7 +75,7 @@ public: __aicore__ inline void SetInput(__gm__ SrcT *srcGlobalAddr, bool isTranspose = false) { - MATMUL_MODULE(MatmulVar)->SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } @@ -97,7 +102,7 @@ public: __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); return localTensor; } @@ -305,7 +310,7 @@ private: // Specialized Template Class of Batch Matmul CopyIn // Batch Matmul NZ Format Data CopyIn From GM/UB template -class CopyCubeIn::IsNeedUB()) && GetCopyCubeInType() == CopyCubeInType::BMM && (INPUT_TYPE::format == CubeFormat::NZ)>> @@ -321,10 +326,11 @@ private: MATMUL_USE_MODULE_ON(BatchLayout, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); public: - inline __aicore__ CopyCubeIn() = default; - inline __aicore__ ~CopyCubeIn() = default; + inline __aicore__ BatchCopyCubeIn() = default; + inline __aicore__ ~BatchCopyCubeIn() = default; __aicore__ inline void Init() { @@ -336,7 +342,7 @@ public: __aicore__ inline void SetInput(__gm__ SrcT *srcGlobalAddr, bool isTranspose = false) { - MATMUL_MODULE(MatmulVar)->SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } @@ -352,7 +358,7 @@ public: auto batchOffset = outerIdx * MATMUL_MODULE(BatchLayout)->GetBatchNum() * MATMUL_MODULE(CopyCubeInParams)->GetSingleSize(); if constexpr (PhyPosIsGM(INPUT_TYPE::pos)) { - srcTensor.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + srcTensor.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); srcTensor.SetAddr(batchOffset); } else { srcTensor.SetAddr(MATMUL_PARAM_VAR.leftMatrix_); @@ -375,7 +381,7 @@ public: __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); return localTensor; } diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h index 8911f296..d6baa0ef 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h @@ -26,6 +26,7 @@ class CopyCubeIn() == CopyCubeInType::FROM_L1>> { MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); using TransT = typename INPUT_TYPE::TRANS_T; using SrcT = typename INPUT_TYPE::T; @@ -39,19 +40,19 @@ public: __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose) { - MATMUL_MODULE(MatmulVar)->SetLocalAddr(address, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); } __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) { - MATMUL_MODULE(MatmulVar)->SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); } __aicore__ inline LocalTensor LoadData( int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) { LocalTensor l1; - l1.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + l1.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); return l1; } diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h index 5e60d5ba..03d0c566 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h @@ -30,6 +30,7 @@ class CopyCubeInSetLocalAddr(address, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) { - MATMUL_MODULE(MatmulVar)->SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h index a6ce7a64..3dd2981e 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h @@ -30,6 +30,8 @@ class CopyCubeInSetLocalAddr(address, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) { if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { - if (MATMUL_MODULE(MatmulVar)->GetSubBlockIdx() == 0) { - MATMUL_MODULE(MatmulVar)->template SetGlobalAddr(srcGlobalAddr, isTranspose); + if (MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() == 0) { + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); } else { - MATMUL_MODULE(MatmulVar)->template SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); } } else { - MATMUL_MODULE(MatmulVar)->template SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); } MATMUL_MODULE(CubeInBuffer)->Reset(); if constexpr (IsSameABTemplate()) { - MATMUL_MODULE(CubeInBuffer)->SetOrgAddr(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + MATMUL_MODULE(CubeInBuffer)->SetOrgAddr(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); } } @@ -81,7 +83,7 @@ public: } else { l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1); if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { - if (MATMUL_MODULE(MatmulVar)->IsFakeIntraBlock()) { + if (MATMUL_MODULE(MatmulSubBlockInfo)->IsFakeIntraBlock()) { MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( l1, curRow, curCol, tileHeight, tileWidth); } else { diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h index 6e7bef82..c0a7a564 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h @@ -42,6 +42,7 @@ class CopyCubeInSetLocalAddr(address, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } __aicore__ inline void SetInput(__gm__ SRC_T *srcGlobalAddr, bool isTranspose) { // do Set Global Input here - MATMUL_MODULE(MatmulVar)->SetGlobalAddr(srcGlobalAddr, isTranspose); + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); MATMUL_MODULE(CubeInBuffer)->Reset(); } @@ -159,7 +160,7 @@ private: { if constexpr (INPUT_TYPE::format == CubeFormat::ND) { GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV && IsSameTypeV) { if (!MATMUL_MODULE(MatmulVar)->IsTranspose()) { @@ -175,7 +176,7 @@ private: } } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) { GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV && IsSameTypeV) { if (!MATMUL_MODULE(MatmulVar)->IsTranspose()) { @@ -196,7 +197,7 @@ private: return false; } GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->GetGlobalAddr()); + aGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); CopyVector2A1(aMatrix, aGlobal, curCol * MATMUL_MODULE(MatmulVar)->template GetBaseWidth(), CeilT(tileWidth, c0Size_)); } else { @@ -240,7 +241,7 @@ private: IsSameTypeV) { if (!MATMUL_MODULE(MatmulVar)->IsTranspose()) { LocalTensor leftMatrix; - leftMatrix.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); CopyND2NZWithTransData(aMatrix, leftMatrix, curRow, curCol, tileHeight, tileWidth); } else { @@ -261,7 +262,7 @@ private: } } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) { LocalTensor leftMatrix; - leftMatrix.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IsSameTypeV && IsSameTypeV) { if (!MATMUL_MODULE(MatmulVar)->IsTranspose()) { @@ -283,7 +284,7 @@ private: return false; } LocalTensor leftMatrix; - leftMatrix.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); CopyVector2A1(aMatrix, leftMatrix, curCol * MATMUL_MODULE(MatmulVar)->template GetBaseWidth(), CeilT(tileWidth, c0Size_)); } else { @@ -298,7 +299,7 @@ private: int tileHeight, int tileWidth) { LocalTensor leftMatrix; - leftMatrix.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + leftMatrix.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); CopyND2NZOnTheFly( aMatrix, leftMatrix, curRow * MATMUL_MODULE(MatmulVar)->template GetBaseHeight(), curCol * MATMUL_MODULE(MatmulVar)->template GetBaseWidth(), tileHeight, tileWidth, diff --git a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h index 5af6a908..6051a398 100644 --- a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h @@ -34,6 +34,7 @@ class DataCopyWrapper { MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); public: __aicore__ inline DataCopyWrapper() = default; @@ -47,28 +48,28 @@ public: if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyA1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulVar)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulVar)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyB1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulVar)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulVar)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); #else if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyA1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulVar)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulVar)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyB1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulVar)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulVar)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); #endif } else { constexpr int32_t widthFactor = @@ -76,7 +77,7 @@ public: if (MATMUL_MODULE(MatmulVar)->template IsTranspose()) { if constexpr (IsCopyFromUB()) { LocalTensor src; - src.SetAddr(MATMUL_MODULE(MatmulVar)->GetLocalAddr()); + src.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); CopyTileToCubeFromUB( dst, src, curCol, curRow, tileWidth, tileHeight / widthFactor, MATMUL_MODULE(MatmulVar)->template GetBaseHeight(), @@ -86,7 +87,7 @@ public: MATMUL_MODULE(MatmulVar)->template IsKRowDirec()); } else { GlobalTensor src; - src.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()); + src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()); CopyTileToCubeFromGM(dst, src, curCol, curRow, tileWidth, tileHeight / widthFactor, MATMUL_MODULE(MatmulVar)->template GetBaseHeight(), MATMUL_MODULE(MatmulVar)->template GetBaseWidth(), @@ -108,7 +109,7 @@ public: MATMUL_MODULE(MatmulVar)->template IsKRowDirec()); } else { GlobalTensor src; - src.SetGlobalBuffer(MATMUL_MODULE(MatmulVar)->template GetGlobalAddr()); + src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()); CopyTileToCubeFromGM( dst, src, curRow, curCol, tileHeight, tileWidth / widthFactor, MATMUL_MODULE(MatmulVar)->template GetBaseHeight(), diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 4dc14ba3..4ad25722 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -62,6 +62,11 @@ class MatmulImpl , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarA) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarC) +, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInA) +, MATMUL_IMPORT_MODULE_PRIVATE(BatchCopyCubeInB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoA) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo) #if __CCE_AICORE__ == 220 || __CCE_AICORE__ == 300 || __CCE_AICORE__ == 200 , MatmulMacroImpl, A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, GetMatmulVersion(MM_CFG)>::PARAMS @@ -251,6 +256,13 @@ public: MATMUL_ALLOW_USING_PRIVATE(MatmulVarA); MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); MATMUL_ALLOW_USING_PRIVATE(MatmulVarC); + MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInA); + MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInB); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoA); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo); + MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInA); + MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInB); template using CubeInBuffer = typename AscendC::Conditional::type; template @@ -272,6 +284,17 @@ private: using IMPL = MatmulImpl; MATMUL_USE_MODULE(CopyCubeInA); MATMUL_USE_MODULE(CopyCubeInB); + MATMUL_USE_MODULE(BatchCopyCubeInA); + MATMUL_USE_MODULE(BatchCopyCubeInB); + + using ChosenCopyCubeInA = typename AscendC::Conditional() != CopyCubeInType::BMM, + CopyCubeInA, BatchCopyCubeInA>::type; + + using ChosenCopyCubeInB = typename AscendC::Conditional() != CopyCubeInType::BMM, + CopyCubeInB, BatchCopyCubeInB>::type; + + MATMUL_USE_MODULE(ChosenCopyCubeInA); + MATMUL_USE_MODULE(ChosenCopyCubeInB); private: template using MatmulVar = typename AscendC::Conditional::type; template + using MatmulTensorInfo = typename AscendC::Conditional::type; + template using DataCopyUtils = typename AscendC::Conditional::type; using CallBack = MM_CB; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp index cc3ca7ac..214083eb 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp @@ -102,7 +102,11 @@ class MatmulImpl , MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsA) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarA) , MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) { +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoA) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo) +{ MATMUL_ALLOW_USING(CopyCubeInA); MATMUL_ALLOW_USING(CopyCubeInB); MATMUL_ALLOW_USING(CubeInBufferA); @@ -111,6 +115,9 @@ class MatmulImpl MATMUL_ALLOW_USING_PRIVATE(MatmulVarA); MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsB); MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoA); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo); using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -127,6 +134,8 @@ public: template using MatmulVar = typename AscendC::Conditional::type; using CallBack = MM_CB; + template + using MatmulTensorInfo = typename AscendC::Conditional::type; MATMUL_USE_MODULE(CopyCubeInA); MATMUL_USE_MODULE(CopyCubeInB); diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp index 44c7ac40..a7f58863 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp @@ -103,7 +103,11 @@ class MatmulImpl , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarA) , MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsB) , MATMUL_IMPORT_MODULE_PRIVATE(DataCopyUtilsB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) { +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoA) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo) +{ MATMUL_ALLOW_USING(CopyCubeInA); MATMUL_ALLOW_USING(CubeInBufferA); MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsA); @@ -113,6 +117,9 @@ class MatmulImpl MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsB); MATMUL_ALLOW_USING_PRIVATE(DataCopyUtilsB); MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoA); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo); using SrcT = typename A_TYPE::T; @@ -128,6 +135,8 @@ public: template using MatmulVar = typename AscendC::Conditional::type; template + using MatmulTensorInfo = typename AscendC::Conditional::type; + template using DataCopyUtils = typename AscendC::Conditional::type; using CallBack = MM_CB; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp index 930b5ffd..682e6c85 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp @@ -102,7 +102,11 @@ class MatmulImpl , MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsA) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarA) , MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) { +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoA) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo) +{ MATMUL_ALLOW_USING(CopyCubeInA); MATMUL_ALLOW_USING(CopyCubeInB); MATMUL_ALLOW_USING(CubeInBufferA); @@ -111,6 +115,9 @@ class MatmulImpl MATMUL_ALLOW_USING_PRIVATE(MatmulVarA); MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsB); MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoA); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo); using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -127,6 +134,8 @@ public: template using MatmulVar = typename AscendC::Conditional::type; using CallBack = MM_CB; + template + using MatmulTensorInfo = typename AscendC::Conditional::type; MATMUL_USE_MODULE(CopyCubeInA); MATMUL_USE_MODULE(CopyCubeInB); diff --git a/tests/matmul/test_matmul_var.cpp b/tests/matmul/test_matmul_var.cpp index 89b56ef4..e36c73b0 100644 --- a/tests/matmul/test_matmul_var.cpp +++ b/tests/matmul/test_matmul_var.cpp @@ -11,6 +11,7 @@ #include "impl/matmul/modules/matmul_policy.h" #include "impl/matmul/modules/matmul_private_modules.h" #define private public +#include "impl/matmul/modules/matmul_tensor_info.h" #include "impl/matmul/modules/matmul_var.h" using namespace std; @@ -20,8 +21,12 @@ using namespace matmul; namespace { template -class MatmulImpl : MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) { +class MatmulImpl +: MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) +{ MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); + MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); using SrcT = typename A_TYPE::T; public: using VAR_PARAMS = -- Gitee From e1c46747c934e53803728386e05e37643b906845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B3=BD=E4=B8=9C?= Date: Wed, 13 Nov 2024 03:03:04 +0000 Subject: [PATCH 2/2] fix error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 姜泽东 --- lib/matmul/matmul.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 4ad25722..09df3814 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -276,6 +276,8 @@ public: template using MatmulVar = typename AscendC::Conditional::type; template + using MatmulTensorInfo = typename AscendC::Conditional::type; + template using DataCopyUtils = typename AscendC::Conditional::type; using CallBack = MM_CB; -- Gitee