diff --git a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h index 07772ac8ee75b1a5842120e329529f87d7bfd081..8c0ae687f5ed2222028c46db406a553fadcdc6e0 100644 --- a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h +++ b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h @@ -32,7 +32,7 @@ bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmul cubeTiling->SetShape(M, N, K); cubeTiling->SetOrgShape(M, N, K); cubeTiling->SetFixSplit(baseM, baseN, baseK); - cubeTiling->SetBias(isBias); + cubeTiling->EnableBias(isBias); cubeTiling->SetBufferSpace(-1, -1, -1); if (cubeTiling->GetTiling(tiling) == -1) { return false; diff --git a/examples/matrix/batch_matmul/host_tiling/batch_matmul_custom_tiling.h b/examples/matrix/batch_matmul/host_tiling/batch_matmul_custom_tiling.h index 3ac3bd681234cc1ff0438e9d9c3690ed7b011ebd..8f6497da21a787dcdc7b8951bcc569ba276fbced 100644 --- a/examples/matrix/batch_matmul/host_tiling/batch_matmul_custom_tiling.h +++ b/examples/matrix/batch_matmul/host_tiling/batch_matmul_custom_tiling.h @@ -36,7 +36,7 @@ bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmul cubeTiling->SetShape(M, N, K); cubeTiling->SetOrgShape(M, N, K); cubeTiling->SetFixSplit(baseM, baseN, -1); - cubeTiling->SetBias(isBias); + cubeTiling->EnableBias(isBias); cubeTiling->SetBufferSpace(-1, -1, -1); constexpr int32_t A_BNUM = 2; diff --git a/examples/matrix/matmul/host_tiling/matmul_custom_tiling.h b/examples/matrix/matmul/host_tiling/matmul_custom_tiling.h index 89bcba1a6d63a8c96f1fa20d438e64f2210403c0..bf12d8517f0b0da379b18a69c148409d7447442d 100644 --- a/examples/matrix/matmul/host_tiling/matmul_custom_tiling.h +++ b/examples/matrix/matmul/host_tiling/matmul_custom_tiling.h @@ -34,7 +34,7 @@ bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmul cubeTiling->SetShape(M, N, K); cubeTiling->SetOrgShape(M, N, K); cubeTiling->SetFixSplit(baseM, baseN, -1); - cubeTiling->SetBias(isBias); + cubeTiling->EnableBias(isBias); cubeTiling->SetBufferSpace(-1, -1, -1); if (cubeTiling->GetTiling(tiling) == -1) { return false; diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 09d44c628e464b2ec1a91422f074e1733c7589ea..589586889fd576abc05f9a432997da0aaed18355 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -2350,11 +2350,18 @@ __aicore__ inline void MatmulImpl -__aicore__ inline void MatmulImpl::ClearBias() +__aicore__ inline void MatmulImpl::DisableBias() { var.enableBias_ = false; } +template +__aicore__ inline void MatmulImpl::ClearBias() +{ + DisableBias(); +} + template template diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index dce3bbccb5b2293677216b03330361b488b03216..c5c72a24ee03796aa52df62738f3ce7661be312c 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -381,7 +381,7 @@ public: mul.SetBias(biasGlobal); } } else if (msg->body.setClearBias) { - mul.ClearBias(); + mul.DisableBias(); } } @@ -399,7 +399,7 @@ public: mul.SetBias(biasGlobal); } } else if (msg->body.setClearBias) { - mul.ClearBias(); + mul.DisableBias(); } } @@ -965,6 +965,7 @@ public: __aicore__ inline void SetBias(const LocalTensor& inputBias){}; __aicore__ inline void SetTensorA(SrcAT aScalar){}; __aicore__ inline void SetTensorB(SrcBT bScalar){}; + __aicore__ inline void DisableBias(){}; __aicore__ inline void ClearBias(){}; __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} @@ -1014,8 +1015,15 @@ public: bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) {}; template + __aicore__ inline GlobalTensor GetBatchTensorC(uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) {}; + template __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; template + __aicore__ inline void GetBatchTensorC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, uint32_t dstGap = 0) {}; diff --git a/impl/matmul/matmul_tiling_base.cpp b/impl/matmul/matmul_tiling_base.cpp index 07d197bbd4ab6c7c7de8243d9ab794c4ff5b1efe..2922ac2749c306866b0ba383644554998613de81 100644 --- a/impl/matmul/matmul_tiling_base.cpp +++ b/impl/matmul/matmul_tiling_base.cpp @@ -427,13 +427,18 @@ int32_t MatmulApiTilingBase::SetShape(int32_t m, int32_t n, int32_t k) return 0; } -int32_t MatmulApiTilingBase::SetBias(bool isBiasIn) +int32_t MatmulApiTilingBase::EnableBias(bool isBiasIn) { TILING_LOG_DEBUG(" Set bias: %d", static_cast(isBiasIn)); this->isBias = isBiasIn; return 0; } +int32_t MatmulApiTilingBase::SetBias(bool isBiasIn) +{ + return EnableBias(isBiasIn); +} + int32_t MatmulApiTilingBase::SetFixSplit(int32_t baseMIn, int32_t baseNIn, int32_t baseKIn) { TILING_LOG_DEBUG(" Set fixed split baseM: %d", baseMIn); diff --git a/lib/matmul/bmm_tiling.h b/lib/matmul/bmm_tiling.h index 594ca4488cb4c16f44f09a38a475cc7f24a9f910..7a1bccfd611823e1f8de9ef6156e3663e723e516 100644 --- a/lib/matmul/bmm_tiling.h +++ b/lib/matmul/bmm_tiling.h @@ -48,10 +48,14 @@ public: // Get the BlockDim used after multi core tiling. // It is carried by users to the kernel to control the service logic in the kernel. int32_t GetCoreNum(int32_t &dim, int32_t &mDim, int32_t &nDim); - void SetSplitK(bool flag) + void EnableMultiCoreSplitK(bool flag) { enableSplitK_ = flag; } + void SetSplitK(bool flag) + { + EnableMultiCoreSplitK(flag); + } protected: virtual int64_t Compute() override; }; diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 2f6f505f1a735f7801c4526ff6ed658de211b7e3..4eb04cf7f3ad60ada3bf703183e04e88110fe860 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -93,6 +93,7 @@ public: bool isTransposeB = false); __aicore__ inline void SetBias(const LocalTensor& inputBias); __aicore__ inline void SetBatchNum(int32_t batchA, int32_t batchB); + __aicore__ inline void DisableBias(); __aicore__ inline void ClearBias(); template __aicore__ inline bool Iterate(bool enPartialSum = false); template diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 3b529551b68ee7ecfe0e3708ea1119950bc4da0b..62d457ab9d166429fc84b6e53dcacb40f8a45d5e 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -286,11 +286,16 @@ public: kfcMsg_.body.setTensorB = 1; } - __aicore__ inline void ClearBias() + __aicore__ inline void DisableBias() { kfcMsg_.body.setClearBias = 1; } + __aicore__ inline void ClearBias() + { + DisableBias(); + } + __aicore__ inline void End() { if (isSyncGetC) { @@ -664,7 +669,7 @@ public: } template - __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) + __aicore__ inline GlobalTensor GetBatchTensorC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) { GlobalTensor global; if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { @@ -688,9 +693,15 @@ public: return global; } + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) + { + return GetBatchTensorC(batchA, batchB, enSequentialWrite); + } + // coordinated use with IterateNBatch,get single IterateBatch outcome template - __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + __aicore__ inline void GetBatchTensorC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) { if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { @@ -714,6 +725,13 @@ public: TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); } + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) + { + GetBatchTensorC(c, batchA, batchB, enSequentialWrite); + } + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c) { TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index a724fdad9a51facd681d1a25ddbadfe0b844815e..12c5f1f1d6a8a83c3d3017ad8dfe52e883e63dd2 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -206,6 +206,7 @@ public: int32_t SetCLayout(int32_t b, int32_t s, int32_t n, int32_t g, int32_t d); int32_t SetBatchInfoForNormal(int32_t batchA, int32_t batchB, int32_t m, int32_t n, int32_t k); int32_t SetBatchNum(int32_t batch); + int32_t EnableBias(bool isBiasIn = false); int32_t SetBias(bool isBiasIn = false); int32_t SetFixSplit(int32_t baseMIn = -1, int32_t baseNIn = -1, int32_t baseKIn = -1); int32_t SetBufferSpace(int32_t l1Size = -1, int32_t l0CSize = -1, int32_t ubSize = -1, int32_t btSize = -1);