From 52305cc3ec465dd2952da0768d3af518ee2d68cf Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 14 Aug 2025 14:33:40 +0800 Subject: [PATCH 001/414] Init --- comm/lcal/src/CmakeLists.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 comm/lcal/src/CmakeLists.txt diff --git a/comm/lcal/src/CmakeLists.txt b/comm/lcal/src/CmakeLists.txt new file mode 100644 index 00000000..e69de29b -- Gitee From 0c0f753862fe69b17c1ccd193eac6cb737ad3750 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 11:17:42 +0800 Subject: [PATCH 002/414] allreduce_big_data --- .../src/ascendc_kernels/allreduce_big_data.h | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 comm/lcal/src/ascendc_kernels/allreduce_big_data.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h new file mode 100644 index 00000000..24fe9fde --- /dev/null +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + #ifndef LCCL_ALLREDUCE_BIG_DATA_H + #define LCCL_ALLREDUCE_BIG_DATA_H + + #include "all_reduce_quant.h" + #include "sync_collectives.h" + #include "ipc_queue.h" + using namespace AscendC; + + template + class AllReduceBigData : public AllReduceQuant { + constexpr static int QUEUE_DEPTH = 4; + constexpr static int T oneCast = (T) 1; + +public: + FORCE_INLINE_AICORE AllReduceBigData(int rank, int rankSize, uint32_t extraFlag) + : AllReduceQuant(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + if constexpr(!std::is_same_v) { + BuildScaleOffset(scale, scaleCount, offset); + } + + if (blockIdx >- PING_PONG_SIZE * rankSize) { + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + return; + } + + perStepBlockNum = rankSize; + + __gm__ CommArgs *localArgs = reinterpret_cast<__gm__ CommArgs *>(commArgs); + int globalRankSize = localArgs->rankSize <= 0 ? rankSize : localArgs->rankSize; + int localRankSize = localArgs->localRankSize <= 0 ? 
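+ // Fall back to the communicator-wide rankSize when the host has not
+ // populated localRankSize (values <= 0 mean "unset").
+ // The size arithmetic just below floors the IPC buffer down to a
+ // multiple of scaleNum * sizeof(T) * QUEUE_DEPTH per rank before
+ // scaling back up. Illustrative numbers (not from this patch):
+ // with a 100 MiB buffer, 8 ranks on one server, fp16, scaleNum = 1:
+ // 104857600 / 8 / 4 / 2 = 1638400, and 1638400 * 4 * 2 * 8 = 104857600,
+ // i.e. the buffer is already aligned and is used in full.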
rankSize : localArgs->localRankSize; + int serverNum = globalRankSize / localRankSize; + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) + / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; + curBlockSize = ipcBuffMaxSizeAligned / localRankSize / QUEUE_DEPTH; + curBlockNum = curBlockSize / sizeof(T); + atomOp = op; + int64_t perQueSize = ipcBuffMaxSizeAligned / localRankSize; + int64_t perQueNum = perQueSize / sizeof(T); + + for (int i = 0; i < rankSize; ++i) { + rankList[i] = i; + coreIdxList[i] = rankSize + blockIdx % perStepBlockNum; + } + peerRank = blockIdx % perStepBlockNum; + perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; + + peerRank = blockIdx % perStepBlockNum; + perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; + + curRankDatNum = perRankDataNum; + if (blockIdx % perStepBlockNum == rankSize - 1) { + curRankDatNum = len - (rankSize - 1) * perRankDataNum; + } + + pullRankDataNum = (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; + + inputBuffOffsetNum = blockIdx % rankSize * perRankDataNum; + + inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDatNum); + + outputBuffOffsetNum = peerRank * perRankDataNum; + + outputGt.SetGlobalBuffer((__gm__ U*)output + outputBuffOffsetNum, curRankDatNum); + + inputIpcGtOffsetNum = perQueSize * (blockIdx % perStepBlockNum); + + if (blockIdx / perStepBlockNum == 0) { + inputQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlocknum); + } else { + srcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + rank * perQueSize, + perQueNum, curBlocknum); + dstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * perQueSize, + perQueNum, curBlocknum); + pullQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + peerRank * perQueSize, + perQueNum, curBlocknum); + } + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + if (blockIdx >= PING_PONG_SIZE * rankSize) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + return; + } + + if constexpr (!std::is_same_v) { + if (rankSize == 1 && blockIdx == 0) { + int64_t remain = curRankDataNum; + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t count = 0; + while (count < loopCount) { + int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; + Collectives::CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], + outputGt[count * curBlockNum], COPYONLY); + remain -= curBlockNum; + ++count; + } + } + if (rankSize == 1) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + return; + } + } + + if (blockIdx / perStepBlockNum == 0) { + Producer(); + } else { + Consumer(); + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + } +private: + FORCE_INLINE_AICORE void Producer() + { + int64_t remain = curRankDataNum; + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int count = 0; + while (count < loopCount) { + inputQue.DeQue(rankList, coreIdxList, rankSize); + GlobalTensor outputGm = inputQue.EnQue(); + int64_t copyNum = (remain < curBlockNum) ? 
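+ // The final block of this rank's slice is usually shorter than
+ // curBlockNum, so clamp the copy size to what actually remains.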
remain : curBlockNum; + if constexpr (std::is_same_v) { + Collectives::CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], + outputGm, COPYONLY); + } else { + if (blockIdx != rank) { + GlobalTensor outputGmTmp; + outputGmTmp.SetGlobalBuffer((__gm__ U*)outputGm.GetPhyAddr()); + Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], + outputGmTmp, COPYONLY); + } else { + CpGM2GMWithScale(copyNum, inputGt[count * curBlockNum], + outputGm, COPYONLY); + } + } + sync.SetInnerFlag(magic, count); + + remain = remain - curBlockNum; + count = count + 1; + } + } + + FORCE_INLINE_AICORE void Consumer() + { + int64_t atomLoopCount = CeilDiv(pullRankDataNum, curBlockNum); + int64_t atomRemain = pullRankDataNum; + int64_t remain = curRankDataNum; + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int count = 0; + while (count < loopCount || count < atomLoopCount) { + if (peerRank != rank && count != atomLoopCount) { + sync.WaitInnerFlag(magic, count, rank, rank); + sync.WaitInnerFlag(magic, count, peerRank, rank); + + GlobalTensor inputGm = srcQue.ReadFront(); + GlobalTensor outputGm = dstQue.EnQue(); + + int64_t atomCopyNum = (atomRemain < curBlockNum) ? atomRemain : curBlockNum; + if constexpr (std::is_same_v) { + Collectives::CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, + outputGm, atomOp); + } else { + GlobalTensor inputGmTmp; + inputGmTmp.SetGlobalBuffer((__gm__ U*)inputGm.GetPhyAddr()); + CpGM2GMWithScale(atomCopyNum, inputGmTmp, + outputGm, atomOp); + } + atomRemain = atomremain - curBlockNum; + } + sync.SetOuterFlag(magic, count); + if (count == loopCount) { + break; + } + sync.WaitOneRankPartOuterFlag(magic, count, peerRank, rankSize, rankSize); + if (!(extraFlag & ExtraFlag::RDMA)) { + GlobalTensor pullGm = pullQue.ReadFront(); + int64_t copyNum = (remain < curBlockNum) ? 
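+ // Pull phase: fetch the reduced block back from the peer's IPC
+ // queue into this rank's output slice, with the same tail clamp.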
remain : curBlockNum; + Collectives::CpGM2GMPingPong(copyNum * sizeof(T), pullGm, outputGt[count * curBlockNum], COPYONLY); + } + + sync.SetInnerFlag(magic, count); + remain = remain - curBlockNum; + count = count + 1; + + } + } + } +private: + GlobalTensor inputGt; + GlobalTensor outputGt; + + int atomOp; + + int64_t perRankDataNum; + int64_t curRankDataNum; + int64_t peerRank; + int64_t pullRankDataNum; + int64_t inputBuffOffsetNum; + int64_t outputBuffOffsetNum; + int64_t inputIpcGtOffsetNum; + int64_t curBlockSize; + int64_t perStepBlockNum; + int64_t curBlockNum; + + IpcQueue inputQue; + IpcQueue srcQue; + IpcQueue dstQue; + IpcQueue pullQue; + + int rankList[LCAL_MAX_RANK_SIZE]; + int coreIdxList[LCAL_MAX_RANK_SIZE]; + + GlobalTensor scaleGt; + int64_t scaleNum = 1; + T firstScale = 1; + T offset = 0; + bool isEnableScale = false; + bool isVectorScale = false; +}; + +#endif // LCCL_ALLREDUCE_BIG_DATA_H + + + + + -- Gitee From e91d8ddbdcf3bd3d4d2da450bc77096ec5aa9208 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 11:56:13 +0800 Subject: [PATCH 003/414] revise allreduce_big_data --- .../src/ascendc_kernels/allreduce_big_data.h | 84 +++++++++++-------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 24fe9fde..040214ae 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -11,15 +11,15 @@ #ifndef LCCL_ALLREDUCE_BIG_DATA_H #define LCCL_ALLREDUCE_BIG_DATA_H - #include "all_reduce_quant.h" + #include "allreduce_quant.h" #include "sync_collectives.h" #include "ipc_queue.h" using namespace AscendC; template - class AllReduceBigData : public AllReduceQuant { + class AllReduceBigData : protected AllReduceQuant { constexpr static int QUEUE_DEPTH = 4; - constexpr static int T oneCast = (T) 1; + constexpr static T oneCast = (T) 1; public: FORCE_INLINE_AICORE AllReduceBigData(int rank, int rankSize, uint32_t extraFlag) @@ -32,7 +32,7 @@ public: BuildScaleOffset(scale, scaleCount, offset); } - if (blockIdx >- PING_PONG_SIZE * rankSize) { + if (blockIdx >= PING_PONG_SIZE * rankSize) { DumpLcclLogInfo(LogId::INIT, static_cast(op)); return; } @@ -43,8 +43,7 @@ public: int globalRankSize = localArgs->rankSize <= 0 ? rankSize : localArgs->rankSize; int localRankSize = localArgs->localRankSize <= 0 ? 
rankSize : localArgs->localRankSize; int serverNum = globalRankSize / localRankSize; - int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) - / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; curBlockSize = ipcBuffMaxSizeAligned / localRankSize / QUEUE_DEPTH; curBlockNum = curBlockSize / sizeof(T); atomOp = op; @@ -55,48 +54,46 @@ public: rankList[i] = i; coreIdxList[i] = rankSize + blockIdx % perStepBlockNum; } - peerRank = blockIdx % perStepBlockNum; - perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; peerRank = blockIdx % perStepBlockNum; perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; - curRankDatNum = perRankDataNum; + curRankDataNum = perRankDataNum; if (blockIdx % perStepBlockNum == rankSize - 1) { - curRankDatNum = len - (rankSize - 1) * perRankDataNum; + curRankDataNum = len - (rankSize - 1) * perRankDataNum; } pullRankDataNum = (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; inputBuffOffsetNum = blockIdx % rankSize * perRankDataNum; - inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDatNum); + inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDataNum); outputBuffOffsetNum = peerRank * perRankDataNum; - outputGt.SetGlobalBuffer((__gm__ U*)output + outputBuffOffsetNum, curRankDatNum); + outputGt.SetGlobalBuffer((__gm__ T*)output + outputBuffOffsetNum, curRankDataNum); inputIpcGtOffsetNum = perQueSize * (blockIdx % perStepBlockNum); if (blockIdx / perStepBlockNum == 0) { inputQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlocknum); + perQueNum, curBlockNum); } else { srcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + rank * perQueSize, - perQueNum, curBlocknum); + perQueNum, curBlockNum); dstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * perQueSize, - perQueNum, curBlocknum); + perQueNum, curBlockNum); pullQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + peerRank * perQueSize, - perQueNum, curBlocknum); + perQueNum, curBlockNum); } DumpLcclLogInfo(LogId::INIT, static_cast(op)); } FORCE_INLINE_AICORE void Process() { - DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); if (blockIdx >= PING_PONG_SIZE * rankSize) { - DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); return; } @@ -114,7 +111,7 @@ public: } } if (rankSize == 1) { - DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); return; } } @@ -124,30 +121,27 @@ public: } else { Consumer(); } - DumpLcclLogInfo(LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); } private: FORCE_INLINE_AICORE void Producer() { - int64_t remain = curRankDataNum; int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t remain = curRankDataNum; int count = 0; while (count < loopCount) { inputQue.DeQue(rankList, coreIdxList, rankSize); GlobalTensor outputGm = inputQue.EnQue(); int64_t copyNum = (remain < curBlockNum) ? 
remain : curBlockNum; if constexpr (std::is_same_v) { - Collectives::CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], - outputGm, COPYONLY); + Collectives::CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], outputGm, COPYONLY); } else { if (blockIdx != rank) { GlobalTensor outputGmTmp; outputGmTmp.SetGlobalBuffer((__gm__ U*)outputGm.GetPhyAddr()); - Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], - outputGmTmp, COPYONLY); + Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], outputGmTmp, COPYONLY); } else { - CpGM2GMWithScale(copyNum, inputGt[count * curBlockNum], - outputGm, COPYONLY); + CpGM2GMWithScale(copyNum, inputGt[count * curBlockNum], outputGm, COPYONLY); } } sync.SetInnerFlag(magic, count); @@ -161,8 +155,8 @@ private: { int64_t atomLoopCount = CeilDiv(pullRankDataNum, curBlockNum); int64_t atomRemain = pullRankDataNum; + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); int64_t remain = curRankDataNum; - int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); int count = 0; while (count < loopCount || count < atomLoopCount) { if (peerRank != rank && count != atomLoopCount) { @@ -174,15 +168,13 @@ private: int64_t atomCopyNum = (atomRemain < curBlockNum) ? atomRemain : curBlockNum; if constexpr (std::is_same_v) { - Collectives::CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, - outputGm, atomOp); + Collectives::CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); } else { GlobalTensor inputGmTmp; inputGmTmp.SetGlobalBuffer((__gm__ U*)inputGm.GetPhyAddr()); - CpGM2GMWithScale(atomCopyNum, inputGmTmp, - outputGm, atomOp); + CpGM2GMWithScale(atomCopyNum, inputGmTmp, outputGm, atomOp); } - atomRemain = atomremain - curBlockNum; + atomRemain = atomRemain - curBlockNum; } sync.SetOuterFlag(magic, count); if (count == loopCount) { @@ -202,9 +194,33 @@ private: } } } + + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) + { + if (scale != nullptr && offset != nullptr) { + scaleGt.SetGlobalBuffer((__gm__ T*)scale); + this->firstScale = scaleGt.GetValue(0); + this->offset =* reinterpret_cast<__gm__ T*>(offset); + this->scaleNum = scaleCount < 1 ? 
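+ // Clamp scaleNum to at least 1: it is used as a divisor when the
+ // per-rank data counts are rounded to a multiple of the scale length.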
1 : scaleCount; + isVectorScale = scaleCount > 1; + isEnableScale = scaleCount > 0 && !(*(uint16_t *)(&(this->offset)) == 0 && + scaleCount == 1 && *(uint16_t *)(&firstScale) == *(uint16_t *)(&oneCast)); + } + } + + FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, int64_t atomOp) + { + if (isEnableScale) { + Collectives::CpGM2GMWithVectorScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); + } else if (!isVectorScale) { + CpGM2GMWithScalarScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, firstScale, offset); + } else { + CpGM2GMWithScalarScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scalseNum, offset); + } + } private: GlobalTensor inputGt; - GlobalTensor outputGt; + GlobalTensor outputGt; int atomOp; -- Gitee From b68a4708fed2b6606127af0d1c536eb872027db1 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 13:48:12 +0800 Subject: [PATCH 004/414] 2nd revise allreduce_big_data --- comm/lcal/src/ascendc_kernels/allreduce_big_data.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 040214ae..fbc1b4db 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -17,7 +17,7 @@ using namespace AscendC; template - class AllReduceBigData : protected AllReduceQuant { + class AllReduceBigData : protected AllReduceQuant { constexpr static int QUEUE_DEPTH = 4; constexpr static T oneCast = (T) 1; @@ -43,7 +43,8 @@ public: int globalRankSize = localArgs->rankSize <= 0 ? rankSize : localArgs->rankSize; int localRankSize = localArgs->localRankSize <= 0 ? rankSize : localArgs->localRankSize; int serverNum = globalRankSize / localRankSize; - int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / + QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; curBlockSize = ipcBuffMaxSizeAligned / localRankSize / QUEUE_DEPTH; curBlockNum = curBlockSize / sizeof(T); atomOp = op; @@ -139,7 +140,8 @@ private: if (blockIdx != rank) { GlobalTensor outputGmTmp; outputGmTmp.SetGlobalBuffer((__gm__ U*)outputGm.GetPhyAddr()); - Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], outputGmTmp, COPYONLY); + Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], outputGmTmp, + COPYONLY); } else { CpGM2GMWithScale(copyNum, inputGt[count * curBlockNum], outputGm, COPYONLY); } @@ -208,7 +210,7 @@ private: } } - FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, int64_t atomOp) + FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, int64_t atomOp) { if (isEnableScale) { Collectives::CpGM2GMWithVectorScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); -- Gitee From 1c947ba39782d325732e730861f32eda91824d4a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 13:51:17 +0800 Subject: [PATCH 005/414] 3rd revise allreduce_big_data --- .../src/ascendc_kernels/allreduce_big_data.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h 
b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index fbc1b4db..4d7b5665 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -191,9 +191,7 @@ private: sync.SetInnerFlag(magic, count); remain = remain - curBlockNum; - count = count + 1; - - } + count = count + 1; } } @@ -210,14 +208,16 @@ private: } } - FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, int64_t atomOp) + FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, + int64_t atomOp) { - if (isEnableScale) { - Collectives::CpGM2GMWithVectorScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); + if (!isEnableScale) { + Collectives::CpGM2GMPingpong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); } else if (!isVectorScale) { - CpGM2GMWithScalarScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, firstScale, offset); + CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, firstScale, offset); } else { - CpGM2GMWithScalarScale(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scalseNum, offset); + CpGM2GMPingpong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scaleNum, + offset); } } private: -- Gitee From 4abbba0dbbacb2c3e223a3fefd78ca1f643a6176 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 13:52:17 +0800 Subject: [PATCH 006/414] 4th allreduce_big_data --- comm/lcal/src/ascendc_kernels/allreduce_big_data.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 4d7b5665..8ff63353 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -212,11 +212,11 @@ private: int64_t atomOp) { if (!isEnableScale) { - Collectives::CpGM2GMPingpong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); + Collectives::CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); } else if (!isVectorScale) { CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, firstScale, offset); } else { - CpGM2GMPingpong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scaleNum, + CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scaleNum, offset); } } -- Gitee From 4b0e0151728a37bb234fb75be1e7b18c348068cf Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 15:37:42 +0800 Subject: [PATCH 007/414] lcal.h --- comm/lcal/include/lcal.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 comm/lcal/include/lcal.h diff --git a/comm/lcal/include/lcal.h b/comm/lcal/include/lcal.h new file mode 100644 index 00000000..bc13ba18 --- /dev/null +++ b/comm/lcal/include/lcal.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_H +#define LCAL_H +#include "lcal_types.h" +#include "lcal_comm.h" +#include "lccl.h" +#include "lcoc.h" +#endif // LCAL_H \ No newline at end of file -- Gitee From 14d6e701374f9a79b4881444b716a15bd093609b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 15:40:27 +0800 Subject: [PATCH 008/414] comm_args.h --- comm/lcal/include/comm_args.h | 95 +++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 comm/lcal/include/comm_args.h diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h new file mode 100644 index 00000000..18a84514 --- /dev/null +++ b/comm/lcal/include/comm_args.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef LCCL_COMM_ARGS_H +#define LCCL_COMM_ARGS_H +#include + +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) +using GM_ADDR = uint8_t*; +#else +#define FORCE_INLINE_AICORE __attribute__((always_inline)) inline __aicore__ +#include "kernel_operator.h" +#endif +namespace Lcal { + +constexpr int LCAL_MAX_RANK_SIZE = 128; // lcal通信库最大支持的npu卡数 +constexpr int RANK_SIZE_TWO = 2; // 可用SIO的规模,以及是否需要跨卡搬运数据核的分界规模 +constexpr int64_t IPC_BUFF_MAX_SIZE = 100 * 1024 * 1024; +constexpr int64_t IPC_DATA_OFFSET = 2 * 1024 * 1024; // 前2MB作为flag标志位,之后100MB作为数据存储 +constexpr int64_t SYNC_FLAG_BIT_NUM = 10; // cce 算子在用 +constexpr int64_t MEM_DMA_UNIT_INT_NUM = 4; +constexpr int64_t EVENT_ID_MASK = 0xFFFFFFFF; +constexpr int64_t PING_PONG_SIZE = 2; +constexpr int64_t UB_SINGLE_DMA_SIZE_MAX = 190 * 1024; +constexpr int64_t SMALL_DATA_SIZE = 1 * 1024 * 1024; +constexpr int64_t UB_SINGLE_PING_PONG_ADD_SIZE_MAX = UB_SINGLE_DMA_SIZE_MAX / 2; +constexpr int UB_ALIGN_SIZE = 32; + +// 2step算法中,2个aiv真正用作数据预处理 +constexpr int64_t PRE_CORE_REAL_NUM = 2; + +constexpr int64_t AIV_PER_AICORE = 2; + +constexpr int DFX_COUNT = 50; + +constexpr int64_t HALF_NUM = 2; + +constexpr int64_t THREE_NUM = 3; + +constexpr int64_t FOUR_NUM = 4; + +constexpr int64_t VADD_MAX_REPEAT = 255; +constexpr int64_t VADD_UNIT_BYTE = 256; + +// vadd单位粒度是256B,vadd最大repeat次数为255,两个相乘的结果 +constexpr int64_t MAX_VADD_SIZE = VADD_MAX_REPEAT * VADD_UNIT_BYTE; +constexpr int64_t BLOCK_UNIT_BYTE = 32; +constexpr int64_t VADD_UNIT_TO_BLOCK_UNIT_RATIO = VADD_UNIT_BYTE / BLOCK_UNIT_BYTE; // 8 + +constexpr bool ATOMIC_ENABLE = false; + +enum Op : int { + COPYONLY = -1, + ADD = 0, + MUL = 1, + MAX = 2, + MIN = 3 +}; + +struct ExtraFlag { + static constexpr uint32_t RDMA = 1; + static constexpr uint32_t TOPO_910B2C = 1 << 1; + static constexpr uint32_t TOPO_910_93 = 1 << 2; + static constexpr uint32_t DETERMINISTIC = 1 << 3; + static constexpr uint32_t QUANT_FP16 = 1 << 4; + static constexpr uint32_t QUANT_FP32 = 1 << 5; + static constexpr uint32_t TOPO_910A5 = 1 << 6; + static constexpr uint32_t QUANT_DELAY = 1 << 7; + static constexpr uint32_t QUANT_CURRENT = 1 << 8; + static constexpr uint32_t TOPO_PCIE = 1 << 9; +}; + +struct CommArgs { + int rank = 0; // attr rank_id, 
global rank + int localRank = -1; + int rankSize = 0; // global rank size + int localRankSize = -1; // 此参数是指fullmesh互联的卡数 + uint32_t extraFlag = 0; // 32 bit map,具体每一位的含义就在此文件正上方 + GM_ADDR peerMems[LCAL_MAX_RANK_SIZE] = {}; // 传入初始化获得的buff,所有allreduce都是同一个参数 + /** + * @param sendCountMatrix 大小是rankSize*rankSize的一维数组 + * eg: sendCountMatrix[1] 的数值,对应二维数组的[0][1],表示 卡0 要给 卡1 发送的数据个数 + */ + int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE] = {}; // for all2allv + int64_t dfx[DFX_COUNT] = {}; +}; +} +#endif // LCCL_COMM_ARGS_H \ No newline at end of file -- Gitee From f13a1ed78927e519bdca1b95c8dda2a476d8c01c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 15:54:15 +0800 Subject: [PATCH 009/414] 2nd comm_args.h --- comm/lcal/include/comm_args.h | 40 +++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index 18a84514..6afb7b7e 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -12,7 +12,7 @@ #define LCCL_COMM_ARGS_H #include -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_C910__) && !defined(__DAV_220_CUBE__) using GM_ADDR = uint8_t*; #else #define FORCE_INLINE_AICORE __attribute__((always_inline)) inline __aicore__ @@ -56,6 +56,7 @@ constexpr int64_t VADD_UNIT_TO_BLOCK_UNIT_RATIO = VADD_UNIT_BYTE / BLOCK_UNIT_BY constexpr bool ATOMIC_ENABLE = false; +constexpr int32_t LCCL_DUMP_UNIT_SIZE = 1 * 1024 * 1024; enum Op : int { COPYONLY = -1, ADD = 0, @@ -75,6 +76,7 @@ struct ExtraFlag { static constexpr uint32_t QUANT_DELAY = 1 << 7; static constexpr uint32_t QUANT_CURRENT = 1 << 8; static constexpr uint32_t TOPO_PCIE = 1 << 9; + static constexpr uint32_t IS_GATHER_THAN_40_AIV = 1 << 16; }; struct CommArgs { @@ -90,6 +92,40 @@ struct CommArgs { */ int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE] = {}; // for all2allv int64_t dfx[DFX_COUNT] = {}; + GM_ADDR dumpAddr = nullptr; + int32_t magics[LCAL_MAX_RANK_SIZE] = {0}; + uint64_t fftsVal = 0; }; -} + +struct LcclDumpBlockInfo { + uint32_t len = 0; + uint32_t core = 0; + uint32_t blockNum = 0; + uint32_t dumpOffset = 0; + uint32_t magic = 0; + uint32_t rsv = 0; + uint64_t dumpAddr = 0; +}; + +struct LcclDumpInfo { + uint32_t logId = 0; + uint32_t blockId = 0; + uint64_t syscyc = 0; + uint64_t curPc = 0; + uint32_t operationType = 0; + uint32_t rsv = 0; +}; + +union LcclDumpUnion { + LcclDumpBlockInfo blockInfo; + LcclDumpInfo logInfo; +}; + +enum LogId : int { + OVERALL = 0, + INIT, + PROCESS +}; + +} // namespace Lcal #endif // LCCL_COMM_ARGS_H \ No newline at end of file -- Gitee From fdc53ab65fadc72c2647b1a7aa37eb153f17b38d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 15:59:12 +0800 Subject: [PATCH 010/414] 2nd comm_args.h --- comm/lcal/include/comm_args.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index 6afb7b7e..bccedf1b 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -12,7 +12,7 @@ #define LCCL_COMM_ARGS_H #include -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_C910__) && !defined(__DAV_220_CUBE__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_220_CUBE__) using GM_ADDR = uint8_t*; #else #define FORCE_INLINE_AICORE __attribute__((always_inline)) inline __aicore__ @@ -56,7 +56,7 
@@ constexpr int64_t VADD_UNIT_TO_BLOCK_UNIT_RATIO = VADD_UNIT_BYTE / BLOCK_UNIT_BY constexpr bool ATOMIC_ENABLE = false; -constexpr int32_t LCCL_DUMP_UNIT_SIZE = 1 * 1024 * 1024; +constexpr int32_t LCCL_DUMP_UINT_SIZE = 1 * 1024 * 1024; enum Op : int { COPYONLY = -1, ADD = 0, @@ -76,7 +76,7 @@ struct ExtraFlag { static constexpr uint32_t QUANT_DELAY = 1 << 7; static constexpr uint32_t QUANT_CURRENT = 1 << 8; static constexpr uint32_t TOPO_PCIE = 1 << 9; - static constexpr uint32_t IS_GATHER_THAN_40_AIV = 1 << 16; + static constexpr uint32_t IS_GREATER_THAN_40_AIV = 1 << 16; }; struct CommArgs { @@ -107,7 +107,7 @@ struct LcclDumpBlockInfo { uint64_t dumpAddr = 0; }; -struct LcclDumpInfo { +struct LcclDumpLogInfo { uint32_t logId = 0; uint32_t blockId = 0; uint64_t syscyc = 0; @@ -118,7 +118,7 @@ struct LcclDumpInfo { union LcclDumpUnion { LcclDumpBlockInfo blockInfo; - LcclDumpInfo logInfo; + LcclDumpLogInfo logInfo; }; enum LogId : int { -- Gitee From d174793309aaadb69be7d1f69bdbc527c98b01e3 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:00:07 +0800 Subject: [PATCH 011/414] 3rd comm_args.h --- comm/lcal/include/comm_args.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index bccedf1b..c8be9c47 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -12,7 +12,7 @@ #define LCCL_COMM_ARGS_H #include -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_220_CUBE__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_C220_CUBE__) using GM_ADDR = uint8_t*; #else #define FORCE_INLINE_AICORE __attribute__((always_inline)) inline __aicore__ -- Gitee From 27e678e412ff018f8922a2ab3381cd23180d7493 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:01:37 +0800 Subject: [PATCH 012/414] 1st lcal_api.h --- comm/lcal/include/lcal_api.h | 64 ++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 comm/lcal/include/lcal_api.h diff --git a/comm/lcal/include/lcal_api.h b/comm/lcal/include/lcal_api.h new file mode 100644 index 00000000..582c9514 --- /dev/null +++ b/comm/lcal/include/lcal_api.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_API_H +#define LCAL_API_H + +#include +#include +#include +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef void *LcalCommPtr; +#define LCAL_UNIQUE_ID_BYTES 128 +typedef struct { char internal[LCAL_UNIQUE_ID_BYTES]; } LcalUniqueId; + +int LcalGetUniqueId(LcalUniqueId *uniqueId, int commDomain); + +int LcalCommInitRankLocal(int rankSize, int rank, LcalCommPtr *comm); + +int LcalCommInitRank(LcalUniqueId commId, int rankSize, int rank, LcalCommPtr *comm); + +int LcalCommInitRankWithCustDomainSize(int commDomain, int bufferSize, int rankSize, int rank, LcalCommPtr *comm, + const bool isEnableAutoMagicNum = false); + +int LcalCommInitRankWithDomain(int commDomain, int rankSize, int rank, LcalCommPtr *comm); + +int LcalGetCommArgsDev(LcalCommPtr comm, GM_ADDR &commArgsPtr); + +int LcalGetCommArgsHost(LcalCommPtr comm, Lcal::CommArgs *&commArgsPtr); + +void LcalPrintDFX2Log(LcalCommPtr comm); + +int LcalCommInit(int rank, int rankSize, LcalCommPtr *comms); + +int LcalCommInitAll(uint32_t ndev, int32_t* devices, LcalCommPtr *comms); + +int LcalCommInitThread(int rank, int rankSize, const char *uid, LcalCommPtr *comms); + +int LcclAllReduce(void *sendBuf, void *recvBuf, int64_t count, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream); + +int LcclAllGather(void *sendBuf, void *recvBuf, int64_t sendCount, HcclDataType dataType, LcalCommPtr comm, + aclrtStream stream); + +int LcclReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream); + +int LcclBroadcast(void *buf, int64_t count, HcclDataType dataType, int root, LcalCommPtr comm, + aclrtStream stream); + +int LcclCommDestroy(LcalCommPtr comm); + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // LCAL_API_H \ No newline at end of file -- Gitee From 8516ccdbf07d35f351ac3a40fe9f0c962bc4e5b3 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:08:21 +0800 Subject: [PATCH 013/414] 1st lcal_comm.h --- comm/lcal/include/lcal_comm.h | 96 +++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 comm/lcal/include/lcal_comm.h diff --git a/comm/lcal/include/lcal_comm.h b/comm/lcal/include/lcal_comm.h new file mode 100644 index 00000000..6ec0fbd7 --- /dev/null +++ b/comm/lcal/include/lcal_comm.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_COMM_H +#define LCAL_COMM_H + +#include +#include + +#include +#include "lcal_types.h" +#include "lcal_api.h" +#include "comm_args.h" + +namespace Lcal { +constexpr int IPC_NAME_SIZE = 65; +constexpr int SINGLE_MACHINE_910B2C_RANK_SIZE = 16; + +class LcalSockExchange; +class LcalComm { +public: + LcalComm(int rank, int rankSize); + LcalComm(int rank, int rankSize, int bufferSize); + LcalComm(int rank, int rankSize, int commDomain, int bufferSize, int isEnableMagic); + LcalComm(int rank, int rankSize, LcalUniqueId commId); + ~LcalComm(); + LcalComm(const LcalComm &) = delete; + LcalComm &operator=(const LcalComm &) = delete; + int Init(); + int InitThread(const std::string &uid = "default"); + int GetRank() const; + int GetRankSize() const; + int GetCommSize() const; + int GetBufferSize() const; + const PhysicalInfo &GetPhysicalInfo() const; + GM_ADDR GetCommArgsPtr() const; + CommArgs* GetCommArgs(); + std::string PrintDFX(); + friend class Lccl; + friend class Lcoc; + friend class LcclTest; + +private: + int SetMemoryName(std::string &name); + int SetIpcPidSdid(std::string &name, const uint32_t *pids, const int64_t *sdids) const; + int OpenIpcMem(const char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]); + int GetDev(); + int GetDevThread(const std::string &uid = ""); + int EnablePeerAccess(); + int InitCommMem(); + int InitCommon(); + void CloseIpcMem(); + void FreePeerMem(GM_ADDR &mem) const; + int InitMem(); + int GetSidId(int64_t sdids[LCAL_MAX_RANK_SIZE], int rankSize); + int GetPid(uint32_t *pids); + int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; + int SyncCommArgs(); + int InitDumpAddr(); + +private: + int rank_ = 0; // global rank id + int rankSize_ = 0; // global rank size + int commSize_ = 0; // local LcalComm size + int localRank_ = -1; + int localRankSize_ = -1; + int devId_ = 0; + int64_t magic_ = 1; + bool inited_ = false; + bool ipcMemInited_ = false; + std::string uid_ = {}; + std::vector devList_ = {}; + std::vector rankList_ = {}; + int commDomain_ = {}; + int bufferSize_ = LCAL_COMM_BUFFER_SIZE; + + // shared ping pong buff,这个地址就是一开始申请在HBM上的,所以host上可以取到,但不能直接修改。 + GM_ADDR peerMem_[LCAL_MAX_RANK_SIZE] = {}; + PhysicalInfo physicalInfo_ = {}; + CommArgs commArgs_ = {}; // host侧 + GM_ADDR commArgsPtr_ = nullptr; // device侧 + LcalUniqueId commId_ = {}; + LcalSockExchange *socketExchange_ = nullptr; + bool deterministic_ = false; + bool isEnableMsprofOp_ = false; + bool isEnableMix_ = false; +}; +} // Lcal + +#endif // LCAL_COMM_H \ No newline at end of file -- Gitee From b548c708ea52440576df8247edcc18080d6e47ad Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:12:01 +0800 Subject: [PATCH 014/414] 1st lcal_types.h --- comm/lcal/include/lcal_types.h | 117 +++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 comm/lcal/include/lcal_types.h diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h new file mode 100644 index 00000000..59a3ec88 --- /dev/null +++ b/comm/lcal/include/lcal_types.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef LCAL_TYPES_H +#define LCAL_TYPES_H + +#include +#include +#include + +namespace Lcal { +constexpr int LCAL_SUCCESS = 0; +constexpr int LCAL_ERROR_NOT_INITIALIZED = -1; +constexpr int LCAL_ERROR_MKIRT = -2; +constexpr int LCAL_ERROR_PARA_CHECK_FAIL = -3; +constexpr int LCAL_ERROR_INTERNAL = -4; +constexpr int LCAL_ERROR_TIMEOUT = -5; +constexpr int LCAL_ERROR_NOT_FOUND = -7; +constexpr int OUT_OF_DEVICE_MEMORY = -8; +constexpr int64_t LCAL_INVALID_VALUE = -1; + +// shared buffer size,这里要和collectives.cce文件中的常量联动修改!!! +constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; +constexpr int LCAL_COMM_BUFFER_SIZE = 200; // 单位MB + +enum class ChipName { + CHIP_310P3 = 0, + CHIP_910B1, + CHIP_910B2, + CHIP_910B3, + CHIP_910B4, + CHIP_910B41, + CHIP_910B2C, + CHIP_910_9391, + CHIP_910_9381, + CHIP_910_9392, + CHIP_910_9382, + CHIP_910_9372, + CHIP_910_9361, + CHIP_910_9362, + CHIP_910A5, + RESERVED, +}; + +enum class PhysicalLink { + HCCS = 0, + PCIE = 1, + RESERVED, +}; + +// 包含 物理链路、芯片名称 信息。 +struct PhysicalInfo { + ChipName chipName = ChipName::RESERVED; + PhysicalLink physicalLink = PhysicalLink::RESERVED; + uint32_t coreNum = 0; +}; + +enum class LcalType { + ALL_REDUCE = 1, + REDUCE_SCATTER = 2, + ALL_GATHER = 3, + BROADCAST = 4, + ALL2ALL = 5, + ALL_REDUCE_910B2C = 6, + ALL_GATHER_910B2C = 7, + LOCAL_REDUCE = 8, + SEND = 9, + RECV = 10, + ALL2ALL_V_C = 11, + GATHER = 12, + PURE_MATMUL = 101, + MATMUL_ALL_REDUCE = 102, + MATMUL_REDUCE_SCATTER = 103, + ALL_GATHER_MATMUL = 104, + ALL_GATHER_MATMUL_V2 = 105, + ALL2ALL_MATMUL = 106, + MATMUL_ALL2ALL = 107, + ALL_GATHER_MATMUL_REDUCE_SCATTER = 111, + BANDWIDTH = 201, + ALLTOALLV_ALLGATHER_MATMUL = 305, + ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN = 309, + MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN = 310, + LCAL_TYPE_MAX = 311, +}; + +const std::map LCAL_TYPE2NAME = { + { LcalType::ALL_REDUCE, "LcalAllReduce" }, + { LcalType::REDUCE_SCATTER, "LcalReduceScatter" }, + { LcalType::ALL_GATHER, "LcalAllGather" }, + { LcalType::BROADCAST, "LcalBroadcast" }, + { LcalType::PURE_MATMUL, "LcalPureMatmul" }, + { LcalType::MATMUL_ALL_REDUCE, "LcalMatmulAllReduce" }, + { LcalType::MATMUL_REDUCE_SCATTER, "LcalMatmulReduceScatter" }, + { LcalType::ALL_GATHER_MATMUL, "LcalAllGatherMatmul" }, + { LcalType::ALL_GATHER_MATMUL_V2, "LcalAllGatherMatmulV2" }, + { LcalType::ALL2ALL_MATMUL, "LcalAll2AllMatmul" }, + { LcalType::MATMUL_ALL2ALL, "LcalMatmulAll2All" }, + { LcalType::ALL2ALL, "LcalAll2All" }, + { LcalType::ALL2ALL_V_C, "LcalAll2AllVC" }, + { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER, "LcalAllGatherMatmulReduceScatter" }, + { LcalType::BANDWIDTH, "LcalBandwidthTest" }, + { LcalType::ALL_REDUCE_910B2C, "LcalAllReduce910B2C" }, + { LcalType::ALL_GATHER_910B2C, "LcalAllGather910B2C" }, + { LcalType::ALLTOALLV_ALLGATHER_MATMUL, "LcalAllToAllVAllGatherMatmul" }, + { LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN, "LcalAllToAllVAllGatherMatmulHidden" }, + { LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN, "LcalMatmulReduceScatterAllToAllVHidden" } +}; + + +} // namespace Lcal +#endif // LCAL_TYPES_H \ No newline at end of file -- Gitee From 2c2a897e3fc34823de2132f92617418c343a875b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:13:40 +0800 Subject: 
[PATCH 015/414] 2nd lcal_types.h --- comm/lcal/include/lcal_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index 59a3ec88..067e79e7 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -27,6 +27,7 @@ constexpr int64_t LCAL_INVALID_VALUE = -1; // shared buffer size,这里要和collectives.cce文件中的常量联动修改!!! constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; +constexpr int64_t LCAL_FLAG_BUFF_BYTES = 4 * 1024 * 1024; constexpr int LCAL_COMM_BUFFER_SIZE = 200; // 单位MB enum class ChipName { -- Gitee From 309787889236900d26c658166ed93997d2ed5b7a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:14:21 +0800 Subject: [PATCH 016/414] 3rd lcal_types.h --- comm/lcal/include/lcal_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index 067e79e7..10ee60ff 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -27,7 +27,7 @@ constexpr int64_t LCAL_INVALID_VALUE = -1; // shared buffer size,这里要和collectives.cce文件中的常量联动修改!!! constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; -constexpr int64_t LCAL_FLAG_BUFF_BYTES = 4 * 1024 * 1024; +constexpr int LCAL_FLAG_BUFF_BYTES = 4 * 1024 * 1024; constexpr int LCAL_COMM_BUFFER_SIZE = 200; // 单位MB enum class ChipName { -- Gitee From f15e4221071a4e3ba612954515c91fe24d58648c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:16:00 +0800 Subject: [PATCH 017/414] 1st lccl.h --- comm/lcal/include/lccl.h | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 comm/lcal/include/lccl.h diff --git a/comm/lcal/include/lccl.h b/comm/lcal/include/lccl.h new file mode 100644 index 00000000..6ad82b7a --- /dev/null +++ b/comm/lcal/include/lccl.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_LCCL_H +#define LCAL_LCCL_H + +#include + + +namespace Lcal { +class Lccl { +public: + Lccl() = delete; + explicit Lccl(LcalComm *comm); + explicit Lccl(LcalComm &comm); + ~Lccl(); + uint32_t GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize, int localRankSize, uint32_t extraFlag) + const; + int AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, + HcclReduceOp op = HCCL_REDUCE_SUM, aclrtStream stream = nullptr, + HcclDataType outputDataType = HCCL_DATA_TYPE_RESERVED, const void *scale = nullptr, int64_t scaleCount = 0, + const void *offset = nullptr) const; + int ReduceScatter(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, + HcclReduceOp op = HCCL_REDUCE_SUM, aclrtStream stream = nullptr) const; + int AllGather(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const; + int All2All(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const; + int All2All(void *sendBuff, void *recvBuff, int64_t count, int burstLen, + int stride, HcclDataType dataType, aclrtStream stream) const; + int All2AllVC(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const; + + int Broadcast(void *buff, int64_t count, HcclDataType dataType, int32_t root, aclrtStream stream) const; + int BandwidthTest(const void *buff, void *recvBuff, int64_t count, HcclDataType dataType, + int32_t root, aclrtStream stream) const; + friend class LcclTest; + +private: + bool CheckDataType(const HcclDataType &dataType) const; + bool CheckBuff(const void *sendBuff, const void *recvBuff) const; + int LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType) const; + +private: + LcalComm *comm_ = nullptr; + int rank_ = 0; + int rankSize_ = 0; +}; +} +#endif // LCAL_LCCL_H \ No newline at end of file -- Gitee From 06ce17c22ea911ec996191eb17e7fc0c5007103a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 16:16:49 +0800 Subject: [PATCH 018/414] 2nd lccl.h --- comm/lcal/include/lccl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/include/lccl.h b/comm/lcal/include/lccl.h index 6ad82b7a..c9cae2a2 100644 --- a/comm/lcal/include/lccl.h +++ b/comm/lcal/include/lccl.h @@ -42,7 +42,7 @@ public: private: bool CheckDataType(const HcclDataType &dataType) const; bool CheckBuff(const void *sendBuff, const void *recvBuff) const; - int LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType) const; + int LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const; private: LcalComm *comm_ = nullptr; -- Gitee From e54bfe4b1d5079d6b52ea0cb0c0fc41d024ca462 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 17:18:00 +0800 Subject: [PATCH 019/414] collectives.h draft 1 --- comm/lcal/src/ascendc_kernels/collectives.h | 174 ++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 comm/lcal/src/ascendc_kernels/collectives.h diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h new file mode 100644 index 00000000..bf4cded4 --- /dev/null +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef LCCL_COLLECTIVES_H +#define LCCL_COLLECTIVES_H + +#include + +#include "datacopy_gm2gm.h" +#include "datacopy_gm2gm_delay.h" +#include "sync_collectives.h" +using namespace AscendC; +using namespace Lcal; + +#define KERNELS_ARGS_FUN() \ +GM_ADDR input, GM_ADDR output, GM_ADDR commArgs, GM_ADDR scale, int64_t len, int64_t magic, int op, int root, int cycleCount, \ +GM_ADDR scale, int64_t scaleCount, GM_ADDR offset + +#define KERNELS_ARGS_FUN() \ +input, output, commArgs, scale, len, magic, op, root, cycleCount, scale, scaleCount, offset + +#define KERNELS_GATHER_TABLE_ARGS_CALL() \ +GM_ADDR embTable, GM_ADDR lookup, GM_ADDR revData, int64_t lookupLen, int64_t embTableLen, int64_t embTableDim + +#define KERNELS_GATHER_TABLE_ARGS_CALL() \ +embTable, lookup, revData, lookupLen, embTableLen, embTableDim + +enum DfxPos : int { + MAGIC, + LEN, + RUN_STATUS +}; + +class Collectives { + constexpr static int32_t UB_HEAD_OFFSET = 96; + constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; +public: + FORCE_INLINE_AICORE Collectives(int rank, int rankSize, uint32_t extraFlag) : rank(rank), rankSize(rankSize), + extraFlag(extraFlag) {} + + FORCE_INLINE_AICORE ~Collectives() + { + const int64_t notRunning = 0xdead; + dfx.SetValue(RUN_STATUS, notRunning); + } + + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + dumpAddr_ = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->dumpAddr; + GlobalTensor peerMemsAddrGm; + peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CommArgs *>(commArgs))->peerMems[0], + LCAL_MAX_RANK_SIZE); + for (int i = 0; i < rankSize; ++i) { + shareAddrs[i] = peerMemsAddrGm.GetValue(i) + + (magic % PING_PONG_SIZE) * (IPC_BUFF_MAX_SIZE + IPC_DATA_OFFSET); + } + dfx.SetGlobalBuffer((reinterpret_cast<__gm__ CommArgs *>(commArgs))->dfx, + DFX_COUNT); + this->root = root; + this->len = len; + this->magic = magic; + this->localRank = reinterpret_cast<__gm__ CommArgs *>(commArgs)->localRank; + this->localRankSize = reinterpret_cast<__gm__ CommArgs *>(commArgs)->localRankSize; + this->xRankSize = localRankSize; + this->yRankSize = rankSize / localRankSize; + this->xRankIdx = rank % localRankSize; + this->yRankIdx = rank / localRankSize; + + blockIdx = GetBlockIdx(); + blockNum = GetBlockNum() * LCAL_BLOCK_BUN_MULTI; + + sync.Init(rank, rankSize, shareAddrs); + dfx.SetValue(MAGIC, magic); + dfx.SetValue(LEN, len); + const int64_t running = 0xbeef; + dfx.SetValue(RUN_STATUS, running); + } + + template + FORCE_INLINE_AICORE void DataCopyPingPong(const GlobalTensor& inputGT, const GlobalTensor& outputGT, + int64_t dataSizeRemain, int op, TBuf tbuf) + { + if (dataSizeRemain <= 0) { + return; + } + LocalTensor localUB[2]; + localUB[0] = tbuf.GetWithOffset(UB_SINGLE_PING_PONG_ADD_SIZE_MAX * 0); + localUB[1] = tbuf.GetWithOffset(UBSINGLE_PING_PONG_ADD_SIZE_MAX, UBSINGLE_PING_PONG_ADD_SIZE_MAX); + + int inputOffset = 0; + int outputOffset = 0; + + PipeBarrier(); + if (op != COPYONLY) { + SetAscendCAtomic(op); + } + PipeBarrier(); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = 
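+ // Each chunk moves at most half of the UB staging area
+ // (UB_SINGLE_PING_PONG_ADD_SIZE_MAX); the tail chunk is smaller.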
dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? + UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; + TEventID eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + AscendC::WaitFlag(eventId); + DataCopyWrap(localUB[i & 1], inputGT[inputOffset], size); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + DataCopyWrap(outputGT[outputOffsetNum], localUB[(i & 1) ? 0 : 1], size); + AscendC::SetFlag(eventId); + dataSizeRemain -= size; + inputOffset += (size / sizeof(T)); + outputOffsetNum += (size / sizeof(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); + + if (op != COPYONLY) { + SetAtomicNone(); + } + PipeBarrier(); + } + + template + FORCE_INLINE_AICORE void CpGM2GMDelay(GlobalTensor& outputGT, GlobalTensor (&inputGT)[8], + GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& outScaleGt, + TBuf tbuf) + { + DataCopyGm2GmDelay cpKernel; + cpKernel.Init(outputGT, inputGT, inputScaleGT, calCount, rankCount, outScaleGT, tbuf); + cpKernel.Process(); + } + + template + FORCE_INLINE_AICORE T1 CeilDiv(T1 a, T2 b) const + { + if (b == 0) { + return 0; + } + return (a + b - 1) / b; + } + + FORCE_INLINE_AICORE void VecAddCce(int64_t curDealSize, __ubuf__ T *ubuf0, __ubuf__ T *ubuf1) + { + if (curDealSize > MAX_VADD_SIZE) { + vadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1 + VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); + vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + (__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + (__ubuf__ T*)((__ubuf__ int8_t*)ubuf1 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); + } else { + Avadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1, + VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); + } + } + + template + FORCE_INLINE_AICORE void LoopVaddCceProcess(__ubuf__ T* localUB[2], const int64_t remainSize, + int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, + __gm__ T *srcGmMem, __gm__ T *dstGmMem, int64_t alreadyDealNum) + { + for + } +}; \ No newline at end of file -- Gitee From ee2e27b580ccf10580fd8a9b2e37634a66ea8725 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 17:23:38 +0800 Subject: [PATCH 020/414] collectives draft 2nd --- comm/lcal/src/ascendc_kernels/collectives.h | 31 +++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index bf4cded4..218307bc 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -20,13 +20,13 @@ using namespace AscendC; using namespace Lcal; #define KERNELS_ARGS_FUN() \ -GM_ADDR input, GM_ADDR output, GM_ADDR commArgs, GM_ADDR scale, int64_t len, int64_t magic, int op, int root, int cycleCount, \ +GM_ADDR input, GM_ADDR output, GM_ADDR commArgs, int64_t len, int64_t magic, int op, int root, int cycleCount, \ GM_ADDR scale, int64_t scaleCount, GM_ADDR offset -#define KERNELS_ARGS_FUN() \ -input, output, commArgs, scale, len, magic, op, root, cycleCount, scale, scaleCount, offset +#define KERNELS_ARGS_CALL() \ +input, output, commArgs, len, magic, op, root, cycleCount, scale, scaleCount, offset -#define KERNELS_GATHER_TABLE_ARGS_CALL() \ +#define 
KERNELS_GATHER_TABLE_ARGS_FUN() \ GM_ADDR embTable, GM_ADDR lookup, GM_ADDR revData, int64_t lookupLen, int64_t embTableLen, int64_t embTableDim #define KERNELS_GATHER_TABLE_ARGS_CALL() \ @@ -74,7 +74,7 @@ public: this->yRankIdx = rank / localRankSize; blockIdx = GetBlockIdx(); - blockNum = GetBlockNum() * LCAL_BLOCK_BUN_MULTI; + blockNum = GetBlockNum() * LCAL_BLOCK_NUM_MULTI; sync.Init(rank, rankSize, shareAddrs); dfx.SetValue(MAGIC, magic); @@ -84,18 +84,18 @@ public: } template - FORCE_INLINE_AICORE void DataCopyPingPong(const GlobalTensor& inputGT, const GlobalTensor& outputGT, + FORCE_INLINE_AICORE void DataCopyWrapPingPong(const GlobalTensor& inputGT, const GlobalTensor& outputGT, int64_t dataSizeRemain, int op, TBuf tbuf) { if (dataSizeRemain <= 0) { return; } LocalTensor localUB[2]; - localUB[0] = tbuf.GetWithOffset(UB_SINGLE_PING_PONG_ADD_SIZE_MAX * 0); - localUB[1] = tbuf.GetWithOffset(UBSINGLE_PING_PONG_ADD_SIZE_MAX, UBSINGLE_PING_PONG_ADD_SIZE_MAX); + localUB[0] = tbuf.GetWithOffset(UB_SINGLE_PING_PONG_ADD_SIZE_MAX, 0); + localUB[1] = tbuf.GetWithOffset(UB_SINGLE_PING_PONG_ADD_SIZE_MAX, UB_SINGLE_PING_PONG_ADD_SIZE_MAX); - int inputOffset = 0; - int outputOffset = 0; + int inputOffsetNum = 0; + int outputOffsetNum = 0; PipeBarrier(); if (op != COPYONLY) { @@ -110,13 +110,13 @@ public: UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; TEventID eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; AscendC::WaitFlag(eventId); - DataCopyWrap(localUB[i & 1], inputGT[inputOffset], size); + DataCopyWrap(localUB[(i & 1) ? 0 : 1], inputGT[inputOffsetNum], size); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); DataCopyWrap(outputGT[outputOffsetNum], localUB[(i & 1) ? 0 : 1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; - inputOffset += (size / sizeof(T)); + inputOffsetNum += (size / sizeof(T)); outputOffsetNum += (size / sizeof(T)); } AscendC::WaitFlag(EVENT_ID0); @@ -131,17 +131,18 @@ public: } template + FORCE_INLINE_AICORE void CpGM2GMDelay(GlobalTensor& outputGT, GlobalTensor (&inputGT)[8], - GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& outScaleGt, + GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& outScaleGT, TBuf tbuf) { - DataCopyGm2GmDelay cpKernel; + DataCopyGM2GMDelay cpKernel; cpKernel.Init(outputGT, inputGT, inputScaleGT, calCount, rankCount, outScaleGT, tbuf); cpKernel.Process(); } template - FORCE_INLINE_AICORE T1 CeilDiv(T1 a, T2 b) const + FORCE_INLINE_AICORE T1 CeilDiv(T1 a, T2 b) { if (b == 0) { return 0; -- Gitee From 9024f38151f306bf5895fb22435503ca7558187f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 21:23:59 +0800 Subject: [PATCH 021/414] collectives.h draft2 --- comm/lcal/src/ascendc_kernels/collectives.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 218307bc..a46dbcc1 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -131,7 +131,15 @@ public: } template - + FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GlobalTensor& inputGT, + const uint32_t calCount, int op) + { + DataCopyGM2GM cpKernel; + cpKernel.Init(outputGT, inputGT, calCount, op); + cpKernel.Process(); + } + + template FORCE_INLINE_AICORE void CpGM2GMDelay(GlobalTensor& outputGT, GlobalTensor (&inputGT)[8], GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& 
outScaleGT, TBuf tbuf) -- Gitee From e983d69069df44b321993059724e9b0ce8fdd16c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 15 Aug 2025 21:29:12 +0800 Subject: [PATCH 022/414] collectives.h draft3 --- comm/lcal/src/ascendc_kernels/collectives.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index a46dbcc1..2ad7a1a7 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -158,17 +158,18 @@ public: return (a + b - 1) / b; } + template FORCE_INLINE_AICORE void VecAddCce(int64_t curDealSize, __ubuf__ T *ubuf0, __ubuf__ T *ubuf1) { if (curDealSize > MAX_VADD_SIZE) { - vadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1 + vadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), (__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), - (__ubuf__ T*)((__ubuf__ int8_t*)ubuf1 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); } else { - Avadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1, + vadd(ubuf0, ubuf1, ubuf0, CeilDiv(curDealSize, VADD_UNIT_BYTE), 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); } } @@ -176,7 +177,7 @@ public: template FORCE_INLINE_AICORE void LoopVaddCceProcess(__ubuf__ T* localUB[2], const int64_t remainSize, int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, - __gm__ T *srcGmMem, __gm__ T *dstGmMem, int64_t alreadyDealNum) + __gm__ T *srcGmMem, __gm__ T *dstIpcMem, int64_t alreadyDealNum) { for } -- Gitee From 529068f0939efc0fcf86aa99fd572ab8b5c88fa2 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 10:32:16 +0800 Subject: [PATCH 023/414] collectives.h wuhu --- comm/lcal/src/ascendc_kernels/collectives.h | 327 +++++++++++++++++++- 1 file changed, 324 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 2ad7a1a7..e7a5a983 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -164,7 +164,8 @@ public: if (curDealSize > MAX_VADD_SIZE) { vadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); - vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + (__ubuf__ T*)((__ubuf__ int8_t*)ubuf1 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), (__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); @@ -179,6 +180,326 @@ public: int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, __gm__ T *srcGmMem, __gm__ T *dstIpcMem, int64_t alreadyDealNum) { - for + for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; + alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; + if 
(remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealSize; + } + if (alreadyDealSize != 0) { + AscendC::WaitFlag(EVENT_ID0); + + } + CpGM2UB(localUB[0], srcGmMem + alreadyDealSize, curDealSize); + + for (int64_t i = 0; i < targetRankArrValidSize; i++) { + int64_t targetRank = targetRankArr[i]; + if (targetRank == rank) { + continue; + } + if (i > 0 && !((targetRankAr[0] == rank) && i == 1)) { + AscendC::WaitFlag(EVENT_ID1); + } + CpGM2UB(localUB[1], + (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealSize, + curDealSize); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + AscendC::SetFlag(EVENT_ID2); + AscendC::WaitFlag(EVENT_ID2); + VecAddCce(curDealSize, localUB[0], localUB[1]); + if (((i + 1) == targetRankArrValidSize)) { + continue; + } + if (((i + 1) == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { + continue; + } + AscendC::SetFlag(EVENT_ID1); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T*)(dstIpcMem + alreadyDealSize), localUB[0], curDealSize); + if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { + AscendC::SetFlag(EVENT_ID0); + } + alreadyDealNum += curDealSize / sizeof(T); + } + } + + template + FORCE_INLINE_AICORE void LoopVadCce(__ubuf__ T *localUB[2], const int64_t remainNum, int64_t (&targetRankArr)[8], + int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, __gm__ T *srcGmMem, __gm__ T *dstIpcMem) + { + AscendC::PipeBarrier(); + LoopVaddCceProcess(localUB, remainNum * (int64_t)sizeof(T), targetRankArr, targetRankArrValidSize, + srcIpcOffsetNum, srcGmMem, dstIpcMem, 0); + AscendC::PipeBarrier(); + } + + template + FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputG, + const GlobalTensor& outputGT, int op) + { + constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; + constexpr int32_t ubBlockSize = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; + constexpr int32_t inputUbBlockSize = std::is_save_v ? ubBlockSize : ubAlignNum * sizeof(U); + constexpr int32_t outputUbBlockSize = std::is_save_v ? ubBlockSize : ubAlignNum * sizeof(T); + + __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); + __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); + __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET), (__ubuf__ U*)(UB_MID_OFFSET)}; + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)inputUB[0], (__ubuf__ T*)inputUB[1]}; + if constexpr (!std::is_same_v) { + outputUB[0] = (__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)); + outputUB[1] = (__ubuf__ T*)(inputUB[1] + inputUbBlockSize / sizeof(U)); + } + int inputOffsetNum = 0; + int outputOffsetNum = 0; + if (dataSizeRemain <= 0) { + return; + } + + SetAtomic(op); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = dataSizeRemain > outputUbBlockSize ? outputUbBlockSize : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + inputOffsetNum, size / sizeof(T) * sizeof(U)); + if constexpr (!std::is_same_v) { + SetWaitEvent(eventId); + CastImpl((i & 1) ? outputUB[0] : outputUB[1], (i & 1) ? 
inputUB[0] : inputUB[1], RoundMode::CAST_NONE, + size / sizeof(T)); + SetWaitEvent(eventId); + } + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + CpGM2UB(output + outputOffsetNum, (i & 1) ? outputUB[0] : outputUB[1], size); + AscendC::SetFlag(eventId); + dataSizeRemain -= size; + inputOffsetNum += (size / sizeof(T)); + outputOffsetNum += (size / sizeof(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); + UnsetAtomic(op); + return; } -}; \ No newline at end of file + + template + FORCE_INLINE_AICORE void VecAdd(int64_t curDealSize, LocalTensor &ubuf0, LocalTensor &ubuf1) + { + if (curDealSize > MAX_VADD_SIZE) { + Add(ubuf0, ubuf1, ubuf0, MASK_PLACEHOLDER, VADD_MAX_REPEAT, + {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); + + Add(ubuf0[MAX_VADD_SIZE / sizeof(T)], ubuf1[MAX_VADD_SIZE / sizeof(T)], + ubuf0[MAX_VADD_SIZE / sizeof(T)], MASK_PLACEHOLDER, + CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), + {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); + } else { + AAdd(ubuf0, ubuf1, ubuf0, MASK_PLACEHOLDER, CeilDiv(curDealSize, VADD_UNIT_BYTE), + {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); + } + } + + template + FORCE_INLINE_AICORE void LoopVadd(TBuf tbuf, int64_t &remainNum, int64_t (&targetRankArr)[8], + int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, const GlobalTensor &srcGT, + const GlobalTensor &dstGT) + { + if (remainNum <= 0) { + return; + } + LocalTensor localUB[2]; + localUB[0] = tbuf.GetWithOffset(95 * 1024, 0); + localUB[1] = tbuf.GetWithOffset(95 * 1024, 95 * 1024); + + AscendC::PipeBarrier(); + LoopVaddProcess(localUB, remainNum * sizeof(T), targetRankArr, targetRankArrValidSize, + srcIpcOffsetNum, srcGT, dstGT, 0); + AscendC::PipeBarrier(); + } + template + FORCE_INLINE_AICORE void LoopVaddProcess(LocalTensor localUB[2], const int64_t remainSize, + int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, + const GlobalTensor &srcGT, const GlobalTensor &dstGT, int64_t alreadyDealNum) + { + for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; + alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; + if (remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealSize; + } + if (alreadyDealSize != 0) { + AscendC::WaitFlag(EVENT_ID0); + + } + DataCopywrap(localUB[0], srcGT[alreadyDealNum], curDealSize); + + for (int64_t i = 0; i < targetRankArrValidSize; i++) { + int64_t targetRank = targetRankArr[i]; + if (targetRank == rank) { + continue; + } + if (i > 0 && !((targetRankArr[0] == rank) && i == 1)) { + AscendC::WaitFlag(EVENT_ID1); + } + GlobalTensor srcGtTmp; + srcGtTmp.SetGlobalBuffer( + (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealSize); + DataCopywrap(localUB[1], srcGtTmp, curDealSize); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + AscendC::SetFlag(EVENT_ID2); + AscendC::WaitFlag(EVENT_ID2); + VecAdd(curDealSize, localUB[0], localUB[1]); + if (((i + 1) == targetRankArrValidSize)) { + continue + } + if (((i + 1) == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { + continue; + } + AscendC::SetFlag(EVENT_ID1); + } + AscendC::SetFlag(EVENT_ID0); + 
AscendC::WaitFlag(EVENT_ID0); + + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataCopywrap(dstGT[alreadyDealNum], localUB[0], curDealSize); + if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { + AscendC::SetFlag(EVENT_ID0); + } + alreadyDealNum += curDealSize / sizeof(T); + } + } + + template + FORCE_INLINE_AICORE void SetSingleValue2Gm(GM_ADDR gm, T value) + { + AscendC::PipeBarrier(); + __ubuf__ T *inputUB = (__ubuf__ T *)(96); + *inputUB = value; + AscendC::PipeBarrier(); + CpUB2GM((__gm__ T *)gm, inputUB, sizeof(T)); + AscendC::PipeBarrier(); + } + +protected: + int rank; + int rankSize; + int localRank = 0; + int localRankSize = 0; + int xRankSize = 0; + int yRankSize = 0; + int xRankIdx = 0; + int yRankIdx = 0; + uint32_t extraFlag; + int root; + int64_t len; + int64_t magic; + int64_t blockIdx; + int64_t blockNum; + GM_ADDR shareAddrs[LCAL_MAX_RANK_SIZE]; + GlobalTensor dfx; + SyncCollectives sync; + GM_ADDR dumpAddr_ = nullptr; + GM_ADDR shareAddrs[LCAL_MAX_RANK_SIZE]; + + template + FORCE_INLINE_AICORE void SetAscendCAtomic(int op) + { + SetAtomicType(op); + switch (op) { + case ADD: + SetAtomicAdd(); + return; + case MUL: + return; + case MAX: + SetAtomicMax(); + return; + case MIN: + SetAtomicMin(); + return; + default: + ; + } + } + + template + FORCE_INLINE_AICORE void SetAtomic(int op) + { + PipeBarrier(); + if (op != -1) { +#ifdef __DAV_C220_VEC__ + SetAscendCAtomic(op); +#endif + } + PipeBarrier(); + } + + FORCE_INLINE_AICORE void UnsetAtomic(int op) + { + if (op != -1) { + AscendC::SetAtomicNone(); + } + PipeBarrier(); + } + + template + FORCE_INLINE_AICORE void SetWaitEvent(event_t eventId) + { + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + } + + FORCE_INLINE_AICORE int64_t DumpLcclLogInfo(LogId logId, Op operationType) + { +#ifdef ENABLE_LCCL_DUMP + constexpr int32_t UB_HEAD_OFFSET = 96; + + AscendC::PipeBarrier(); + GM_ADDR blockGm = (GM_ADDR)(dumpAddr_ + LCCL_DUMP_UNIT_SIZE * GetBlockIdx())); + __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); + __ubuf__ LcclDumpLogInfo *LogUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); + + CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); + AscendC::PipeBarrier(); + + if (blockUb->dumpOffset < sizeof(LcclDumpBlockInfo)) { + return; + } + + logUb->logId = logId; + logUb->blockId = GetBlockIdx(); + logUb->syscyc = static_cast(GetSystemCycle()); + logUb->curpc = static_cast(get_pc()); + logUb->operationType = operationType; + logUb->rsv = 0; + CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpLogInfo)); + + blockUb->dumpAddr += sizeof(LcclDumpLogInfo); + blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); + CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpLogInfo)); + AscendC::PipeBarrier(); +#endif + } +}; + +FORCE_INLINE_AICORE int64_t GetDataCount(const int64_t dataLen, const int64_t useBlockNum) +{ + return dataLen / useBlockNum; +} +#endif // LCCL_COLLECTIVES_H -- Gitee From ffa418f4e6c063230e886d211762634c7a8fb202 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 10:38:28 +0800 Subject: [PATCH 024/414] collective.h wuhu 1 --- comm/lcal/src/ascendc_kernels/collectives.h | 62 ++++++++++----------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index e7a5a983..ff2b333c 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ 
b/comm/lcal/src/ascendc_kernels/collectives.h @@ -180,28 +180,28 @@ public: int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, __gm__ T *srcGmMem, __gm__ T *dstIpcMem, int64_t alreadyDealNum) { - for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; - alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + for (int64_t alreadyDealNum = 0; alreadyDealNum < remainSize; + alreadyDealNum += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; - if (remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { - curDealSize = remainSize - alreadyDealSize; + if (remainSize - alreadyDealNum < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealNum; } - if (alreadyDealSize != 0) { + if (alreadyDealNum != 0) { AscendC::WaitFlag(EVENT_ID0); } - CpGM2UB(localUB[0], srcGmMem + alreadyDealSize, curDealSize); + CpGM2UB(localUB[0], srcGmMem + alreadyDealNum, curDealSize); for (int64_t i = 0; i < targetRankArrValidSize; i++) { int64_t targetRank = targetRankArr[i]; if (targetRank == rank) { continue; } - if (i > 0 && !((targetRankAr[0] == rank) && i == 1)) { - AscendC::WaitFlag(EVENT_ID1); + if (i > 0 && !((targetRankArr[0] == rank) && i == 1)) { + AscendC::WaitFlag(EVENT_ID1); } CpGM2UB(localUB[1], - (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealSize, + (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealNum, curDealSize); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -222,8 +222,8 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpUB2GM((__gm__ T*)(dstIpcMem + alreadyDealSize), localUB[0], curDealSize); - if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { + CpUB2GM((__gm__ T*)dstIpcMem + alreadyDealNum), localUB[0], curDealSize); + if (alreadyDealNum + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { AscendC::SetFlag(EVENT_ID0); } alreadyDealNum += curDealSize / sizeof(T); @@ -231,23 +231,23 @@ public: } template - FORCE_INLINE_AICORE void LoopVadCce(__ubuf__ T *localUB[2], const int64_t remainNum, int64_t (&targetRankArr)[8], + FORCE_INLINE_AICORE void LoopVaddCce(__ubuf__ T* localUB[2], const int64_t remainNum, int64_t (&targetRankArr)[8], int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, __gm__ T *srcGmMem, __gm__ T *dstIpcMem) { AscendC::PipeBarrier(); - LoopVaddCceProcess(localUB, remainNum * (int64_t)sizeof(T), targetRankArr, targetRankArrValidSize, + LoopVaddCceProcess(localUB, remainNum * (int64_t)sizeof(T), targetRankArr, targetRankArrValidSize, srcIpcOffsetNum, srcGmMem, dstIpcMem, 0); AscendC::PipeBarrier(); } template - FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputG, + FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, const GlobalTensor& outputGT, int op) { constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; - constexpr int32_t ubBlockSize = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; - constexpr int32_t inputUbBlockSize = std::is_save_v ? ubBlockSize : ubAlignNum * sizeof(U); - constexpr int32_t outputUbBlockSize = std::is_save_v ? ubBlockSize : ubAlignNum * sizeof(T); + constexpr int32_t ubAlignNum = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; + constexpr int32_t inputUbBlockSize = std::is_same_v ? 
ubBlockSize : ubAlignNum * sizeof(U); + constexpr int32_t outputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(T); __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); @@ -265,8 +265,8 @@ public: SetAtomic(op); - AscendC::SetFlag(EVENT_ID0); - AscendC::SetFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); for (int64_t i = 0; dataSizeRemain > 0; i++) { uint32_t size = dataSizeRemain > outputUbBlockSize ? outputUbBlockSize : dataSizeRemain; event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; @@ -280,7 +280,7 @@ public: } AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - CpGM2UB(output + outputOffsetNum, (i & 1) ? outputUB[0] : outputUB[1], size); + CpUB2GM(output + outputOffsetNum, (i & 1) ? outputUB[0] : outputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; inputOffsetNum += (size / sizeof(T)); @@ -307,7 +307,7 @@ public: CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); } else { - AAdd(ubuf0, ubuf1, ubuf0, MASK_PLACEHOLDER, CeilDiv(curDealSize, VADD_UNIT_BYTE), + Add(ubuf0, ubuf1, ubuf0, MASK_PLACEHOLDER, CeilDiv(curDealSize, VADD_UNIT_BYTE), {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); } } @@ -315,7 +315,7 @@ public: template FORCE_INLINE_AICORE void LoopVadd(TBuf tbuf, int64_t &remainNum, int64_t (&targetRankArr)[8], int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, const GlobalTensor &srcGT, - const GlobalTensor &dstGT) + const GlobalTensor &dstGt) { if (remainNum <= 0) { return; @@ -325,8 +325,8 @@ public: localUB[1] = tbuf.GetWithOffset(95 * 1024, 95 * 1024); AscendC::PipeBarrier(); - LoopVaddProcess(localUB, remainNum * sizeof(T), targetRankArr, targetRankArrValidSize, - srcIpcOffsetNum, srcGT, dstGT, 0); + LoopVaddProcess(localUB, remainNum * sizeof(T), targetRankArr, targetRankArrValidSize, + srcIpcOffsetNum, srcGt, dstGt, 0); AscendC::PipeBarrier(); } template @@ -334,13 +334,13 @@ public: int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, const GlobalTensor &srcGT, const GlobalTensor &dstGT, int64_t alreadyDealNum) { - for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; - alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + for (int64_t alreadyDealNum = 0; alreadyDealNum < remainSize; + alreadyDealNum += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; - if (remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { - curDealSize = remainSize - alreadyDealSize; + if (remainSize - alreadyDealNum < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealNum; } - if (alreadyDealSize != 0) { + if (alreadyDealNum != 0) { AscendC::WaitFlag(EVENT_ID0); } @@ -356,7 +356,7 @@ public: } GlobalTensor srcGtTmp; srcGtTmp.SetGlobalBuffer( - (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealSize); + (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealNum); DataCopywrap(localUB[1], srcGtTmp, curDealSize); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -378,7 +378,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); DataCopywrap(dstGT[alreadyDealNum], localUB[0], curDealSize); - if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) 
{ + if (alreadyDealNum + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { AscendC::SetFlag(EVENT_ID0); } alreadyDealNum += curDealSize / sizeof(T); -- Gitee From d032c43f0acc291a1ab15d3eadd07d837e60257c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:05:10 +0800 Subject: [PATCH 025/414] collectives.h qifei --- comm/lcal/src/ascendc_kernels/collectives.h | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index ff2b333c..29ed0a07 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -180,13 +180,13 @@ public: int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, __gm__ T *srcGmMem, __gm__ T *dstIpcMem, int64_t alreadyDealNum) { - for (int64_t alreadyDealNum = 0; alreadyDealNum < remainSize; - alreadyDealNum += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; + alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; - if (remainSize - alreadyDealNum < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { - curDealSize = remainSize - alreadyDealNum; + if (remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealSize; } - if (alreadyDealNum != 0) { + if (alreadyDealSize != 0) { AscendC::WaitFlag(EVENT_ID0); } @@ -212,7 +212,7 @@ public: if (((i + 1) == targetRankArrValidSize)) { continue; } - if (((i + 1) == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { + if ((i + 1 == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { continue; } AscendC::SetFlag(EVENT_ID1); @@ -223,7 +223,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM((__gm__ T*)dstIpcMem + alreadyDealNum), localUB[0], curDealSize); - if (alreadyDealNum + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { + if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { AscendC::SetFlag(EVENT_ID0); } alreadyDealNum += curDealSize / sizeof(T); @@ -246,8 +246,8 @@ public: { constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; constexpr int32_t ubAlignNum = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; - constexpr int32_t inputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(U); - constexpr int32_t outputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(T); + constexpr int32_t inputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(U); + constexpr int32_t outputUbBlockSize = std::is_same_v ? 
ubBlockSize : ubAlignNum * sizeof(T); __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); @@ -314,7 +314,7 @@ public: template FORCE_INLINE_AICORE void LoopVadd(TBuf tbuf, int64_t &remainNum, int64_t (&targetRankArr)[8], - int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, const GlobalTensor &srcGT, + int64_t targetRankArrValidSize, int64_t srcIpcOffsetNum, const GlobalTensor &srcGt, const GlobalTensor &dstGt) { if (remainNum <= 0) { @@ -330,21 +330,21 @@ public: AscendC::PipeBarrier(); } template - FORCE_INLINE_AICORE void LoopVaddProcess(LocalTensor localUB[2], const int64_t remainSize, + FORCE_INLINE_AICORE void LoopVaddProcess(LocalTensor (&localUB)[2], const int64_t remainSize, int64_t (&targetRankArr)[8], const int64_t targetRankArrValidSize, const int64_t srcIpcOffsetNum, - const GlobalTensor &srcGT, const GlobalTensor &dstGT, int64_t alreadyDealNum) + const GlobalTensor &srcGt, const GlobalTensor &dstGt, int64_t alreadyDealNum) { - for (int64_t alreadyDealNum = 0; alreadyDealNum < remainSize; - alreadyDealNum += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + for (int64_t alreadyDealSize = 0; alreadyDealSize < remainSize; + alreadyDealSize += UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { int64_t curDealSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; - if (remainSize - alreadyDealNum < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { - curDealSize = remainSize - alreadyDealNum; + if (remainSize - alreadyDealSize < UB_SINGLE_PING_PONG_ADD_SIZE_MAX) { + curDealSize = remainSize - alreadyDealSize; } - if (alreadyDealNum != 0) { + if (alreadyDealSize != 0) { AscendC::WaitFlag(EVENT_ID0); } - DataCopywrap(localUB[0], srcGT[alreadyDealNum], curDealSize); + DataCopyWrap(localUB[0], srcGt[alreadyDealNum], curDealSize); for (int64_t i = 0; i < targetRankArrValidSize; i++) { int64_t targetRank = targetRankArr[i]; @@ -357,7 +357,7 @@ public: GlobalTensor srcGtTmp; srcGtTmp.SetGlobalBuffer( (__gm__ T*)(shareAddrs[targetRank] + IPC_DATA_OFFSET) + srcIpcOffsetNum + alreadyDealNum); - DataCopywrap(localUB[1], srcGtTmp, curDealSize); + DataCopyWrap(localUB[1], srcGtTmp, curDealSize); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -365,9 +365,9 @@ public: AscendC::WaitFlag(EVENT_ID2); VecAdd(curDealSize, localUB[0], localUB[1]); if (((i + 1) == targetRankArrValidSize)) { - continue + continue; } - if (((i + 1) == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { + if ((i + 1 == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { continue; } AscendC::SetFlag(EVENT_ID1); @@ -377,8 +377,8 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - DataCopywrap(dstGT[alreadyDealNum], localUB[0], curDealSize); - if (alreadyDealNum + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { + DataCopyWrap(dstGt[alreadyDealNum], localUB[0], curDealSize); + if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { AscendC::SetFlag(EVENT_ID0); } alreadyDealNum += curDealSize / sizeof(T); @@ -415,12 +415,11 @@ protected: GlobalTensor dfx; SyncCollectives sync; GM_ADDR dumpAddr_ = nullptr; - GM_ADDR shareAddrs[LCAL_MAX_RANK_SIZE]; template FORCE_INLINE_AICORE void SetAscendCAtomic(int op) { - SetAtomicType(op); + SetAtomicType(); switch (op) { case ADD: SetAtomicAdd(); @@ -444,7 +443,7 @@ protected: PipeBarrier(); if (op != -1) { #ifdef __DAV_C220_VEC__ - SetAscendCAtomic(op); + SetAtomicOpType(op); #endif } PipeBarrier(); @@ -465,17 +464,17 @@ protected: AscendC::WaitFlag(eventId); } - FORCE_INLINE_AICORE 
int64_t DumpLcclLogInfo(LogId logId, Op operationType) + FORCE_INLINE_AICORE void DumpLcclLogInfo(LogId logId, Op operationType) { #ifdef ENABLE_LCCL_DUMP constexpr int32_t UB_HEAD_OFFSET = 96; AscendC::PipeBarrier(); - GM_ADDR blockGm = (GM_ADDR)(dumpAddr_ + LCCL_DUMP_UNIT_SIZE * GetBlockIdx())); + GM_ADDR blockGm = (GM_ADDR)(dumpAddr_ + LCCL_DUMP_UINT_SIZE * GetBlockIdx()); __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); - __ubuf__ LcclDumpLogInfo *LogUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); + __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); - CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); + CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpLogInfo)); AscendC::PipeBarrier(); if (blockUb->dumpOffset < sizeof(LcclDumpBlockInfo)) { @@ -485,14 +484,14 @@ protected: logUb->logId = logId; logUb->blockId = GetBlockIdx(); logUb->syscyc = static_cast(GetSystemCycle()); - logUb->curpc = static_cast(get_pc()); + logUb->curPc = static_cast(get_pc()); logUb->operationType = operationType; logUb->rsv = 0; CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpLogInfo)); - blockUb->dumpAddr += sizeof(LcclDumpLogInfo); + blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); - CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpLogInfo)); + CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); #endif } -- Gitee From 628614eed621a890d61869051bc7c1a84e3610f5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:06:40 +0800 Subject: [PATCH 026/414] collective.s final fantasy --- comm/lcal/src/ascendc_kernels/collectives.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 29ed0a07..9c50bd34 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -222,7 +222,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpUB2GM((__gm__ T*)dstIpcMem + alreadyDealNum), localUB[0], curDealSize); + CpUB2GM((__gm__ T*)dstIpcMem + alreadyDealNum, localUB[0], curDealSize); if (alreadyDealSize + UB_SINGLE_PING_PONG_ADD_SIZE_MAX < remainSize) { AscendC::SetFlag(EVENT_ID0); } @@ -487,11 +487,11 @@ protected: logUb->curPc = static_cast(get_pc()); logUb->operationType = operationType; logUb->rsv = 0; - CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpLogInfo)); + CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpBlockInfo)); blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); - CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpBlockInfo)); + CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpLogInfo)); AscendC::PipeBarrier(); #endif } -- Gitee From 0bf47ad54a1f10c53f9d26bfe87aa4f5348da683 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:07:48 +0800 Subject: [PATCH 027/414] collectives.h success --- comm/lcal/src/ascendc_kernels/collectives.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 9c50bd34..39342760 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ 
b/comm/lcal/src/ascendc_kernels/collectives.h @@ -474,10 +474,10 @@ protected: __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); - CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpLogInfo)); + CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBLockInfo)); AscendC::PipeBarrier(); - if (blockUb->dumpOffset < sizeof(LcclDumpBlockInfo)) { + if (blockUb->dumpOffset < sizeof(LcclDumpLogInfo)) { return; } @@ -487,11 +487,11 @@ protected: logUb->curPc = static_cast(get_pc()); logUb->operationType = operationType; logUb->rsv = 0; - CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpBlockInfo)); + CpUB2GM((GM_ADDR)blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpLogInfo)); blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); - CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpLogInfo)); + CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); #endif } -- Gitee From d6fdf99ab1cf949a8f6ee48ee9a617d02d08e313 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:08:20 +0800 Subject: [PATCH 028/414] collectives.h ultimate --- comm/lcal/src/ascendc_kernels/collectives.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 39342760..036c5e3b 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -474,7 +474,7 @@ protected: __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); - CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBLockInfo)); + CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); if (blockUb->dumpOffset < sizeof(LcclDumpLogInfo)) { -- Gitee From a03f90dbbde658b86435c97496929a49d2cd0b3a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:19:00 +0800 Subject: [PATCH 029/414] cmake draft1 --- comm/lcal/CMakeLists.txt | 0 comm/lcal/cmake/CMakeCCECompiler.cmake.in | 14 ++++++++++++++ comm/lcal/cmake/CMakeCCEInformation.cmake | 0 comm/lcal/cmake/CMakeDetermineCCECompiler.cmake | 0 comm/lcal/cmake/CMakeTestCCECompiler.cmake | 0 5 files changed, 14 insertions(+) create mode 100644 comm/lcal/CMakeLists.txt create mode 100644 comm/lcal/cmake/CMakeCCECompiler.cmake.in create mode 100644 comm/lcal/cmake/CMakeCCEInformation.cmake create mode 100644 comm/lcal/cmake/CMakeDetermineCCECompiler.cmake create mode 100644 comm/lcal/cmake/CMakeTestCCECompiler.cmake diff --git a/comm/lcal/CMakeLists.txt b/comm/lcal/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/cmake/CMakeCCECompiler.cmake.in b/comm/lcal/cmake/CMakeCCECompiler.cmake.in new file mode 100644 index 00000000..d77d1a4d --- /dev/null +++ b/comm/lcal/cmake/CMakeCCECompiler.cmake.in @@ -0,0 +1,14 @@ +# + # Copyright (c) 2024 Huawei Technologies Co., Ltd. + # This file is a part of the CANN Open Software. + # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + # Please refer to the License for details. You may not use this file except in compliance with the License. 
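The collectives.h revisions above (patches 020 through 028) all orbit one idiom: a two-buffer ping-pong in UB that overlaps the GM-to-UB load of one block with the UB-to-GM store of the previous block, fenced by event flags. A minimal standalone sketch of that pattern, assuming AscendC HardEvent template arguments (which the plain-text rendering of the patches drops) and (dst, src, size-in-bytes) signatures for the CpGM2UB/CpUB2GM wrappers:

// Sketch only, not the series' exact code.
template <typename T>
FORCE_INLINE_AICORE void CopyPingPongSketch(__gm__ T *dst, __gm__ T *src,
    int64_t bytesRemain, __ubuf__ T *ub[2], int64_t ubBytes)
{
    AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);   // both buffers start free
    AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID1);
    for (int64_t i = 0, off = 0; bytesRemain > 0; ++i) {
        int64_t size = bytesRemain < ubBytes ? bytesRemain : ubBytes;
        event_t id = (i & 1) ? EVENT_ID0 : EVENT_ID1;             // alternate buffers
        AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(id);     // buffer free again?
        CpGM2UB(ub[i & 1], src + off, size);                      // GM -> UB load
        AscendC::SetFlag<AscendC::HardEvent::MTE2_MTE3>(id);
        AscendC::WaitFlag<AscendC::HardEvent::MTE2_MTE3>(id);     // load landed?
        CpUB2GM(dst + off, ub[i & 1], size);                      // UB -> GM store
        AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(id);      // mark buffer free
        bytesRemain -= size;
        off += size / sizeof(T);
    }
    AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);  // drain both flags
    AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID1);
}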
+ # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + # See LICENSE in the root of the software repository for the full text of the License. + # + set(CMAKE_CCE_COMPILER "@CMAKE_CCE_COMPILER@") + set(CMAKE_CCE_COMPILER_LOADED) + set(CMAKE_CCE_OUTPUT_EXTENSION @CMAKE_CCE_OUTPUT_EXTENSION@) + set(CMAKE_CCE_COMPILER_ENV_VAR "@CMAKE_CCE_COMPILER_ENV_VAR@") + set(CMAKE_CCE_SOURCE_FILE_EXTENSIONS @CAMKE_CCE_SOURCE_FILE_EXTENSIONS@) diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/comm/lcal/cmake/CMakeCCEInformation.cmake new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake b/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/cmake/CMakeTestCCECompiler.cmake b/comm/lcal/cmake/CMakeTestCCECompiler.cmake new file mode 100644 index 00000000..e69de29b -- Gitee From 2a502ff08cec2fb2f4ea9b2db38697f4e05a8a3d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:19:38 +0800 Subject: [PATCH 030/414] cmakein 2nd --- comm/lcal/cmake/CMakeCCECompiler.cmake.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/cmake/CMakeCCECompiler.cmake.in b/comm/lcal/cmake/CMakeCCECompiler.cmake.in index d77d1a4d..073b728b 100644 --- a/comm/lcal/cmake/CMakeCCECompiler.cmake.in +++ b/comm/lcal/cmake/CMakeCCECompiler.cmake.in @@ -8,7 +8,7 @@ # See LICENSE in the root of the software repository for the full text of the License. # set(CMAKE_CCE_COMPILER "@CMAKE_CCE_COMPILER@") - set(CMAKE_CCE_COMPILER_LOADED) + set(CMAKE_CCE_COMPILER_LOADED 1) set(CMAKE_CCE_OUTPUT_EXTENSION @CMAKE_CCE_OUTPUT_EXTENSION@) set(CMAKE_CCE_COMPILER_ENV_VAR "@CMAKE_CCE_COMPILER_ENV_VAR@") - set(CMAKE_CCE_SOURCE_FILE_EXTENSIONS @CAMKE_CCE_SOURCE_FILE_EXTENSIONS@) + set(CMAKE_CCE_SOURCE_FILE_EXTENSIONS @CMAKE_CCE_SOURCE_FILE_EXTENSIONS@) -- Gitee From 853f369f841befd70d906c9ee4164541a9478348 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:29:38 +0800 Subject: [PATCH 031/414] info cmake --- comm/lcal/cmake/CMakeCCEInformation.cmake | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/comm/lcal/cmake/CMakeCCEInformation.cmake index e69de29b..63739908 100644 --- a/comm/lcal/cmake/CMakeCCEInformation.cmake +++ b/comm/lcal/cmake/CMakeCCEInformation.cmake @@ -0,0 +1,28 @@ +# + # Copyright (c) 2024 Huawei Technologies Co., Ltd. + # This file is a part of the CANN Open Software. + # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + # Please refer to the License for details. You may not use this file except in compliance with the License. + # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + # See LICENSE in the root of the software repository for the full text of the License. 
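The .cmake.in template added above is only half of the mechanism: CMake expands its @VAR@ placeholders with configure_file() once the compiler has been located, which is exactly what the determine module does later in the series. A rough sketch of that expansion step; the compiler path below is illustrative, not taken from the patches:

# Sketch: how CMakeDetermineCCECompiler.cmake is expected to fill the template.
set(CMAKE_CCE_COMPILER "/usr/local/Ascend/ascend-toolkit/latest/ccec_compiler/bin/ccec")  # illustrative
set(CMAKE_CCE_COMPILER_ENV_VAR "CCEC")
set(CMAKE_CCE_SOURCE_FILE_EXTENSIONS cce;cpp)
configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeCCECompiler.cmake.in
    ${CMAKE_PLATFORM_INFO_DIR}/CMakeCCECompiler.cmake @ONLY)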
+ # +include(CMakeCommonLanguageInclude) +include(Compiler/CMakeCommonCompilerMacros) +set(CMAKE_INCLUDE_FLAG_CCE "-I") +if(UNIX) + set(CMAKE_CCE_OUTPUT_EXTENSION .o) +else() + set(CMAKE_CCE_OUTPUT_EXTENSION .obj) +endif() +set(CMAKE_DEPFILE_FLAGS_CCE "-MD -MT -MF ") +set(CMAKE_CCE_DEPFILE_FORMAT gcc) +set(CMAKE_CCE_DEPENDS_USE_COMPILER TRUE) +if(NOT CMAKE_CCE_COMPILE_OBJECT) + set(CMAKE_CCE_COMPILE_OBJECT + "${CMAKE_CCE_COMPILER} -xcce \ + ${__IMPLICIT_INCLUDE} ${_CMAKE_CCE_BUILTIN_INCLUDE_PATH}\ + $<_CMAKE_COMPILE_AS_CCE_FLAG} ${_CMAKE_CCE_COMPILE_OPTIONS}\ + ${_CMAKE_CCE_COMMON_COMPILE_OPTIONS} -o -c ") +endif() + -- Gitee From 9de83815de3e1662cd9196b4449fcbc90cfa8301 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:30:11 +0800 Subject: [PATCH 032/414] info cmake 2 --- comm/lcal/cmake/CMakeCCEInformation.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/comm/lcal/cmake/CMakeCCEInformation.cmake index 63739908..a7849157 100644 --- a/comm/lcal/cmake/CMakeCCEInformation.cmake +++ b/comm/lcal/cmake/CMakeCCEInformation.cmake @@ -21,8 +21,8 @@ set(CMAKE_CCE_DEPENDS_USE_COMPILER TRUE) if(NOT CMAKE_CCE_COMPILE_OBJECT) set(CMAKE_CCE_COMPILE_OBJECT "${CMAKE_CCE_COMPILER} -xcce \ - ${__IMPLICIT_INCLUDE} ${_CMAKE_CCE_BUILTIN_INCLUDE_PATH}\ - $<_CMAKE_COMPILE_AS_CCE_FLAG} ${_CMAKE_CCE_COMPILE_OPTIONS}\ + ${__IMPLICIT_INCLUDES} ${_CMAKE_CCE_BUILTIN_INCLUDE_PATH}\ + ${_CMAKE_COMPILE_AS_CCE_FLAG} ${_CMAKE_CCE_COMPILE_OPTIONS}\ ${_CMAKE_CCE_COMMON_COMPILE_OPTIONS} -o -c ") endif() -- Gitee From 318d092989c2b44c039e866d3766192cf91c150b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:37:45 +0800 Subject: [PATCH 033/414] testcompilercmake --- comm/lcal/cmake/CMakeCCEInformation.cmake | 16 ++++++++-------- comm/lcal/cmake/CMakeTestCCECompiler.cmake | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/comm/lcal/cmake/CMakeCCEInformation.cmake index a7849157..e85910f9 100644 --- a/comm/lcal/cmake/CMakeCCEInformation.cmake +++ b/comm/lcal/cmake/CMakeCCEInformation.cmake @@ -1,12 +1,12 @@ # - # Copyright (c) 2024 Huawei Technologies Co., Ltd. - # This file is a part of the CANN Open Software. - # Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - # Please refer to the License for details. You may not use this file except in compliance with the License. - # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - # See LICENSE in the root of the software repository for the full text of the License. - # +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
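The CMAKE_DEPFILE_FLAGS_CCE and CMAKE_CCE_COMPILE_OBJECT values above have visibly lost CMake's angle-bracket placeholders in this rendering ("-MD -MT -MF" and "-o -c" trail off with nothing after them). A conventional reconstruction of such rule variables, offered as an assumption rather than the series' exact text:

# Sketch, assuming the standard CMake rule placeholders the rendering dropped.
set(CMAKE_DEPFILE_FLAGS_CCE "-MD -MT <DEP_TARGET> -MF <DEP_FILE>")
if(NOT CMAKE_CCE_COMPILE_OBJECT)
    set(CMAKE_CCE_COMPILE_OBJECT
        "<CMAKE_CCE_COMPILER> -xcce <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE>")
endif()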
+# include(CMakeCommonLanguageInclude) include(Compiler/CMakeCommonCompilerMacros) set(CMAKE_INCLUDE_FLAG_CCE "-I") diff --git a/comm/lcal/cmake/CMakeTestCCECompiler.cmake b/comm/lcal/cmake/CMakeTestCCECompiler.cmake index e69de29b..7c419c10 100644 --- a/comm/lcal/cmake/CMakeTestCCECompiler.cmake +++ b/comm/lcal/cmake/CMakeTestCCECompiler.cmake @@ -0,0 +1,10 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# +set(CMAKE_CCE_COMPILE_WORKS 1 CACHE INTERNAL "") \ No newline at end of file -- Gitee From a27a0df3d2daa33fc30efdbf6a62768721d5e4cb Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:49:11 +0800 Subject: [PATCH 034/414] determinecompiler --- .../cmake/CMakeDetermineCCECompiler.cmake | 31 +++++++++++++++++++ comm/lcal/cmake/CMakeTestCCECompiler.cmake | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake b/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake index e69de29b..21248c1b 100644 --- a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake +++ b/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake @@ -0,0 +1,31 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# +set(PRIVATE_CCEC_PATH ${CMAKE_SOURCE_DIR}/3rdparty/compiler) +find_program(CMAKE_CCE_COMPILER + NAMES "ccec" + HINTS "${PRIVATE_CCEC_PATH}/ccec_compiler/bin" + HINTS "${ASCEND_HOME_PATH}/${ARCH}--linux/ccec_compiler/bin" + DOC "CCE Compiler" +) +find_program(CMAKE_CCE_LINKER + NAMES "ld.lld" + HINTS "${PRIVATE_CCEC_PATH}/ccec_compiler/bin" + HINTS "${ASCEND_HOME_PATH}/${ARCH}--linux/ccec_compiler/bin" + DOC "CCE Linker" +) +message(STATUS "CMAKE_CCE_COMPILER: ${CMAKE_CCE_COMPILER}") +message(STATUS "CMAKE_PLATFORM_INFO_DIR: "${CMAKE_PLATFORM_INFO_DIR}) +configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeCCEInformation.cmake.in + ${CMAKE_PLATFORM_INFO_DIR}/CMakeCCEInformation.cmake + @ONLY +) +set(CMAKE_CCE_SOURCE_FILE_EXTENSION cce;cpp) +set(CMAKE_CCE_COMPILER_ENV_VAR "CCEC") + diff --git a/comm/lcal/cmake/CMakeTestCCECompiler.cmake b/comm/lcal/cmake/CMakeTestCCECompiler.cmake index 7c419c10..1136cb4b 100644 --- a/comm/lcal/cmake/CMakeTestCCECompiler.cmake +++ b/comm/lcal/cmake/CMakeTestCCECompiler.cmake @@ -7,4 +7,4 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. 
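The determine module above resolves the toolchain with find_program(): HINTS accumulate, and the first existing candidate wins, so a vendored compiler under 3rdparty shadows the installed toolkit. The test module then short-circuits CMake's usual try-compile probe by caching the verdict. The two moves, reduced to their core:

# Sketch of the probe order used above: vendored path first, toolkit second.
find_program(CMAKE_CCE_COMPILER
    NAMES ccec
    HINTS "${PRIVATE_CCEC_PATH}/ccec_compiler/bin"
          "${ASCEND_HOME_PATH}/${ARCH}-linux/ccec_compiler/bin"
)
# Caching the try-compile result up front skips CMake's compiler test step.
set(CMAKE_CCE_COMPILER_WORKS 1 CACHE INTERNAL "")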
# -set(CMAKE_CCE_COMPILE_WORKS 1 CACHE INTERNAL "") \ No newline at end of file +set(CMAKE_CCE_COMPILER_WORKS 1 CACHE INTERNAL "") \ No newline at end of file -- Gitee From bb498a899bc573a1ad25c257aca4cdbc42a9a037 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 13:50:37 +0800 Subject: [PATCH 035/414] determine 2nd --- comm/lcal/cmake/CMakeDetermineCCECompiler.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake b/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake index 21248c1b..955cf6af 100644 --- a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake +++ b/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake @@ -11,21 +11,21 @@ set(PRIVATE_CCEC_PATH ${CMAKE_SOURCE_DIR}/3rdparty/compiler) find_program(CMAKE_CCE_COMPILER NAMES "ccec" HINTS "${PRIVATE_CCEC_PATH}/ccec_compiler/bin" - HINTS "${ASCEND_HOME_PATH}/${ARCH}--linux/ccec_compiler/bin" + HINTS "${ASCEND_HOME_PATH}/${ARCH}-linux/ccec_compiler/bin" DOC "CCE Compiler" ) find_program(CMAKE_CCE_LINKER NAMES "ld.lld" HINTS "${PRIVATE_CCEC_PATH}/ccec_compiler/bin" - HINTS "${ASCEND_HOME_PATH}/${ARCH}--linux/ccec_compiler/bin" + HINTS "${ASCEND_HOME_PATH}/${ARCH}-linux/ccec_compiler/bin" DOC "CCE Linker" ) -message(STATUS "CMAKE_CCE_COMPILER: ${CMAKE_CCE_COMPILER}") +message(STATUS "CMAKE_CCE_COMPILER: " ${CMAKE_CCE_COMPILER}) message(STATUS "CMAKE_PLATFORM_INFO_DIR: "${CMAKE_PLATFORM_INFO_DIR}) -configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeCCEInformation.cmake.in - ${CMAKE_PLATFORM_INFO_DIR}/CMakeCCEInformation.cmake +configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeCCECompiler.cmake.in + ${CMAKE_PLATFORM_INFO_DIR}/CMakeCCECompiler.cmake @ONLY ) -set(CMAKE_CCE_SOURCE_FILE_EXTENSION cce;cpp) +set(CMAKE_CCE_SOURCE_FILE_EXTENSIONS cce;cpp) set(CMAKE_CCE_COMPILER_ENV_VAR "CCEC") -- Gitee From 5241eabfc29fbfa47ac458ec0be85b9ca24734d7 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 14:04:23 +0800 Subject: [PATCH 036/414] cmakelists --- comm/lcal/CMakeLists.txt | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/comm/lcal/CMakeLists.txt b/comm/lcal/CMakeLists.txt index e69de29b..e4931b64 100644 --- a/comm/lcal/CMakeLists.txt +++ b/comm/lcal/CMakeLists.txt @@ -0,0 +1,59 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
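At this point the series has all four pieces CMake needs for a custom language: the Determine module that finds ccec, the Compiler template it expands, the Information module with the compile rules, and the Test stub. A minimal consumer, sketched under those assumptions (kernels.cce is a hypothetical source name):

# Sketch: enabling the custom CCE language from a consuming CMakeLists.txt.
cmake_minimum_required(VERSION 3.12)
project(demo LANGUAGES CXX)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
enable_language(CCE)                     # runs CMakeDetermineCCECompiler.cmake
add_library(kernels STATIC kernels.cce)  # hypothetical .cce source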
+#
+cmake_minimum_required(VERSION 3.12)
+project(Lcal LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 14)
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+option(UCS_CXX11_ABI "USE_CXX11_ABI" 0)
+
+IF (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+ set(ARCH aarch64)
+ELSEIF (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+ set(ARCH x86_64)
+ENDIF()
+if(USE_CXX11_ABI)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+else()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+endif()
+
+message("== CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}")
+message("== CMAKE_BUILD_TYPE ${CMAKE_BUILD_DIR}")
+
+if (DEFINED ENV{ASCEND_HOME_PATH})
+ set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH})
+else()
+ message("ASCEND_HOME_PATH not set! using default path!")
+ set(ASCEND_HOME_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+endif()
+
+message("== ASCEND_HOME_PATH ${ASCEND_HOME_PATH}")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
+add_link_options(-Wl,-z,relro,-z,now)
+add_link_options(-s)
+
+include_directories(
+ ${CMAKE_CURRENT_LIST_DIR}/include
+ ${CMAKE_CURRENT_LIST_DIR}/inlude/lcoc/
+ ${CMAKE_CURRENT_LIST_DIR}/include/lcoc/tiling/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/hccl/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/runtime/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/msprof/
+ )
+link_directories(${ASCEND_HOME_PATH}/${ARCH}-linux/lib64/)
+
+set(AIV_ARCH dav-c220-vec)
+set(AIV_ARCH dav-c220-cube)
+
+add_subdirectory(src)
-- Gitee

From 4b5d9f968a6d8f85afec5a2f36323f5bf4605213 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Sun, 17 Aug 2025 14:06:44 +0800
Subject: [PATCH 037/414] cmakelist 2nd

---
 comm/lcal/CMakeLists.txt | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/comm/lcal/CMakeLists.txt b/comm/lcal/CMakeLists.txt
index e4931b64..428491c0 100644
--- a/comm/lcal/CMakeLists.txt
+++ b/comm/lcal/CMakeLists.txt
@@ -12,7 +12,7 @@ project(Lcal LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 14)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-option(UCS_CXX11_ABI "USE_CXX11_ABI" 0)
+option(USE_CXX11_ABI "USE_CXX11_ABI" 0)
IF (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
 set(ARCH aarch64)
@@ -26,16 +26,17 @@ else()
endif()
message("== CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}")
-message("== CMAKE_BUILD_TYPE ${CMAKE_BUILD_DIR}")
+message("== CMAKE_SOURCE_DIR ${CMAKE_BUILD_DIR}")
+# Get the environment variable
if (DEFINED ENV{ASCEND_HOME_PATH})
 set(ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH})
else()
 message("ASCEND_HOME_PATH not set! using default path!")
+ set(ASCEND_HOME_PATH /usr/local/Ascend/ascend-toolkit/latest)
endif()
-message("== ASCEND_HOME_PATH ${ASCEND_HOME_PATH}")
+message("== ASCEND_HOME_PATH: ${ASCEND_HOME_PATH}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
add_link_options(-Wl,-z,relro,-z,now)
@@ -43,17 +44,17 @@ add_link_options(-s)
include_directories(
 ${CMAKE_CURRENT_LIST_DIR}/include
- ${CMAKE_CURRENT_LIST_DIR}/inlude/lcoc/
+ ${CMAKE_CURRENT_LIST_DIR}/include/lcoc/
 ${CMAKE_CURRENT_LIST_DIR}/include/lcoc/tiling/
 ${ASCEND_HOME_PATH}/${ARCH}-linux/include/
 ${ASCEND_HOME_PATH}/${ARCH}-linux/include/hccl/
- ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/
+ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment
 ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/runtime/
 ${ASCEND_HOME_PATH}/${ARCH}-linux/include/experiment/msprof/
 )
link_directories(${ASCEND_HOME_PATH}/${ARCH}-linux/lib64/)
set(AIV_ARCH dav-c220-vec)
-set(AIV_ARCH dav-c220-cube)
+set(AIC_ARCH dav-c220-cube)
add_subdirectory(src)
-- Gitee

From 7d079a7375f823446f62bfd4735130e57d9422d8 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Sun, 17 Aug 2025 14:07:08 +0800
Subject: [PATCH 038/414] cmakelists 3rd

---
 comm/lcal/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/CMakeLists.txt b/comm/lcal/CMakeLists.txt
index 428491c0..8687e512 100644
--- a/comm/lcal/CMakeLists.txt
+++ b/comm/lcal/CMakeLists.txt
@@ -26,7 +26,7 @@ else()
endif()
message("== CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}")
-message("== CMAKE_SOURCE_DIR ${CMAKE_BUILD_DIR}")
+message("== CMAKE_SOURCE_DIR ${CMAKE_SOURCE_DIR}")
# Get the environment variable
if (DEFINED ENV{ASCEND_HOME_PATH})
-- Gitee

From e54c13caf24f5ba621a790b9eb4f6f6ff4e4b1a8 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Sun, 17 Aug 2025 14:08:50 +0800
Subject: [PATCH 039/414] cmakelist ultimate

---
 comm/lcal/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/CMakeLists.txt b/comm/lcal/CMakeLists.txt
index 8687e512..a5e63434 100644
--- a/comm/lcal/CMakeLists.txt
+++ b/comm/lcal/CMakeLists.txt
@@ -10,7 +10,7 @@ cmake_minimum_required(VERSION 3.12)
project(Lcal LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 14)
-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
option(USE_CXX11_ABI "USE_CXX11_ABI" 0)
-- Gitee

From e81160d13f00ece3e0b1be4463cb6b25e4e00cc2 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Sun, 17 Aug 2025 14:13:16 +0800
Subject: [PATCH 040/414] op1cpp

---
 comm/lcal/src/ascendc_kernels/lccl_op1.cpp | 25 ++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 comm/lcal/src/ascendc_kernels/lccl_op1.cpp

diff --git a/comm/lcal/src/ascendc_kernels/lccl_op1.cpp b/comm/lcal/src/ascendc_kernels/lccl_op1.cpp
new file mode 100644
index 00000000..140899e1
--- /dev/null
+++ b/comm/lcal/src/ascendc_kernels/lccl_op1.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __DAV_C220_VEC__ + +#include "lccl_op.h" + +LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); + +#endif + +#ifdef __DAV_C220_CUBE__ + +#include "lccl_op.h" + +LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); + +#endif + -- Gitee From e40b8eae0775033af64e63ae5e385db05a70739b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sun, 17 Aug 2025 14:14:00 +0800 Subject: [PATCH 041/414] op1 ultimate --- comm/lcal/src/ascendc_kernels/lccl_op1.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op1.cpp b/comm/lcal/src/ascendc_kernels/lccl_op1.cpp index 140899e1..62539f3d 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op1.cpp +++ b/comm/lcal/src/ascendc_kernels/lccl_op1.cpp @@ -19,7 +19,7 @@ LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); #include "lccl_op.h" -LCCL_TYPE_AIV_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); +LCCL_TYPE_AIC_FUNC(LCCL_ALL_REDUCE_FUNC_AUTO_DEF); #endif -- Gitee From 61efba28bab271c8829c8d9163a88811530635ce Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 09:39:24 +0800 Subject: [PATCH 042/414] ascendc cmake 1st --- comm/lcal/src/ascendc.cmake | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 comm/lcal/src/ascendc.cmake diff --git a/comm/lcal/src/ascendc.cmake b/comm/lcal/src/ascendc.cmake new file mode 100644 index 00000000..1c3f32a5 --- /dev/null +++ b/comm/lcal/src/ascendc.cmake @@ -0,0 +1,51 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
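lccl_op1.cpp above is compiled twice, once with __DAV_C220_VEC__ and once with __DAV_C220_CUBE__, and patch 041's one-line fix routes the cube pass through the AIC variant of the registration macro. The real LCCL_TYPE_AIV_FUNC / LCCL_TYPE_AIC_FUNC definitions live in lccl_op.h, which this series does not show; a purely hypothetical sketch of the pattern they imply:

// Hypothetical sketch only; the actual macros in lccl_op.h may differ.
#if defined(__DAV_C220_VEC__)
#define LCCL_CORE_SUFFIX _aiv
#elif defined(__DAV_C220_CUBE__)
#define LCCL_CORE_SUFFIX _aic
#endif

#define LCCL_CONCAT2(a, b) a##b
#define LCCL_CONCAT(a, b) LCCL_CONCAT2(a, b)
// Would expand to e.g. LcclAllReduce_aiv or LcclAllReduce_aic:
#define LCCL_KERNEL(name) \
    extern "C" __global__ __aicore__ void LCCL_CONCAT(name, LCCL_CORE_SUFFIX)( \
        GM_ADDR input, GM_ADDR output, GM_ADDR commArgs)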
+# +enable_language(CCE) +if(USE_MSSANITIZER) + set(SANITIZER_FLAGS + -g --ce-enable-sanitizer + ) + set(SANITIZER_DEPEND_LIBS + --dependent-libraries ${ASCEND_HOME_PATH}/tools/mssanitizer/lib64/libsanitizer_stub_dav-c220-vec.a + --dependent-libraries ${ASCEND_HOME_PATH}/tools/mssanitizer/lib64/libsanitizer_stub_dav-c220-cube.a + ) +else() + set(SANITIZER_FLAGS) + set(SANITIZER_DEPEND_LIBS) +endif() +set(CCE_COMPILE_OPTION + -02 -std=gnu++17 + --cce-aicore-only + --cce-aicore-only + -Wno-deprecated-declarations + ${SANITIZER_FLAGS} + "SHELL:-mllvm -cce-aicore-long-call" + "SHELL:-mllvm -cce-aicore-function-stach-size=16000" + "SHELL:-mllvm -cce-aicore-record-overflow=false" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm -cce-aicore-addr-transform" + "SHELL:-mllvm --cce-aicore-jump-expand=true" +) +set(PRIVATE_CCEC_PATH ${CMAKE_SOURCE_DIR}/3rdparty/compiler) +if (EXISTS ${PRIVATE_CCEC_PATH}) + message(STATUS "Using suctom ccec include directories") + set(CCE_INCLUDE_BASE ${PRIVATE_CCEC_PATH}) +else() + set(CCE_INCLUDE_BASE ${ASCEND_HOME_PATH}/${ARCH}-linux) +endif() + +message(STATUS "Using tikcpp include directories") +include_directories( + ${ASCEND_HOME_PATH}/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0 + ${ASCEND_HOME_PATH}/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0/aarch64-target-linux-gnu/ + ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/ + ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/interfaces/ + ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/impl/ +) \ No newline at end of file -- Gitee From 86e25c70cddb05eca33c111b7313490aaee35261 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 09:42:37 +0800 Subject: [PATCH 043/414] ascendc cmake 2nd --- comm/lcal/src/ascendc.cmake | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/ascendc.cmake b/comm/lcal/src/ascendc.cmake index 1c3f32a5..f6f892aa 100644 --- a/comm/lcal/src/ascendc.cmake +++ b/comm/lcal/src/ascendc.cmake @@ -8,34 +8,37 @@ # See LICENSE in the root of the software repository for the full text of the License. 
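One detail in CCE_COMPILE_OPTION above deserves a comment: each -mllvm pair is wrapped as "SHELL:...". CMake de-duplicates and may reorder single compile options, which would detach a bare repeated -mllvm from its argument; the SHELL: prefix tells CMake to treat the quoted string as one shell-quoted group:

# SHELL: keeps a multi-token flag together through option de-duplication.
add_compile_options("SHELL:-mllvm -cce-aicore-long-call")  # stays one group
# add_compile_options(-mllvm -cce-aicore-long-call)        # two options, may be split apart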
#
 enable_language(CCE)
+# Configure the compile options
+# Define the sanitizer-related compile options: added when enabled, otherwise left empty
 if(USE_MSSANITIZER)
     set(SANITIZER_FLAGS
-        -g --ce-enable-sanitizer
+        -g --cce-enable-sanitizer
     )
     set(SANITIZER_DEPEND_LIBS
         --dependent-libraries ${ASCEND_HOME_PATH}/tools/mssanitizer/lib64/libsanitizer_stub_dav-c220-vec.a
         --dependent-libraries ${ASCEND_HOME_PATH}/tools/mssanitizer/lib64/libsanitizer_stub_dav-c220-cube.a
     )
 else()
-    set(SANITIZER_FLAGS)
+    set(SANITIZER_FLAGS) # empty
     set(SANITIZER_DEPEND_LIBS)
 endif()
 set(CCE_COMPILE_OPTION
-    -02 -std=gnu++17
+    -O2 -std=gnu++17
     --cce-aicore-only
     --cce-aicore-only
     -Wno-deprecated-declarations
     ${SANITIZER_FLAGS}
     "SHELL:-mllvm -cce-aicore-long-call"
-    "SHELL:-mllvm -cce-aicore-function-stach-size=16000"
+    "SHELL:-mllvm -cce-aicore-function-stack-size=16000"
     "SHELL:-mllvm -cce-aicore-record-overflow=false"
     "SHELL:-mllvm -cce-aicore-addr-transform"
     "SHELL:-mllvm -cce-aicore-addr-transform"
     "SHELL:-mllvm --cce-aicore-jump-expand=true"
 )
 set(PRIVATE_CCEC_PATH ${CMAKE_SOURCE_DIR}/3rdparty/compiler)
+# Configure the include paths
 if (EXISTS ${PRIVATE_CCEC_PATH})
-    message(STATUS "Using suctom ccec include directories")
+    message(STATUS "Using custom ccec include directories")
     set(CCE_INCLUDE_BASE ${PRIVATE_CCEC_PATH})
 else()
     set(CCE_INCLUDE_BASE ${ASCEND_HOME_PATH}/${ARCH}-linux)
@@ -46,6 +49,6 @@ include_directories(
     ${ASCEND_HOME_PATH}/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0
     ${ASCEND_HOME_PATH}/toolkit/toolchain/hcc/aarch64-target-linux-gnu/include/c++/7.3.0/aarch64-target-linux-gnu/
     ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/
-    ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/interfaces/
+    ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/interface/
     ${CCE_INCLUDE_BASE}/tikcpp/tikcfw/impl/
 )
\ No newline at end of file
-- Gitee

From a2ec44cd6bb965440db9d43fa7655e38526378ae Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Mon, 18 Aug 2025 09:44:43 +0800
Subject: [PATCH 044/414] ascendc cmake 3rd

---
 comm/lcal/src/ascendc.cmake | 2 --
 1 file changed, 2 deletions(-)

diff --git a/comm/lcal/src/ascendc.cmake b/comm/lcal/src/ascendc.cmake
index f6f892aa..27f3d366 100644
--- a/comm/lcal/src/ascendc.cmake
+++ b/comm/lcal/src/ascendc.cmake
@@ -25,14 +25,12 @@ endif()
 set(CCE_COMPILE_OPTION
     -O2 -std=gnu++17
     --cce-aicore-only
-    --cce-aicore-only
     -Wno-deprecated-declarations
     ${SANITIZER_FLAGS}
     "SHELL:-mllvm -cce-aicore-long-call"
     "SHELL:-mllvm -cce-aicore-function-stack-size=16000"
     "SHELL:-mllvm -cce-aicore-record-overflow=false"
     "SHELL:-mllvm -cce-aicore-addr-transform"
-    "SHELL:-mllvm -cce-aicore-addr-transform"
     "SHELL:-mllvm --cce-aicore-jump-expand=true"
 )
 set(PRIVATE_CCEC_PATH ${CMAKE_SOURCE_DIR}/3rdparty/compiler)
-- Gitee

From 449fa3943cab003766dbed464304c81ce021891f Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Mon, 18 Aug 2025 09:52:09 +0800
Subject: [PATCH 045/414] op2 1st

---
 comm/lcal/src/ascendc_kernels/lccl_op2.cpp | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 comm/lcal/src/ascendc_kernels/lccl_op2.cpp

diff --git a/comm/lcal/src/ascendc_kernels/lccl_op2.cpp b/comm/lcal/src/ascendc_kernels/lccl_op2.cpp
new file mode 100644
index 00000000..7a1f900e
--- /dev/null
+++ b/comm/lcal/src/ascendc_kernels/lccl_op2.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __DAV_C220_VEC__ + +#include "lccl_op.h" + +LCCL_TYPE_AIV_FUNC(LCCL_ALLGATHER_FUNC_AUTO_DEF); +LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF); +LCCL_TYPE_AIV_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF); + +#ifdef ENABLE_LCCL_MIX +LCCL_BROADCAST_FUNC_AUTO_DEF(_mix_aiv) +#else +LCCL_BROADCAST_FUNC_AUTO_DEF() +#endif +#endif +#ifdef __DAV_C220_CUBE__ + +#include "lccl_op.h" +LCCL_TYPE_AIV_FUNC(LCCL_ALLGATHER_FUNC_AUTO_DEF); +LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF); +LCCL_TYPE_AIV_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF); +#ifdef ENABLE_LCCL_MIX +LCCL_BROADCAST_FUNC_AUTO_DEF(_mix_aic) +#else +LCCL_BROADCAST_FUNC_AUTO_DEF() +#endif +#endif \ No newline at end of file -- Gitee From 991b17a90424d7d47f0fed9f48af456a08834b32 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 09:52:48 +0800 Subject: [PATCH 046/414] op2 2nd --- comm/lcal/src/ascendc_kernels/lccl_op2.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op2.cpp b/comm/lcal/src/ascendc_kernels/lccl_op2.cpp index 7a1f900e..76b52f73 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op2.cpp +++ b/comm/lcal/src/ascendc_kernels/lccl_op2.cpp @@ -24,9 +24,9 @@ LCCL_BROADCAST_FUNC_AUTO_DEF() #ifdef __DAV_C220_CUBE__ #include "lccl_op.h" -LCCL_TYPE_AIV_FUNC(LCCL_ALLGATHER_FUNC_AUTO_DEF); -LCCL_TYPE_AIV_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF); -LCCL_TYPE_AIV_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF); +LCCL_TYPE_AIC_FUNC(LCCL_ALLGATHER_FUNC_AUTO_DEF); +LCCL_TYPE_AIC_FUNC(LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF); +LCCL_TYPE_AIC_FUNC(LCCL_ALL2ALL_FUNC_AUTO_DEF); #ifdef ENABLE_LCCL_MIX LCCL_BROADCAST_FUNC_AUTO_DEF(_mix_aic) #else -- Gitee From 59fe910a7abe1ab5412f3a284935d1c0841b413c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 10:00:59 +0800 Subject: [PATCH 047/414] cclkarg1 --- comm/lcal/src/ccl_kernels_args.h | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 comm/lcal/src/ccl_kernels_args.h diff --git a/comm/lcal/src/ccl_kernels_args.h b/comm/lcal/src/ccl_kernels_args.h new file mode 100644 index 00000000..18fdffe7 --- /dev/null +++ b/comm/lcal/src/ccl_kernels_args.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
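lccl_op2.cpp above leans on the same trick for broadcast: the suffix argument of LCCL_BROADCAST_FUNC_AUTO_DEF is pasted into the symbol name, so the mix build exports LcalBroadcast_mix_aiv / LcalBroadcast_mix_aic while the plain build exports LcalBroadcast. A standalone sketch of just the token pasting, with a printf placeholder body rather than the real macro expansion:

    #include <cstdio>

    #define BROADCAST_DEF(suffix) \
        extern "C" void LcalBroadcast##suffix() { std::printf("%s\n", __func__); }

    #ifdef ENABLE_LCCL_MIX
    BROADCAST_DEF(_mix_aiv)   // symbol: LcalBroadcast_mix_aiv
    #else
    BROADCAST_DEF()           // symbol: LcalBroadcast
    #endif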
+ */ +#ifndef LCAL_CCL_KERNEL_ARGS_H +#define LCAL_CCL_KERNEL_ARGS_H + +#include "lcal_types.h" +#include "comm_args.h" + +namespace Lcal { +struct AscendCCLKernelArgs { + const void *input = nullptr; + const void *output = nullptr; + const void *commArgsPtr = nullptr; + int64_t count = 0; + int64_t magic = 0; + int op = 0; + int root = 0; + int cycleCount = 0; + const void *scale = nullptr; + int64_t scaleCount = 0; + const void *offset = nullptr; +}; + +struct CCLGatherArgs { + const void *embTable = nullptr; + const void *lookup = nullptr; + const void *revData = nullptr; + int64_t lookupLen = 0; + int64_t embTableLen = 0; + int64_t embTableDim = 0; +}; +} +#endif // LCAL_CCL_KERNEL_ARGS_H -- Gitee From 32b80642df51e972e41c3ac23e4272d9794e5d07 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 10:01:54 +0800 Subject: [PATCH 048/414] cclkarg2nd --- comm/lcal/src/{ccl_kernels_args.h => ccl_kernel_args.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/{ccl_kernels_args.h => ccl_kernel_args.h} (100%) diff --git a/comm/lcal/src/ccl_kernels_args.h b/comm/lcal/src/ccl_kernel_args.h similarity index 100% rename from comm/lcal/src/ccl_kernels_args.h rename to comm/lcal/src/ccl_kernel_args.h -- Gitee From 101d9ecd9752d990897d9af13976616fede03279 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 10:26:44 +0800 Subject: [PATCH 049/414] cmakelists src 1st --- comm/lcal/src/CmakeLists.txt | 58 ++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/comm/lcal/src/CmakeLists.txt b/comm/lcal/src/CmakeLists.txt index e69de29b..d3c65445 100644 --- a/comm/lcal/src/CmakeLists.txt +++ b/comm/lcal/src/CmakeLists.txt @@ -0,0 +1,58 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
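For context, AscendCCLKernelArgs in PATCH 047 is the plain-old-data bundle handed to the device kernels. A hedged host-side sketch of filling it for a non-quantized all-reduce; the surrounding launcher is not part of this series, so only the field assignments below are grounded in the struct definition above:

    #include "ccl_kernel_args.h"

    void PrepareAllReduceArgs(Lcal::AscendCCLKernelArgs &args, const void *in,
                              const void *out, const void *comm,
                              int64_t count, int64_t magic, int reduceOp)
    {
        args.input = in;
        args.output = out;
        args.commArgsPtr = comm;
        args.count = count;    // element count, not bytes
        args.magic = magic;    // per-launch epoch used by the sync flags
        args.op = reduceOp;
        args.scale = nullptr;  // quant path disabled
        args.offset = nullptr;
    }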
+# +set_source_files_properties(lcal_internal.cpp PROPERTIES COMILE_FLAGS "-03") + +set(LCAL_SOURCE_FILE lcal_comm.cpp lccl.cpp + lcal_internal.cpp lcal_internal.h lcal_wrap.cpp + tools/socket/lcal_sock_exchange.h + tools/socket/lcal_sock_exchange.cpp + coc_kernel_args.h coc_kernel_args.cpp lcoc.cpp lcoc_func.cpp +) +file(GLOB TILING_SOURCE_FILE tiling/*.cpp) +list(APPEND LCAL_SOURCE_FILE ${TILING_SOURCE_FILE}) + +add_library(lcal SHARED ${LCAL_SOURCE_FILE}) +add_library(lcal_static STATIC ${LCAL_SOURCE_FILE}) +set_target_properties(lcal_static PROPERTIES POSITION_INDEPENDENT_CODE ON) + +target_link_libraries(lcal ascendcl runtime profapi c_sec mki) +target_link_libraries(lcal_static ascendcl runtime profapi c_sec mki) + +message(STATUS "LCAL USE_MSSANITIZER = ${USE_MSSANITIZER}") +set(LCAL_CCE_PATH "/tmp/lcal_cce.o") +if(USE_MSSANITIZER) +math(EXPR LCAL_10P BIN_SIZE "128 * 1024 * 1024") +add_definitions(-DUSE_MSSANITIZER) +else() +math(EXPR LCAL_10P BIN_SIZE "5 * 1024 * 1024") +endif() + +add_definitions(-DLCAL_10P_BIN_SIZE=${LCAL_10P_BIN_SIZE}) + +add_subdirectory(kernels) +add_subdirectory(ascendc_kernels) + +add_custom_command( + OUTPUT ${LCAL_CCE_PATH} + COMMAND cat ascendc_kernels/lccl_op.o kernels/lcoc_op.o > ${LCAL_CCE_PATH} + COMMAND echo "concat op..." + DEPENDS lccl_op lcoc_op +) + +set_source_files_properties( + lcal_internal.cpp + PROPERTIES + OBJECT_DEPENDS ${LCAL_CCE_PATH} +) +install(TARGETS lcal LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(TARGETS lcal_static DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) + + + -- Gitee From d4ff9e658c21e1fdc768f4f57d331668dbaef440 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 10:27:48 +0800 Subject: [PATCH 050/414] cmakelists src 2nd --- comm/lcal/src/CmakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/CmakeLists.txt b/comm/lcal/src/CmakeLists.txt index d3c65445..ffcde4b9 100644 --- a/comm/lcal/src/CmakeLists.txt +++ b/comm/lcal/src/CmakeLists.txt @@ -7,7 +7,7 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. 
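PATCH 049 wires two build products together: the per-kernel objects are concatenated into /tmp/lcal_cce.o, and lcal_internal.cpp is marked OBJECT_DEPENDS on that blob so it rebuilds whenever the kernels change. How the library actually ingests the blob is outside this series; as a minimal sketch under that caveat, loading it from the path used by the custom command:

    #include <fstream>
    #include <vector>

    std::vector<char> LoadKernelBlob(const char *path = "/tmp/lcal_cce.o")
    {
        std::ifstream f(path, std::ios::binary | std::ios::ate);
        if (!f) {
            return {};  // blob missing: the concat build step has not run
        }
        std::vector<char> blob(static_cast<size_t>(f.tellg()));
        f.seekg(0);
        f.read(blob.data(), static_cast<std::streamsize>(blob.size()));
        return blob;
    }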
# -set_source_files_properties(lcal_internal.cpp PROPERTIES COMILE_FLAGS "-03") +set_source_files_properties(lcal_internal.cpp PROPERTIES COMPILE_FLAGS "-O3") set(LCAL_SOURCE_FILE lcal_comm.cpp lccl.cpp lcal_internal.cpp lcal_internal.h lcal_wrap.cpp @@ -28,13 +28,13 @@ target_link_libraries(lcal_static ascendcl runtime profapi c_sec mki) message(STATUS "LCAL USE_MSSANITIZER = ${USE_MSSANITIZER}") set(LCAL_CCE_PATH "/tmp/lcal_cce.o") if(USE_MSSANITIZER) -math(EXPR LCAL_10P BIN_SIZE "128 * 1024 * 1024") +math(EXPR LCAL_1OP BIN_SIZE "128 * 1024 * 1024") add_definitions(-DUSE_MSSANITIZER) else() -math(EXPR LCAL_10P BIN_SIZE "5 * 1024 * 1024") +math(EXPR LCAL_1OP BIN_SIZE "5 * 1024 * 1024") endif() -add_definitions(-DLCAL_10P_BIN_SIZE=${LCAL_10P_BIN_SIZE}) +add_definitions(-DLCAL_1OP_BIN_SIZE=${LCAL_1OP_BIN_SIZE}) add_subdirectory(kernels) add_subdirectory(ascendc_kernels) -- Gitee From 90dc910a7c2448c3d06d5634bda9590a9b1f24f1 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 10:28:43 +0800 Subject: [PATCH 051/414] src cmakelists 3rd --- comm/lcal/src/CmakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/CmakeLists.txt b/comm/lcal/src/CmakeLists.txt index ffcde4b9..400edecd 100644 --- a/comm/lcal/src/CmakeLists.txt +++ b/comm/lcal/src/CmakeLists.txt @@ -28,10 +28,10 @@ target_link_libraries(lcal_static ascendcl runtime profapi c_sec mki) message(STATUS "LCAL USE_MSSANITIZER = ${USE_MSSANITIZER}") set(LCAL_CCE_PATH "/tmp/lcal_cce.o") if(USE_MSSANITIZER) -math(EXPR LCAL_1OP BIN_SIZE "128 * 1024 * 1024") +math(EXPR LCAL_1OP_BIN_SIZE "128 * 1024 * 1024") add_definitions(-DUSE_MSSANITIZER) else() -math(EXPR LCAL_1OP BIN_SIZE "5 * 1024 * 1024") +math(EXPR LCAL_1OP_BIN_SIZE "5 * 1024 * 1024") endif() add_definitions(-DLCAL_1OP_BIN_SIZE=${LCAL_1OP_BIN_SIZE}) -- Gitee From 6783a4286b2e3867cf5613f8b6b690527f57f1a0 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 11:07:45 +0800 Subject: [PATCH 052/414] src kernel cmakelists 1st --- comm/lcal/src/kernels/CMakeLists.txt | 54 +++++++++++++++++++++++++++ comm/lcal/src/kernels/collectives.cce | 0 2 files changed, 54 insertions(+) create mode 100644 comm/lcal/src/kernels/CMakeLists.txt create mode 100644 comm/lcal/src/kernels/collectives.cce diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/comm/lcal/src/kernels/CMakeLists.txt new file mode 100644 index 00000000..e539cd56 --- /dev/null +++ b/comm/lcal/src/kernels/CMakeLists.txt @@ -0,0 +1,54 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
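The LCAL_1OP_BIN_SIZE dance in PATCHes 049-051 fixes two transposed names and a digit: math(EXPR) works out to 5 * 1024 * 1024 = 5242880 bytes per op normally, and 128 MiB under the sanitizer's larger images. The kernels CMake file that follows pads every linked op image to exactly this size with truncate, which only makes sense if consumers locate op i by fixed-stride arithmetic; a sketch of that assumption (the consuming code is not shown in this series):

    #include <cstddef>

    #ifndef LCAL_1OP_BIN_SIZE
    #define LCAL_1OP_BIN_SIZE (5 * 1024 * 1024)  // 128 MiB with USE_MSSANITIZER
    #endif

    // Assumed layout: N op images, each padded to LCAL_1OP_BIN_SIZE, back to back.
    inline const char *OpImage(const char *blob, size_t opIndex)
    {
        return blob + opIndex * static_cast<size_t>(LCAL_1OP_BIN_SIZE);
    }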
+#
+include(../ascendc.cmake)
+set(OP_NAMES pure_matmul matmul_allreduce matmul_reduce_scatter allgather_matmul
+    allgather_matmul_reduce_scatter alltoallv_allgather_matmul matmul_reduce_scatter_alltoallv)
+
+file(GLOB KERNEL_FILES *.cpp)
+set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE)
+file(GLOB KERNEL_FILES2 *.cpp)
+set_source_files_properties(${KERNEL_FILES2} PROPERTIES LANGUAGE CCE)
+
+foreach(OP_NAME IN LISTS OP_NAMES)
+    add_library(lcoc_${OP_NAME}_aic_obj OBJECT coc_${OP_NAME}.cce)
+    target_compile_options(lcoc_${OP_NAME}_aic_obj PRIVATE
+        ${CCE_COMPILE_OPTION}
+        --cce-aicore-arch=${AIV_ARCH}
+    )
+    add_library(lcoc_${OP_NAME}_aiv_obj OBJECT coc_${OP_NAME}.cce)
+    target_compile_options(lcoc_${OP_NAME}_aiv_obj PRIVATE
+        ${CCE_COMPILE_OPTION}
+        --cce-aicore-arch=${AIV_ARCH}
+    )
+    add_custom_target(${OP_NAME}_o
+        COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0
+            "CMakeFiles/lcoc_${OP_NAME}_aic_obj.dir/coc_${OP_NAME}*.o"
+            "CMakeFiles/lcoc_${OP_NAME}_aiv_obj.dir/coc_${OP_NAME}*.o"
+            ${SANITIZER_DEPEND_LIBS}
+            --static -o "lcal_coc_${OP_NAME}.o" --allow-multiple-definition
+        COMMAND truncate -c -s ${LCAL_1OP_BIN_SIZE} "lcal_coc_${OP_NAME}.0"
+    )
+endforeach()
+# Build the list of output file names, each with a .o suffix
+set(OUTPUT_FILES "")
+foreach(OP_NAME IN LISTS OP_NAMES)
+    list(APPEND OUTPUT_FILES "lcal_coc_${OP_NAME}.o")
+endforeach()
+
+add_custom_target(lcoc_op
+    COMMAND cat ${OUTPUT_FILES} > lcoc_op.option
+)
+foreach(OP_NAME IN LISTS OP_NAMES)
+    add_dependencies(${OP_NAME}_o lcoc_${OP_NAME}_aic_obj lcoc_${OP_NAME}
+    _aiv_obj)
+    add_dependencies(lcoc_op ${OP_NAME}_o)
+endforeach()
+
+
diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce
new file mode 100644
index 00000000..e69de29b
-- Gitee

From 1baeb442ed41c0c9266be50ccacba7b8d2d00e76 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Mon, 18 Aug 2025 11:13:06 +0800
Subject: [PATCH 053/414] Refactor CMakeLists.txt: Consolidate OP_NAMES and fix AIC_ARCH reference

---
 comm/lcal/src/kernels/CMakeLists.txt | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/comm/lcal/src/kernels/CMakeLists.txt
index e539cd56..5de5bc6f 100644
--- a/comm/lcal/src/kernels/CMakeLists.txt
+++ b/comm/lcal/src/kernels/CMakeLists.txt
@@ -8,19 +8,18 @@
 # See LICENSE in the root of the software repository for the full text of the License.
#
 include(../ascendc.cmake)
-set(OP_NAMES pure_matmul matmul_allreduce matmul_reduce_scatter allgather_matmul
-    allgather_matmul_reduce_scatter alltoallv_allgather_matmul matmul_reduce_scatter_alltoallv)
+set(OP_NAMES pure_matmul matmul_allreduce matmul_reduce_scatter allgather_matmul allgather_matmul_reduce_scatter alltoallv_allgather_matmul matmul_reduce_scatter_alltoallv)
 
 file(GLOB KERNEL_FILES *.cpp)
 set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE)
-file(GLOB KERNEL_FILES2 *.cpp)
+file(GLOB KERNEL_FILES2 *.cce)
 set_source_files_properties(${KERNEL_FILES2} PROPERTIES LANGUAGE CCE)
 
 foreach(OP_NAME IN LISTS OP_NAMES)
     add_library(lcoc_${OP_NAME}_aic_obj OBJECT coc_${OP_NAME}.cce)
     target_compile_options(lcoc_${OP_NAME}_aic_obj PRIVATE
         ${CCE_COMPILE_OPTION}
-        --cce-aicore-arch=${AIV_ARCH}
+        --cce-aicore-arch=${AIC_ARCH}
     )
     add_library(lcoc_${OP_NAME}_aiv_obj OBJECT coc_${OP_NAME}.cce)
     target_compile_options(lcoc_${OP_NAME}_aiv_obj PRIVATE
@@ -33,7 +32,7 @@ foreach(OP_NAME IN LISTS OP_NAMES)
         "CMakeFiles/lcoc_${OP_NAME}_aiv_obj.dir/coc_${OP_NAME}*.o"
         ${SANITIZER_DEPEND_LIBS}
         --static -o "lcal_coc_${OP_NAME}.o" --allow-multiple-definition
-        COMMAND truncate -c -s ${LCAL_1OP_BIN_SIZE} "lcal_coc_${OP_NAME}.0"
+        COMMAND truncate -c -s ${LCAL_1OP_BIN_SIZE} "lcal_coc_${OP_NAME}.o"
     )
 endforeach()
 # Build the list of output file names, each with a .o suffix
 set(OUTPUT_FILES "")
@@ -46,8 +45,7 @@ add_custom_target(lcoc_op
     COMMAND cat ${OUTPUT_FILES} > lcoc_op.option
 )
 foreach(OP_NAME IN LISTS OP_NAMES)
-    add_dependencies(${OP_NAME}_o lcoc_${OP_NAME}_aic_obj lcoc_${OP_NAME}
-    _aiv_obj)
+    add_dependencies(${OP_NAME}_o lcoc_${OP_NAME}_aic_obj lcoc_${OP_NAME}_aiv_obj)
     add_dependencies(lcoc_op ${OP_NAME}_o)
 endforeach()
 
-- Gitee

From ae927f63d8cd42ff23333a2c1f242e7b9f1d6482 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Mon, 18 Aug 2025 11:13:43 +0800
Subject: [PATCH 054/414] Fix output filename in lcoc_op target to use correct extension

---
 comm/lcal/src/kernels/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/comm/lcal/src/kernels/CMakeLists.txt
index 5de5bc6f..a45f05f2 100644
--- a/comm/lcal/src/kernels/CMakeLists.txt
+++ b/comm/lcal/src/kernels/CMakeLists.txt
@@ -42,7 +42,7 @@ foreach(OP_NAME IN LISTS OP_NAMES)
 endforeach()
 
 add_custom_target(lcoc_op
-    COMMAND cat ${OUTPUT_FILES} > lcoc_op.option
+    COMMAND cat ${OUTPUT_FILES} > lcoc_op.o
 )
 foreach(OP_NAME IN LISTS OP_NAMES)
     add_dependencies(${OP_NAME}_o lcoc_${OP_NAME}_aic_obj lcoc_${OP_NAME}_aiv_obj)
-- Gitee

From 4c7358cffa962ea2e93af26afc1b12e936a1c76b Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Mon, 18 Aug 2025 14:47:46 +0800
Subject: [PATCH 055/414] Add AllgatherHierarchyDoubleRing class for efficient data gathering in distributed systems

---
 .../91093/allgather_hierarchy_double_ring.h   | 258 ++++++++++++++++++
 .../src/ascendc_kernels/allreduce_big_data.h  |  23 +-
 2 files changed, 267 insertions(+), 14 deletions(-)
 create mode 100644 comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h

diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h
new file mode 100644
index 00000000..8e76ae5c
--- /dev/null
+++ b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
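The AllgatherHierarchyDoubleRing header added below is built around IpcQueue, whose implementation is not shown in this series. As a mental model only: each queue is a fixed-depth ring of equal-sized blocks inside the peer-visible IPC buffer, where EnQue claims the next write block, ReadFront exposes the oldest unread block, and DeQue releases a slot once the waited-on sync flags allow reuse. A plain single-process sketch of that shape; the real queue synchronizes across ranks via flags, not across threads:

    #include <cstdint>

    struct RingQueueModel {
        static constexpr int kDepth = 8;  // mirrors QUE_DEPTH below
        uint8_t *base = nullptr;          // queue region inside the IPC buffer
        int64_t blockSize = 0;
        int64_t head = 0, tail = 0;       // caller must check Full()/Empty() first

        bool Full() const { return head - tail == kDepth; }
        bool Empty() const { return head == tail; }
        uint8_t *EnQue() { return base + (head++ % kDepth) * blockSize; }
        uint8_t *ReadFront() const { return base + (tail % kDepth) * blockSize; }
        void DeQue() { ++tail; }          // slot becomes reusable
    };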
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + #ifndef LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H + #define LLCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H + +#include "collectives.h" +#include "ipc_queue.h" +using namespace AscendC; + +constexpr int STAGE_NUM = 4; +constexpr int QUE_DEPTH = 8; +constexpr int QUE_NUM_LOCAL = 2; +constexpr int RING_NUM = 2; +constexpr int STAGE_EVENT = 0; +constexpr int RING_EVENT = 1; + +enum STAGE { + HCCS_RING = 0; + HCCS_TO_OUT, + HCCS_TO_SIO, + SIO_TO_OUT +}; + +template +class AllgatherHierarchyDoubleRing : protected Collectives { +public: + FORCE_INLINE_AICORE AllgatherHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, Op::COPYONLY); + int64_t dataTotalSize = len * sizeof(T); + const int coreNumPerStep = blockNum / STAGE_NUM; + stage = blockIdx / coreNumPerStep; + int stageCoreIdx = blockIdx % coreNumPerStep; + dataSizePerCore = dataTotalSize / coreNumPerStep; + const int64_t inputOffset = stageCoreIdx * dataSizePerCore; + if (stageCoreIdx == coreNumPerStep - 1) { + dataSizePerCore = dataTotalSize - (coreNumPerStep - 1) * dataSizePerCore; + } + + inputGm.SetGlobalBuffer(input + inputOffset, dataSizePerCore); + if (stage == STAGE::HCCS_TO_OUT) { + for (int i = rank % RING_NUM; i < rankSize; i += RING_NUM) { + outputGm[i / RING_NUM].SetGlobalBuffer(output + dataSizePerCore * i + inputOffset, dataSizePerCore); + } + } elseif (stage == STAGE::SIO_TO_OUT) { + for (int i = (rank + 1) % RING_NUM; i < rankSize; i += RING_NUM) { + outputGm[i / RING_NUM].SetGlobalBuffer(output + dataSizePerCore * i + inputOffset, dataSizePerCore); + } + } + + int64_t queTotalSize = IPC_BUFF_MAX_SIZE / coreNumPerStep; + int64_t queSize = queTotalSize / QUE_NUM_LOCAL; + int64_t queHccsOffset = queSize /QUE_DEPTH; + blockSize = queSize / QUE_DEPTH; + + queHccsLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset, queSize, blockSize); + queSioLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset + queSize, + queSize, blockSize); + rankRingForward = (rank + RING_NUM) % rankSize; + queHccsForward.Init(&sync, magic, shareAddrs[rankRingForward] + IPC_DATA_OFFSET + queHccsOffset, + queSize, blockSize); + rankSioAdjoint = rank ^ 1; + queSioAdjoint.Init(&sync, magic, + shareAddrs[rankSioAdjoint] + IPC_DATA_OFFSET + queHccsOffset + queSize, queSize, blockSize); + + for (int i = 0; i < STAGE_NUM; ++i) { + stageEvents[i] = sync.CalEventIdByMulBlockNum(STAGE_EVENT, stageCoreIdx * coreNumPerStep); + } + + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + int count = rankSize / RING_NUM * CeilDiv(dataSizePerCore, blockSize); + if (stage == STAGE::HCCS_RING) { + ProcessHccsRing(count); + } else if (stage == STAGE::HCCS_TO_OUT) { + ProcessHccsToOut(count); + } else if (stage == STAGE::HCCS_TO_SIO) { + ProcessHccsToSio(count); + } else if (stage == STAGE::SIO_TO_OUT) { + ProcessSioToOut(count); + } + 
DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } +private: + FORCE_INLINE_AICORE void ProcessHccsRing(const int count) + { + constexpr int dependencyNum = 3; + int deQueWaitRanks[dependencyNum] = {(rank + rankSize - RING_NUM) % rankSize, rank, rank}; + int deQueWaitEvents[dependencyNum] = { + sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx); + stageEvents[static_cast(STAGE::HCCS_TO_OUT)], + stageEvents[static_cast(STAGE::HCCS_TO_SIO)]}; + int64_t remainSize = dataSizePerCore; + int64_t dataSize = 0; + GlobalTensor input; + GlobalTensor output; + int64_t waitFlag = 0; + int i = 0; + while (i < count) { + int countRankId = (rank + i * RING_NUM) % rankSize; + if (countRankId == rank) { + dataSize = (remainSize >= blockSize) ? blockSize : remainSize; + input = inputGm[dataSizePerCore - remainSize]; + remainSize -= blockSize; + } else { + if (i == 1) { + sync.WaitSyncFlag(magic, 0 , stageEvents[static_cast(STAGE::HCCS_RING)], rnakRingForward); + waitFlag = sync.GetInnerFlag(rankRingForward, + stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + } + if (waitFlag < i - 1) { + waitFlag = sync.GetInnerFlag(rankRingForward, + stageEvenets[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + continue; + } + input = queHccsForward.ReadFront(); + } + queHccsLocal.Deque(deQueWaitRanks, deQueWaitEvents, dependencyNum); + output = queHccsLocal.EnQue(); + CpGM2GMPingPong(dataSize, input, output, -1); + + sync.SetSyncFlag(magic, i, stageEvents[static_cast(STAGE::HCCS_RING)], rank); + if (countRankId != rank) { + if ((rank + (i + 1) * RING_NUM) % rankSize == rank) { + queHccsForward.ReadFront(); + sync.SetSyncFlag(magic, i , sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), rank); + } else { + sync.SetSyncFlag(magic, i - 1, sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), rank); + } + } + ++i; + } + } + + + FORCE_INLINE_AICORE void ProcessHccsToOut(const int count) + { + GlobalTensor input; + GlobalTensor output; + int64_t remainSize = dataSizePerCore; + int64_t dataSize = 0; + sync.WaitSyncFlag(magic, 0, stageEvents[static_cast(STAGE::HCCS_RING)], rank); + int64_t waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + int i = 0; + while (i < count) { + if (waitFlag < i) { + waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + continue; + } + int countRankId = (rank + i + RING_NUM) % rankSize; + if (countRankId == rank) { + dataSize = (remainSize >= blockSize) ? 
blockSize : remainSize; + } + input = queHccsLocal.ReadFront(); + output = outputGm[countRankId / RING_NUM][dataSizePerCore - remainSize]; + CpGM2GMPingPong(dataSize, input, output, -1); + constexpr int32_t halfQueDepth = 2; + if (i % (QUE_DEPTH / halfQueDepth) == 0) { + sync.SetSyncFlag(magic, i, stageEvents[static_cast(STAGE::HCCS_TO_OUT)], rank); + } + if ((countRankId + RING_NUM) % rankSize == rank) { + remainSize -= blockSize; + } + ++i; + } + } + FORCE_INLINE_AICORE void ProcessHccsToSio(const int count) + { + GlobalTensor input; + GlobalTensor output; + int64_t remainSize = dataSizePerCore; + int64_t dataSize = 0; + sync.WaitSyncFlag(magic, 0, stageEvents[static_cast(STAGE::HCCS_RING)], rank); + int64_t waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + int i = 0; + while (i < count) { + if (waitFlag < i) { + waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + continue; + } + int countRankId = (rank + i + RING_NUM) % rankSize; + if (countRankId == rank) { + dataSize = (remainSize >= blockSize) ? blockSize : remainSize; + remainSize -= blockSize; + } + input = queHccsLocal.ReadFront(); + queSioAdjoint.DeQue(rankSioAdjoint, stageEvents[static_cast(STAGE::SIO_TO_OUT)]); + output = queSioAdjoint.EnQue(); + CpGM2GMPingPong(dataSize, input, output, -1); + sync.SetSyncFlag(magic, i, stageEvents[static_cast(STAGE::HCCS_TO_SIO)], rank); + ++i; + } + } + FORCE_INLINE_AICORE void ProcessSioToOut(const int count) + { + GlobalTensor input; + GlobalTensor output; + int64_t remainSize = dataSizePerCore; + int64_t dataSize = 0; + sync.WaitSyncFlag(magic, 0, stageEvents[static_cast(STAGE::HCCS_TO_SIO)], rankSioAdjoint); + int64_t waitFlag = sync.GetInnerFlag(rankSioAdjoint, + stageEvents[static_cast(STAGE::HCCS_TO_SIO)]) & EVENT_ID_MASK; + int i = 0; + while (i < count) { + if (waitFlag < i) { + waitFlag = sync.GetInnerFlag(rankSioAdjoint, + stageEvents[static_cast(STAGE::HCCS_TO_SIO)]) & EVENT_ID_MASK; + continue; + } + int countRankId = (rankSioAdjoint + i + RING_NUM) % rankSize; + if (countRankId == rankSioAdjoint) { + dataSize = (remainSize >= blockSize) ? blockSize : remainSize; + } + input = queSioAdjoint.ReadFront(); + output = outputGm[countRankId / RING_NUM][dataSizePerCore - remainSize]; + CpGM2GMPingPong(dataSize, input, output, -1); + constexpr int32_t halfQueDepth = 2; + if (i % (QUE_DEPTH / halfQueDepth) == 0) { + sync.SetSyncFlag(magic, i, stageEvents[static_cast(STAGE::SIO_TO_OUT)], rank); + } + if ((countRankId + RING_NUM) % rankSize == rankSioAdjoint) { + remainSize -= blockSize; + } + ++i; + } + } +private: + int stageEvents[STAGE_NUM] + GlobalTensor inputGm; + GlobalTensor outputGm[LCAL_MAX_RANK_SIZE / RING_NUM]; + IpcQueue queHccsLocal; + IpcQueue queHccsForward; + IpcQueue queSioLocal; + IpcQueue queSioAdjoint; + int64_t dataSizePerCore; + int stage; + int rankRingForward; + int rankSioAdjoint; + int64_t blockSize; +}; + +#endif // LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 8ff63353..1910f1ad 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -8,16 +8,16 @@ * See LICENSE in the root of the software repository for the full text of the License. 
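The stage loops above all walk the same schedule: with RING_NUM == 2, rank r belongs to the ring of ranks sharing its parity, handles its own chunk at step 0, and at step i forwards or writes out the chunk that originated at (r + i * RING_NUM) % rankSize, while the SIO stages mirror the adjacent ring through rank r ^ 1. A host-side sketch for eyeballing that schedule:

    #include <cstdio>

    void PrintRingSchedule(int rank, int rankSize, int ringNum = 2)
    {
        for (int i = 0; i < rankSize / ringNum; ++i) {
            int owner = (rank + i * ringNum) % rankSize;
            std::printf("rank %d step %d: chunk from rank %d%s\n",
                        rank, i, owner, owner == rank ? " (local input)" : "");
        }
    }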
*/ - #ifndef LCCL_ALLREDUCE_BIG_DATA_H - #define LCCL_ALLREDUCE_BIG_DATA_H +#ifndef LCCL_ALLREDUCE_BIG_DATA_H +#define LCCL_ALLREDUCE_BIG_DATA_H - #include "allreduce_quant.h" - #include "sync_collectives.h" - #include "ipc_queue.h" - using namespace AscendC; +#include "allreduce_quant.h" +#include "sync_collectives.h" +#include "ipc_queue.h" +using namespace AscendC; - template - class AllReduceBigData : protected AllReduceQuant { +template +class AllReduceBigData : protected AllReduceQuant { constexpr static int QUEUE_DEPTH = 4; constexpr static T oneCast = (T) 1; @@ -253,9 +253,4 @@ private: bool isVectorScale = false; }; -#endif // LCCL_ALLREDUCE_BIG_DATA_H - - - - - +#endif // LCCL_ALLREDUCE_BIG_DATA_H \ No newline at end of file -- Gitee From 77090106527aa1376b1eea64569fe2d1e8ef29d4 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 14:58:53 +0800 Subject: [PATCH 056/414] Fix header guard typo, improve class naming, and correct buffer size calculations in AllGatherHierarchyDoubleRing --- .../91093/allgather_hierarchy_double_ring.h | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h index 8e76ae5c..38c364e5 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h @@ -9,7 +9,7 @@ */ #ifndef LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H - #define LLCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H + #define LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H #include "collectives.h" #include "ipc_queue.h" @@ -23,16 +23,16 @@ constexpr int STAGE_EVENT = 0; constexpr int RING_EVENT = 1; enum STAGE { - HCCS_RING = 0; + HCCS_RING = 0, HCCS_TO_OUT, HCCS_TO_SIO, SIO_TO_OUT }; template -class AllgatherHierarchyDoubleRing : protected Collectives { +class AllGatherHierarchyDoubleRing : public Collectives { public: - FORCE_INLINE_AICORE AllgatherHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllGatherHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) @@ -52,17 +52,17 @@ public: inputGm.SetGlobalBuffer(input + inputOffset, dataSizePerCore); if (stage == STAGE::HCCS_TO_OUT) { for (int i = rank % RING_NUM; i < rankSize; i += RING_NUM) { - outputGm[i / RING_NUM].SetGlobalBuffer(output + dataSizePerCore * i + inputOffset, dataSizePerCore); + outputGm[i / RING_NUM].SetGlobalBuffer(output + dataTotalSize * i + inputOffset, dataSizePerCore); } - } elseif (stage == STAGE::SIO_TO_OUT) { + } else if (stage == STAGE::SIO_TO_OUT) { for (int i = (rank + 1) % RING_NUM; i < rankSize; i += RING_NUM) { - outputGm[i / RING_NUM].SetGlobalBuffer(output + dataSizePerCore * i + inputOffset, dataSizePerCore); + outputGm[i / RING_NUM].SetGlobalBuffer(output + dataTotalSize * i + inputOffset, dataSizePerCore); } } int64_t queTotalSize = IPC_BUFF_MAX_SIZE / coreNumPerStep; int64_t queSize = queTotalSize / QUE_NUM_LOCAL; - int64_t queHccsOffset = queSize /QUE_DEPTH; + int64_t queHccsOffset = stageCoreIdx * queTotalSize; blockSize = queSize / QUE_DEPTH; queHccsLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset, queSize, blockSize); @@ -76,15 +76,15 @@ public: shareAddrs[rankSioAdjoint] + IPC_DATA_OFFSET + queHccsOffset + queSize, queSize, blockSize); for (int i = 0; i < STAGE_NUM; ++i) { - 
stageEvents[i] = sync.CalEventIdByMulBlockNum(STAGE_EVENT, stageCoreIdx * coreNumPerStep); + stageEvents[i] = sync.CalEventIdByMulBlockNum(STAGE_EVENT, stageCoreIdx + coreNumPerStep * i); } - DumpLcclLogInfo(LogId::INIT, static_cast(op)); + DumpLcclLogInfo(LogId::INIT, Op::COPYONLY); } FORCE_INLINE_AICORE void Process() { - DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + DumpLcclLogInfo(LogId::PROCESS, Op::COPYONLY); int count = rankSize / RING_NUM * CeilDiv(dataSizePerCore, blockSize); if (stage == STAGE::HCCS_RING) { ProcessHccsRing(count); @@ -95,7 +95,7 @@ public: } else if (stage == STAGE::SIO_TO_OUT) { ProcessSioToOut(count); } - DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + DumpLcclLogInfo(LogId::PROCESS, Op::COPYONLY); } private: FORCE_INLINE_AICORE void ProcessHccsRing(const int count) @@ -103,7 +103,7 @@ private: constexpr int dependencyNum = 3; int deQueWaitRanks[dependencyNum] = {(rank + rankSize - RING_NUM) % rankSize, rank, rank}; int deQueWaitEvents[dependencyNum] = { - sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx); + sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), stageEvents[static_cast(STAGE::HCCS_TO_OUT)], stageEvents[static_cast(STAGE::HCCS_TO_SIO)]}; int64_t remainSize = dataSizePerCore; @@ -120,18 +120,18 @@ private: remainSize -= blockSize; } else { if (i == 1) { - sync.WaitSyncFlag(magic, 0 , stageEvents[static_cast(STAGE::HCCS_RING)], rnakRingForward); + sync.WaitSyncFlag(magic, 0 , stageEvents[static_cast(STAGE::HCCS_RING)], rankRingForward); waitFlag = sync.GetInnerFlag(rankRingForward, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; } if (waitFlag < i - 1) { waitFlag = sync.GetInnerFlag(rankRingForward, - stageEvenets[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; + stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; continue; } input = queHccsForward.ReadFront(); } - queHccsLocal.Deque(deQueWaitRanks, deQueWaitEvents, dependencyNum); + queHccsLocal.DeQue(deQueWaitRanks, deQueWaitEvents, dependencyNum); output = queHccsLocal.EnQue(); CpGM2GMPingPong(dataSize, input, output, -1); @@ -163,7 +163,7 @@ private: waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; continue; } - int countRankId = (rank + i + RING_NUM) % rankSize; + int countRankId = (rank + i * RING_NUM) % rankSize; if (countRankId == rank) { dataSize = (remainSize >= blockSize) ? blockSize : remainSize; } @@ -194,7 +194,7 @@ private: waitFlag = sync.GetInnerFlag(rank, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; continue; } - int countRankId = (rank + i + RING_NUM) % rankSize; + int countRankId = (rank + i * RING_NUM) % rankSize; if (countRankId == rank) { dataSize = (remainSize >= blockSize) ? blockSize : remainSize; remainSize -= blockSize; @@ -223,11 +223,11 @@ private: stageEvents[static_cast(STAGE::HCCS_TO_SIO)]) & EVENT_ID_MASK; continue; } - int countRankId = (rankSioAdjoint + i + RING_NUM) % rankSize; + int countRankId = (rankSioAdjoint + i * RING_NUM) % rankSize; if (countRankId == rankSioAdjoint) { dataSize = (remainSize >= blockSize) ? 
blockSize : remainSize; } - input = queSioAdjoint.ReadFront(); + input = queSioLocal.ReadFront(); output = outputGm[countRankId / RING_NUM][dataSizePerCore - remainSize]; CpGM2GMPingPong(dataSize, input, output, -1); constexpr int32_t halfQueDepth = 2; @@ -241,7 +241,7 @@ private: } } private: - int stageEvents[STAGE_NUM] + int stageEvents[STAGE_NUM]; GlobalTensor inputGm; GlobalTensor outputGm[LCAL_MAX_RANK_SIZE / RING_NUM]; IpcQueue queHccsLocal; -- Gitee From 24cb40334807a5dc9f2f29416a4a665484e6a497 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 15:57:22 +0800 Subject: [PATCH 057/414] Fix logging condition in AllReduceBigData and add AllReduceTwoShot class for enhanced data processing --- .../src/ascendc_kernels/allreduce_big_data.h | 4 +- .../src/ascendc_kernels/allreduce_two_shot.h | 188 ++++++++++++++++++ 2 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 comm/lcal/src/ascendc_kernels/allreduce_two_shot.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 1910f1ad..0bb56493 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -33,8 +33,8 @@ public: } if (blockIdx >= PING_PONG_SIZE * rankSize) { - DumpLcclLogInfo(LogId::INIT, static_cast(op)); - return; + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + return; } perStepBlockNum = rankSize; diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h new file mode 100644 index 00000000..385f7715 --- /dev/null +++ b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef LCCL_ALLREDUCE_TWO_SHOTH +#define LCCL_ALLREDUCE_TWO_SHOT_H + +#include "allreduce_quant.h" +#include "sync_collectives.h" + +template +class AllReduceBigData : protected AllReduceQuant { + constexpr static int QUEUE_DEPTH = 4; + constexpr static T oneCast = (T) 1; + +public: + FORCE_INLINE_AICORE AllReduceBigData(int rank, int rankSize, uint32_t extraFlag) + : AllReduceQuant(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + if constexpr(!std::is_same_v) { + BuildScaleOffset(scale, scaleCount, offset); + } + + if (blockIdx >= rankSize) { + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + return; + } + + blockNum = rankSize; + + __gm__ CommArgs *localArgs = reinterpret_cast<__gm__ CommArgs *>(commArgs); + + int localRankSize = localArgs->localRankSize <= 0 ? rankSize : localArgs->localRankSize; + int globalRankSize = localArgs->rankSize <= 0 ? 
rankSize : localArgs->rankSize; + int serverNum = globalRankSize / localRankSize; + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / + QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; + ipcDataPerParagraphSize = ipcBuffMaxSizeAligned / localRankSize; + int64_t ipcDataPerParagraphNum = ipcDataPerParagraphSize / sizeof(T); + atomOp = op; + corePerRank = blockNum / rankSize; + coreSegmentedIdx = blockIdx % corePerRank; + peerRank = blockIdx / corePerRank; + perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; + curRankDataNum (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; + pullRankDataNum = perRankDataNum; + if (peerRank == rankSize - 1) { + pullRankDataNum = len - rank * perRankDataNum; + } + pullBlockDataNum = GetDataCount(pullRankDataNum, corePerRank); + dataNumPreBlock = pullBlockDataNum; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumPreBlock = pullRankDataNum - (corePerRank - 1) * pullBlockDataNum; + } + buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + + ipcDataPerParagraphNum * peerRank; + + curBlockDataNum = GetDataCount(curRankDataNum, corePerRank); + dataNumPreBlock = pullBlockDataNum; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumPreBlock = pullRankDataNum - (corePerRank - 1) * pullBlockDataNum; + } + + buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + + ipcDataPerParagraphNum * peerRank; + curBlockDataNum = GetDataCount(curRankDataNum, corePerRank); + ipcDataNumPreBlock = curBlockDataNum; + ipcBuffOffsetNum = rank * perRankDataNum + coreSegmentedIdx * curBlockDataNum + ipcDataPerParagraphNum * rank; + + inputGt.SetGlobalBuffer((__gm__ U*)input + buffOffsetNum - ipcDataPerParagraphNum * peerRank, dataNumPreBlock); + inputIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + buffOffsetNum, dataNumPreBlock); + srcIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[peerRank] + IPC_DATA_OFFSET) + ipcbuffOffsetNum, + ipcDataNumPreBlock); + processIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + ipcbuffOffsetNum, + ipcDataNumPreBlock); + + processedIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[peerRank] + IPC_DATA_OFFSET) + buffOffsetNum, + dataNumPreBlock); + outputGt.SetGlobalBuffer((__gm__ T*)output + buffOffsetNum - ipcDataPerParagraphNum * peerRank, + dataNumPreBlock); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + if (blockIdx >= rankSize) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + } + if constexpr (std::is_same_v) { + Collectives::CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY); + } else { + if (peerRank == rank) { + if (!isEnabled) { + Collectives::CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY); + } else if (!isVectorScale){ + CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY, firstScale, offset); + } else { + CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY, scaleGt, scaleNum, offset); + } + } else { + GlobalTensor inputIpcGtTmp; + inputIpcGtTmp.SetGlobalBuffer((__gm__ U*)inputIpcGt.GetPhyAddr()); + Collectives::CpGM2GM(inputIpcGtTmp, intputGt, dataNumPreBlock, COPYONLY); + } + } + sync.SetInnerFlag(magic, 1); + + sync.WaitInnerFlag(magic, 1, rank, coreSegmentedIdx + rank * corePerRank); + sync.WaitInnerFlag(magic, 1, peerRank, coreSegmentedIdx + rank * corePerRank); + if (peerRank != rank) 
{ + if constexpr (std::is_same_v) { + Collectives::CpGM2GM(processIpcGt, srcIpcGt, ipcDataNumPreBlock, atomOp); + } else { + GlobalTensor srcIpcGtTmp; + srcIpcGtTmp.SetGlobalBuffer((__gm__ U*)srcIpcGt.GetPhyAddr()); + if (!isEnabled) { + Collectives::CpGM2GM(processIpcGt, srcIpcGtTmp, ipcDataNumPreBlock, atomOp); + } else if (!isVectorScale) { + CpGM2GM(processIpcGt, srcIpcGtTmp, ipcDataNumPreBlock, atomOp, firstScale, offset); + } else { + CpGM2GM(processIpcGt, srcIpcGtTmp, ipcDataNumPreBlock, atomOp, scaleGt, scaleNum, offset); + } + } + } + + if (!(extraFlag & ExtraFlag::RDMA)) { + sync.SetOuterFlag(magic, 1); + sync.WaitOneRankOuterFlag(magic, 1, peerRank); + Collectives::CpGM2GM(outputGt, processedIpcGt, dataNumPreBlock, COPYONLY); + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } + +private: + GlobalTensor inputGt; + GlobalTensor outputGt; + GlobalTensor inputIpcGt; + GlobalTensor srcIpcGt; + GlobalTensor processedIpcGt; + GlobalTensor processIpcGt; + + int atomOp; + + int64_t corePerRank; + int64_t coreSegmentedIdx; + int64_t ipcDataperParagraphSize; + int64_t perRankDataNum; + int64_t curRankDataNum; + int64_t pullBlockDataNum; + int64_t curBlockDataNum; + int64_t peerRank; + int64_t pullRankDataNum; + int64_t dataNumPreBlock; + int64_t buffOffsetNum; + int64_t ipcDataNumPreBlcok; + int64_t ipcbuffOffsetNum; + + GlobalTensor scaleGt; + int64_t scaleNum = 1; + T firstScale = 1; + T offset = 0; + bool isEnableScale = false; + bool isVectorScale = false; + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) { + if (scale != nullptr && offset != nullptr) { + scaleGt.SetGlobalBuffer((__gm__ T*)scale); + this->firstScale = scaleGt.GetValue(0); + this->scaleNum = scaleCount < 1 ? 1 : scaleCount; + this->offset =* reinterpret_cast<__gm__ T*>(offset); + isVectorScale = scaleCount > 1; + isEnableScale = scaleCount > 0 && !(*(uint16_t *)(&(this->offset)) == 0 && + scaleCount == 1 && *(uint16_t *)(&firstScale) == *(uint16_t *)(&oneCast)); + } + } +}; + +#endif // LCCL_ALLREDUCE_TWO_SHOT_H \ No newline at end of file -- Gitee From 106f085d5fb4ee20dd2d7797a8430f58c63e8f17 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 16:02:51 +0800 Subject: [PATCH 058/414] Fix header guard typo, rename class from AllReduceBigData to AllReduceTwoShot, and correct variable assignments in allreduce_two_shot.h --- .../src/ascendc_kernels/allreduce_two_shot.h | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h index 385f7715..068565da 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h @@ -8,19 +8,19 @@ * See LICENSE in the root of the software repository for the full text of the License. 
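BuildScaleOffset above gates the quantized path: the int8 payload is widened and mapped through y = x * scale + offset (per-tensor when scaleCount == 1, per-channel otherwise), and the conversion is skipped entirely when it would be the identity. A scalar sketch of both rules, using float on the host where the kernel works in half; the real CpGM2GM overloads do this vectorized on-core:

    #include <cstdint>

    inline float DequantElem(int8_t x, float scale, float offset)
    {
        return static_cast<float>(x) * scale + offset;
    }

    // Mirrors the isEnableScale short-circuit: a scalar scale of 1 with a zero
    // offset leaves values untouched, so the plain copy path can be used instead.
    inline bool IsIdentityQuant(int64_t scaleCount, float firstScale, float offset)
    {
        return scaleCount == 1 && firstScale == 1.0f && offset == 0.0f;
    }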
*/ -#ifndef LCCL_ALLREDUCE_TWO_SHOTH +#ifndef LCCL_ALLREDUCE_TWO_SHOT_H #define LCCL_ALLREDUCE_TWO_SHOT_H #include "allreduce_quant.h" #include "sync_collectives.h" - +using namespace AscendC; template -class AllReduceBigData : protected AllReduceQuant { +class AllReduceTwoShot : protected AllReduceQuant { constexpr static int QUEUE_DEPTH = 4; constexpr static T oneCast = (T) 1; public: - FORCE_INLINE_AICORE AllReduceBigData(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllReduceTwoShot(int rank, int rankSize, uint32_t extraFlag) : AllReduceQuant(rank, rankSize, extraFlag) {} FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { @@ -51,7 +51,7 @@ public: coreSegmentedIdx = blockIdx % corePerRank; peerRank = blockIdx / corePerRank; perRankDataNum = GetDataCount(len, rankSize) / scaleNum * scaleNum; - curRankDataNum (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; + curRankDataNum = (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; pullRankDataNum = perRankDataNum; if (peerRank == rankSize - 1) { pullRankDataNum = len - rank * perRankDataNum; @@ -59,22 +59,14 @@ public: pullBlockDataNum = GetDataCount(pullRankDataNum, corePerRank); dataNumPreBlock = pullBlockDataNum; if (coreSegmentedIdx == corePerRank - 1) { - dataNumPreBlock = pullRankDataNum - (corePerRank - 1) * pullBlockDataNum; + dataNumPreBlock = pullRankDataNum - coreSegmentedIdx * pullBlockDataNum; } buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + ipcDataPerParagraphNum * peerRank; curBlockDataNum = GetDataCount(curRankDataNum, corePerRank); - dataNumPreBlock = pullBlockDataNum; - if (coreSegmentedIdx == corePerRank - 1) { - dataNumPreBlock = pullRankDataNum - (corePerRank - 1) * pullBlockDataNum; - } - - buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + - ipcDataPerParagraphNum * peerRank; - curBlockDataNum = GetDataCount(curRankDataNum, corePerRank); ipcDataNumPreBlock = curBlockDataNum; - ipcBuffOffsetNum = rank * perRankDataNum + coreSegmentedIdx * curBlockDataNum + ipcDataPerParagraphNum * rank; + ipcbuffOffsetNum = rank * perRankDataNum + coreSegmentedIdx * curBlockDataNum + ipcDataPerParagraphNum * rank; inputGt.SetGlobalBuffer((__gm__ U*)input + buffOffsetNum - ipcDataPerParagraphNum * peerRank, dataNumPreBlock); inputIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + buffOffsetNum, dataNumPreBlock); @@ -101,7 +93,7 @@ public: Collectives::CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY); } else { if (peerRank == rank) { - if (!isEnabled) { + if (!isEnableScale) { Collectives::CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY); } else if (!isVectorScale){ CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY, firstScale, offset); @@ -111,7 +103,7 @@ public: } else { GlobalTensor inputIpcGtTmp; inputIpcGtTmp.SetGlobalBuffer((__gm__ U*)inputIpcGt.GetPhyAddr()); - Collectives::CpGM2GM(inputIpcGtTmp, intputGt, dataNumPreBlock, COPYONLY); + Collectives::CpGM2GM(inputIpcGtTmp, inputGt, dataNumPreBlock, COPYONLY); } } sync.SetInnerFlag(magic, 1); @@ -124,7 +116,7 @@ public: } else { GlobalTensor srcIpcGtTmp; srcIpcGtTmp.SetGlobalBuffer((__gm__ U*)srcIpcGt.GetPhyAddr()); - if (!isEnabled) { + if (!isEnableScale) { Collectives::CpGM2GM(processIpcGt, srcIpcGtTmp, ipcDataNumPreBlock, atomOp); } else if (!isVectorScale) { CpGM2GM(processIpcGt, srcIpcGtTmp, ipcDataNumPreBlock, atomOp, firstScale, offset); @@ -154,7 +146,7 @@ private: int64_t corePerRank; int64_t 
coreSegmentedIdx; - int64_t ipcDataperParagraphSize; + int64_t ipcDataPerParagraphSize; int64_t perRankDataNum; int64_t curRankDataNum; int64_t pullBlockDataNum; @@ -163,7 +155,7 @@ private: int64_t pullRankDataNum; int64_t dataNumPreBlock; int64_t buffOffsetNum; - int64_t ipcDataNumPreBlcok; + int64_t ipcDataNumPreBlock; int64_t ipcbuffOffsetNum; GlobalTensor scaleGt; @@ -172,7 +164,8 @@ private: T offset = 0; bool isEnableScale = false; bool isVectorScale = false; - FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) { + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) + { if (scale != nullptr && offset != nullptr) { scaleGt.SetGlobalBuffer((__gm__ T*)scale); this->firstScale = scaleGt.GetValue(0); -- Gitee From c2aadceebe294df78d06826070a768e8811a56ce Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 16:03:29 +0800 Subject: [PATCH 059/414] 1 --- comm/lcal/src/ascendc_kernels/allreduce_two_shot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h index 068565da..f4d36d25 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h @@ -54,7 +54,7 @@ public: curRankDataNum = (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; pullRankDataNum = perRankDataNum; if (peerRank == rankSize - 1) { - pullRankDataNum = len - rank * perRankDataNum; + pullRankDataNum = len - peerRank * perRankDataNum; } pullBlockDataNum = GetDataCount(pullRankDataNum, corePerRank); dataNumPreBlock = pullBlockDataNum; -- Gitee From cc5a63847453c20023506583b139555e28dfe811 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 18 Aug 2025 20:28:39 +0800 Subject: [PATCH 060/414] lcclop --- comm/lcal/src/ascendc_kernels/lccl_op.h | 72 +++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 comm/lcal/src/ascendc_kernels/lccl_op.h diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h new file mode 100644 index 00000000..1c3f423f --- /dev/null +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
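With PATCHes 057-059 in place, AllReduceTwoShot follows the classic two-phase shape: in shot one each rank owns one segment and reduces every peer's copy of it into its IPC buffer; in shot two each rank pulls the finished segments back from their owners. A plain host-side reference (sum only, equal segments, no remainder handling) that kernel results can be checked against; it models only the data movement, none of the IPC or sync-flag machinery:

    #include <vector>

    void TwoShotAllReduceRef(std::vector<std::vector<float>> &bufs)
    {
        const int ranks = static_cast<int>(bufs.size());
        const size_t seg = bufs[0].size() / ranks;
        for (int r = 0; r < ranks; ++r)          // shot 1: reduce own segment
            for (int peer = 0; peer < ranks; ++peer)
                if (peer != r)
                    for (size_t k = r * seg; k < (r + 1) * seg; ++k)
                        bufs[r][k] += bufs[peer][k];
        for (int r = 0; r < ranks; ++r)          // shot 2: gather the rest
            for (int owner = 0; owner < ranks; ++owner)
                if (owner != r)
                    for (size_t k = owner * seg; k < (owner + 1) * seg; ++k)
                        bufs[r][k] = bufs[owner][k];
    }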
+ */ +#if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__) + +#include "op_def.h" +#include "allgather.h" +#include "91093/allgather_hierarchy_double_ring.h" +#include "allreduce_one_shot.h" +#include "allreduce_two_shot.h" +#include "allreduce_big_data.h" +#include "91093/allreduce_big_data_sio.h" +#include "91093/allreduce_hierarchy_double_ring.h" +#include "91093/reduce_scatter_big_data_91093_4step.h" +#include "91093/reduce_scatter_hierarchy_double_ring.h" +#include "91093/all2all_hierarchy.h" +#include "91093/all2all_hierarchy_small.h" + +#include "../kernels/lcal_allreduce_2npu_read.cce" +#include "../kernels/lcal_allreduce_2npu_write.cce" +#include "../kernels/lcal_allreduce_2npu_big_write.cce" +#include "../kernels/lcal_allreduce_two_shot.cce" +#include "../kernels/lcal_allreduce_big_data.cce" +#include "../kernels/lcal_allreduce_two_shot_910B2C.cce" +#include "../kernels/lcal_allreduce_big_data_910B2C.cce" +#include "../kernels/lcal_allreduce_deterministic.cce" +#include "../kernels/lcal_allreduce_deterministic_big_data.cce" +#include "../kernels/lcal_reduce_scatter_big_data_write.cce" +#include "../kernels/lcal_reduce_scatter_write.cce" +#include "../kernels/lcal_reduce_scatter.cce" +#include "../kernels/lcal_reduce_scatter_big_data.cce" +#include "../kernels/lcal_allgather_910B2C.cce" +#include "../kernels/lcal_allgather_big_data_910B2C.cce" +#include "../kernels/lcal_allgather_2npu.cce" +#include "../kernels/lcal_allgather_2npu_big_data_write.cce" +#include "../kernels/lcal_allgather.cce" +#include "../kernels/lcal_allgather_big_data.cce" +#include "../kernels/lcal_broadcast_write.cce" +#include "../kernels/lcal_broadcast_big_data.cce" +#include "../kernels/lcal_all2all_transpose.cce" + +#define CLASS_OP_910B_RDMA_LAUNCH(name, type) \ +do { \ +name opKernel(localRank, localRankSize, extraFlag); \ +opKernel.Init(KERNELS_ARGS_CALL()); \ +opKernel.Process(); \ +} while (0) + +extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_Lcal"))) void LcalDescriptor() {} + +#define LCCL_BROADCAST_FUNC_AUTO_DEF(suffix) \ +extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNEL_ARGS_FUN()) | +{ \ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; \ + __gm__ char * shareAddrs[LCAL_MAX_RANK_SIZE]; \ + GET_IPC_MEM_ARGS(char); \ + if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ + LcalBroadcast2npuBigDataWrite(ALLREDUCE_ARGS_CALL(char)); \ + } else { \ + LcalBroadcastBigData(ALLREDUCE_ARGS_CALL(char)); \ + } \ + } \ +} + + -- Gitee From bde2cba9c9eaf44b01f1a76a72052b27586d727f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 09:14:48 +0800 Subject: [PATCH 061/414] Add new launch macros for quantization and all-gather operations, enhancing data processing capabilities --- comm/lcal/src/ascendc_kernels/lccl_op.h | 135 +++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index 1c3f423f..f123f959 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -52,10 +52,17 @@ opKernel.Init(KERNELS_ARGS_CALL()); \ opKernel.Process(); \ } while (0) +#define CLASS_OP_QUANT_LAUNCH(name, outputType, inputType) \ +do { \ +name opKernel(localRank, localRankSize, extraFlag); \ +opKernel.Init(KERNELS_ARGS_CALL()); \ +opKernel.Process(); \ +} while (0) + extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_Lcal"))) void LcalDescriptor() {} #define LCCL_BROADCAST_FUNC_AUTO_DEF(suffix) \ -extern "C" 
__global__ __aicore__ void LcalBroadcast##suffix(KERNEL_ARGS_FUN()) | +extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNEL_ARGS_FUN()) \ { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ @@ -69,4 +76,130 @@ extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNEL_ARGS_FUN()) | } \ } +#define LCCL_ALLGATHER_FUNC_AUTO_DEF(type, suffix) \ +extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNEL_ARGS_FUN()) {\ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; \ + constexpr int32_t quickOneshotRankSize = 2 \ + constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024 \ + constexpr int32_t smallRankSize = 8 \ + constexpr int32_t smallDataSize910a3 = 2 \ + __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ + GET_IPC_MEM_ARGS(type); \ + if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ + if (len * sizeof(type) < cceSmallDataSize) { \ + LcalAllGather910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else { + LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_CALL_16p(type)); + } \ + } else if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ + LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ + (len > smallDataSize910a3 / sizeof(type) || rankSize > smallRankSize) && \ + rankSize > quickOneshotRankSize && rankSize % quickOneshotRankSize == 0) { \ + CLASS_OP_LAUNCH(AllGatherHierarchyDoubleRing, type); \ + } else { \ + if (rankSize == quickOneshotRankSize && len * sizeof(type) < SIZE_OF_8M && lcalBlockNum != rankSize) { \ + LcalAllGather2npu(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else if (rankSize == quickOneshotRankSize && lcalBlockNum != rankSize) { \ + LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize || \ + lcalBlockNum == rankSize) { \ + LcalAllGather(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else { + LcalAllGatherBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + } + }\ + } \ +} +#define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ +extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNEL_ARGS_FUN()) {\ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; + constexpr int32_t quickOneshotRankSize = 2; \ + constexpr int32_t threeStepNum = 3; \ + constexpr int32_t smallRankSize = 8; \ + constexpr int32_t oneshotDataSize = 16 * 1024; \ + constexpr int32_t quantSmallDataSzie = 512 * 1024; \ + constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024 \ + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024 \ + constexpr int32_t rankSize910a3 = 16 \ + __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ + GET_IPC_MEM_ARGS(type); \ + if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ + if (len * sizeof(type) < SIZE_OF_8M) { \ + LcalAllReduce2npuWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ + } else { + LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); + } \ + } else if ((extraFlag & ExtraFlag::QUANT_FP16) != 0 && std::is_same_V) { \ + if (len * sizeof(type) <= oneshotDataSize) { \ + CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ + } else if (len * sizeof(type) <= oneshotDataSize) { \ + CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ + } else { \ + CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ + } + } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ + if (len * sizeof(type) < cceSmallDataSize) { \ + LcalAllReduceTowShot910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else { \ + 
LcalAllReduceBigData910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ + } \ + } else if ((extraFlag & ExtraFlag::DETERMINISTIC) != 0) { \ + constexpr uint32_t maxAivNum = 40; \ + const bool isAivNumSupport = ((extraFlag & ExtraFlag::IS_GREATER_THAN_40_AIV) != 0 || \ + rankSize * threeStepNum <= maxAivNum); \ + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ + if (rankSize % quickOneshotRankSize == 0 && rankSize > quickOneshotRankSize || \ + (rankSize <= rankSize910a3 && len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ + LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else { \ + CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ + } \ + } else if (len * sizeof(type) < SMALL_DATA_SIZE) { \ + LcalAllReduceDeterministic(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else { \ + LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + } \ + } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ + (rankSize == quickOneshotRankSize && len * sizeof(type) > smallDataSize910a3)) { \ + if (rankSize == quickOneshotRankSize) { \ + LcalAllRecude2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + } else if (rankSize % quickOneshotRankSize == 0) { \ + CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ + } else { \ + CLASS_OP_LAUNCH(AllReduceBigDataSio, type); \ + } \ + } else { \ + if (len * sizeof(type) < cceSmallDataSize or lcalBlockNum == rankSize) { \ + if (rankSize == quickOneshotRankSize && lcalBlockNum != rankSize) { \ + LcalAllReduce2npu(ALLREDUCE_ARGS_CALL(type)); \ + } else { \ + LcalAllReduceTwoShot(ALLREDUCE_ARGS_CALL_16P(type)); \ + } else { + LcalAllReduceBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ + } \ + }\ + } \ +} + +#define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ +extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNEL_ARGS_FUN()) {\ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; + __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ + GET_IPC_MEM_ARGS(type); \ + constexpr int32_t smallRankSize = 8; \ + if (op != 0 && root != 0) { \ + LcalAll2AllTranspose(ALLREDUCE_ARGS_CALL_16p(type)); \ + } \ + else if ((extraFlag & ExtraFlag::TOPO_91093) != 0) { \ + if (rankSize <= smallRankSize && len * sizeof(type) > SMALL_DATA_SIZE) { \ + CLASS_OP_LAUNCH(All2AllHierarchySmall, type); \ + } else { \ + LcalAll2All(All2AllHierarchy, type); \ + } \ + } \ +} -- Gitee From 952faf6c7511009010813eba7fcde19b5703116f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 09:50:25 +0800 Subject: [PATCH 062/414] Refactor kernel function signatures and include missing headers for improved functionality and consistency --- comm/lcal/src/ascendc_kernels/lccl_op.h | 49 +++++++++++++------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index f123f959..d1ae5bb5 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -17,6 +17,7 @@ #include "allreduce_big_data.h" #include "91093/allreduce_big_data_sio.h" #include "91093/allreduce_hierarchy_double_ring.h" +#include "reduce_scatter.h" #include "91093/reduce_scatter_big_data_91093_4step.h" #include "91093/reduce_scatter_hierarchy_double_ring.h" #include "91093/all2all_hierarchy.h" @@ -62,7 +63,7 @@ opKernel.Process(); \ extern "C" __global__ __aicore__ __attribute__((section("Attr_Section_Lcal"))) void LcalDescriptor() {} #define LCCL_BROADCAST_FUNC_AUTO_DEF(suffix) \ -extern "C" __global__ __aicore__ void 
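Stripped of the macro plumbing, LCCL_ALL_REDUCE_FUNC_AUTO_DEF is a tiered selector: the topology flags (TOPO_PCIE, QUANT_FP16, TOPO_910B2C, DETERMINISTIC, TOPO_910_93) pick a kernel family first, then message size picks one-shot, two-shot, or big-data within it. The size ladder alone reduces to roughly the following (thresholds taken from the constants in the macro as they stand after the follow-up fixes; function name hypothetical):

    #include <cstdint>

    enum class ArAlgo { OneShot, TwoShot, BigData };

    // Rough, de-macroized view of the message-size ladder (sketch only).
    inline ArAlgo PickAllReduceAlgo(int64_t bytes, bool quantFp16) {
        constexpr int64_t oneshotDataSize = 16 * 1024;        // one-shot cutoff
        constexpr int64_t quantSmallDataSize = 512 * 1024;    // quant two-shot cutoff
        constexpr int64_t cceSmallDataSize = 2 * 1024 * 1024; // fp two-shot cutoff
        const int64_t twoShotCutoff = quantFp16 ? quantSmallDataSize : cceSmallDataSize;
        if (bytes <= oneshotDataSize) return ArAlgo::OneShot;
        if (bytes < twoShotCutoff) return ArAlgo::TwoShot;
        return ArAlgo::BigData;
    }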
LcalBroadcast##suffix(KERNEL_ARGS_FUN()) \ +extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNELS_ARGS_FUN()) \ { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ @@ -77,51 +78,51 @@ extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNEL_ARGS_FUN()) \ } #define LCCL_ALLGATHER_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNEL_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_FUN()) {\ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ - constexpr int32_t quickOneshotRankSize = 2 \ - constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024 \ - constexpr int32_t smallRankSize = 8 \ - constexpr int32_t smallDataSize910a3 = 2 \ + constexpr int32_t quickOneshotRankSize = 2; \ + constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ + constexpr int32_t smallRankSize = 8; \ + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ - LcalAllGather910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ - } else { - LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_CALL_16p(type)); + LcalAllGather910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ + } else { \ + LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_CALL_16P(type)); } \ } else if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ - LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ (len > smallDataSize910a3 / sizeof(type) || rankSize > smallRankSize) && \ rankSize > quickOneshotRankSize && rankSize % quickOneshotRankSize == 0) { \ CLASS_OP_LAUNCH(AllGatherHierarchyDoubleRing, type); \ } else { \ if (rankSize == quickOneshotRankSize && len * sizeof(type) < SIZE_OF_8M && lcalBlockNum != rankSize) { \ - LcalAllGather2npu(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllGather2npu(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if (rankSize == quickOneshotRankSize && lcalBlockNum != rankSize) { \ - LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize || \ lcalBlockNum == rankSize) { \ - LcalAllGather(ALLREDUCE_ARGS_CALL_16p(type)); \ - } else { - LcalAllGatherBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllGather(ALLREDUCE_ARGS_CALL_16P(type)); \ + } else { \ + LcalAllGatherBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } }\ } \ } #define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNEL_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_FUN()) {\ if ASCEND_IS_AIV { \ GET_COMM_ARGS; constexpr int32_t quickOneshotRankSize = 2; \ constexpr int32_t threeStepNum = 3; \ constexpr int32_t smallRankSize = 8; \ constexpr int32_t oneshotDataSize = 16 * 1024; \ - constexpr int32_t quantSmallDataSzie = 512 * 1024; \ + constexpr int64_t quantSmallDataSize = 512 * 1024; \ constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024 \ constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024 \ constexpr int32_t rankSize910a3 = 16 \ @@ -143,9 +144,9 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNEL_ARGS_F } } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && 
rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ - LcalAllReduceTowShot910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllReduceTowShot910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ - LcalAllReduceBigData910B2C(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllReduceBigData910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ } else if ((extraFlag & ExtraFlag::DETERMINISTIC) != 0) { \ constexpr uint32_t maxAivNum = 40; \ @@ -154,19 +155,19 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNEL_ARGS_F if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ if (rankSize % quickOneshotRankSize == 0 && rankSize > quickOneshotRankSize || \ (rankSize <= rankSize910a3 && len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ - LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ } \ } else if (len * sizeof(type) < SMALL_DATA_SIZE) { \ - LcalAllReduceDeterministic(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllReduceDeterministic(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ - LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ (rankSize == quickOneshotRankSize && len * sizeof(type) > smallDataSize910a3)) { \ if (rankSize == quickOneshotRankSize) { \ - LcalAllRecude2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAllRecude2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if (rankSize % quickOneshotRankSize == 0) { \ CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ } else { \ @@ -193,7 +194,7 @@ extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNEL_ARGS_FUN GET_IPC_MEM_ARGS(type); \ constexpr int32_t smallRankSize = 8; \ if (op != 0 && root != 0) { \ - LcalAll2AllTranspose(ALLREDUCE_ARGS_CALL_16p(type)); \ + LcalAll2AllTranspose(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ else if ((extraFlag & ExtraFlag::TOPO_91093) != 0) { \ if (rankSize <= smallRankSize && len * sizeof(type) > SMALL_DATA_SIZE) { \ -- Gitee From b6294d853269f8795dd7a6266108461d4e385177 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 10:23:36 +0800 Subject: [PATCH 063/414] Fix formatting issues and correct variable names in LCCL functions for improved readability and consistency --- comm/lcal/src/ascendc_kernels/lccl_op.h | 80 ++++++++++++++++++------- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index d1ae5bb5..ab9690cf 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -91,7 +91,7 @@ extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_ if (len * sizeof(type) < cceSmallDataSize) { \ LcalAllGather910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ - LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_CALL_16P(type)); + LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ @@ -117,34 +117,34 @@ extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_ #define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_FUN()) {\ if ASCEND_IS_AIV 
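Most of what these two patches touch is macro hygiene rather than logic: inside a function-like macro every line needs both its statement terminator and a trailing line-continuation backslash, and macro names are case-sensitive, so ALLREDUCE_ARGS_CALL_16p and ALLREDUCE_ARGS_CALL_16P are different, unrelated identifiers. A toy macro (hypothetical, not from this file) showing the shape these fixes enforce; dropping a trailing backslash silently ends the body one line early, dropping a semicolon splices two statements into one:

    #include <cstdint>

    // Usage: int32_t v; INIT_TIER_CONSTANTS(v);
    #define INIT_TIER_CONSTANTS(out)               \
        do {                                       \
            constexpr int32_t smallRankSize = 8;   \
            constexpr int32_t oneshot = 16 * 1024; \
            (out) = smallRankSize + oneshot;       \
        } while (0)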
{ \ - GET_COMM_ARGS; + GET_COMM_ARGS; \ constexpr int32_t quickOneshotRankSize = 2; \ constexpr int32_t threeStepNum = 3; \ constexpr int32_t smallRankSize = 8; \ constexpr int32_t oneshotDataSize = 16 * 1024; \ constexpr int64_t quantSmallDataSize = 512 * 1024; \ - constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024 \ - constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024 \ - constexpr int32_t rankSize910a3 = 16 \ + constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; \ + constexpr int32_t rankSize910a3 = 16; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ if (len * sizeof(type) < SIZE_OF_8M) { \ LcalAllReduce2npuWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ - } else { - LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); + } else { \ + LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ - } else if ((extraFlag & ExtraFlag::QUANT_FP16) != 0 && std::is_same_V) { \ + } else if ((extraFlag & ExtraFlag::QUANT_FP16) != 0 && std::is_same_v) { \ if (len * sizeof(type) <= oneshotDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ - } else if (len * sizeof(type) <= oneshotDataSize) { \ + } else if (len * sizeof(type) <= quatSmallDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ } else { \ CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ - } + } \ } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ - LcalAllReduceTowShot910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ + LcalAllReduceTwoShot910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ LcalAllReduceBigData910B2C(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ @@ -153,8 +153,8 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ const bool isAivNumSupport = ((extraFlag & ExtraFlag::IS_GREATER_THAN_40_AIV) != 0 || \ rankSize * threeStepNum <= maxAivNum); \ if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ - if (rankSize % quickOneshotRankSize == 0 && rankSize > quickOneshotRankSize || \ - (rankSize <= rankSize910a3 && len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ + if (rankSize % quickOneshotRankSize == 0 || rankSize > quickOneshotRankSize || \ + (rankSize <= rankSize910a3 || len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ @@ -167,7 +167,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ (rankSize == quickOneshotRankSize && len * sizeof(type) > smallDataSize910a3)) { \ if (rankSize == quickOneshotRankSize) { \ - LcalAllRecude2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ + LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if (rankSize % quickOneshotRankSize == 0) { \ CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ } else { \ @@ -176,31 +176,69 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ } else { \ if (len * sizeof(type) < cceSmallDataSize or lcalBlockNum == rankSize) { \ if (rankSize == quickOneshotRankSize && lcalBlockNum != rankSize) { \ - LcalAllReduce2npu(ALLREDUCE_ARGS_CALL(type)); \ + LcalAllReduce2npuRead(ALLREDUCE_ARGS_CALL(type)); \ } else { \ 
LcalAllReduceTwoShot(ALLREDUCE_ARGS_CALL_16P(type)); \ - } else { + } \ + } else { \ LcalAllReduceBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ }\ } \ } -#define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNEL_ARGS_FUN()) {\ +#define LCCL_ALL2ALL_FUNC_AUTO_DEF(type, suffix) \ +extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNELS_ARGS_FUN()) {\ if ASCEND_IS_AIV { \ - GET_COMM_ARGS; + GET_COMM_ARGS; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ constexpr int32_t smallRankSize = 8; \ if (op != 0 && root != 0) { \ LcalAll2AllTranspose(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ - else if ((extraFlag & ExtraFlag::TOPO_91093) != 0) { \ + else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ if (rankSize <= smallRankSize && len * sizeof(type) > SMALL_DATA_SIZE) { \ CLASS_OP_LAUNCH(All2AllHierarchySmall, type); \ } else { \ - LcalAll2All(All2AllHierarchy, type); \ + CLASS_OP_LAUNCH(All2AllHierarchy, type); \ + } \ } \ } \ } + +#define LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF(type, suffix) \ +extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_ARGS_FUN()) {\ + if ASCEND_IS_AIV { \ + GET_COMM_ARGS; \ + constexpr int32_t quickOneshotRankSize = 2; \ + constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ + constexpr int32_t a3BigDataSize = 32 * 1024 * 1024; \ + constexpr int32_t a3SupportRankSize = 4; \ + constexpr int32_t smallRankSize = 8; \ + const bool isDbRing = (rankSize == a3SupportRankSize || rankSize == smallRankSize) && + (len * sizeof(type) * smallRankSize > cceSmallDataSize && \ + len * sizeof(type) * smallRankSzie <= a3BigDataSize); \ + __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ + GET_IPC_MEM_ARGS(type); \ + if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ + LcalReduceScatterBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ + } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && (rankSize > smallRankSize || isDbRing) { \ + if (isDbRing) { \ + CLASS_OP_LAUNCH(ReduceScatterHierarchyDoubleRing, type); \ + } else if (len * sizeof(type) < SMALL_DATA_SIZE) { \ + CLASS_OP_LAUNCH(ReduceScatter, type); \ + } else { \ + CLASS_OP_LAUNCH(ReduceScatterBigData91093, type); \ + } \ + } else { \ + if (rankSize == quickOneshotRankSize && len * sizeof(type) < SIZE_OF_8M) { \ + LcalReduceScatterWrite(ALLREDUCE_ARGS_CALL(type)); \ + } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize) || \ + LcalReduceScatter(ALLREDUCE_ARGS_CALL(type)); \ + } else { \ + LcalReduceScatterBigData(ALLREDUCE_ARGS_CALL(type)); \ + } \ + } \ +} + -- Gitee From 5a6d14df31ef7e0b44becbc57f7b4d6a41451db3 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 10:27:19 +0800 Subject: [PATCH 064/414] Fix formatting and variable name inconsistencies in LcalAllGather and LcalAllReduce functions for improved clarity and functionality --- comm/lcal/src/ascendc_kernels/lccl_op.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index ab9690cf..5019f4f9 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -109,7 +109,7 @@ extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_ LcalAllGather(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ LcalAllGatherBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ - } + } \ }\ } \ } @@ -137,7 +137,7 @@ extern "C" __global__ __aicore__ void 
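The new reduce-scatter auto-definition gates the 91093 double-ring kernel on both rank count and the aggregate payload across ranks. Extracted from the macro (with the identifier typos corrected as in the follow-up patch), the eligibility predicate is:

    #include <cstdint>

    // The isDbRing check from LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF, as a function.
    inline bool IsDbRing(int32_t rankSize, int64_t len, int64_t elemSize) {
        constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024;
        constexpr int32_t a3BigDataSize = 32 * 1024 * 1024;
        constexpr int32_t a3SupportRankSize = 4;
        constexpr int32_t smallRankSize = 8;
        const int64_t scaled = len * elemSize * smallRankSize;
        return (rankSize == a3SupportRankSize || rankSize == smallRankSize) &&
               scaled > cceSmallDataSize && scaled <= a3BigDataSize;
    }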
LcalAllReduce_##type##suffix(KERNELS_ARGS_ } else if ((extraFlag & ExtraFlag::QUANT_FP16) != 0 && std::is_same_v) { \ if (len * sizeof(type) <= oneshotDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ - } else if (len * sizeof(type) <= quatSmallDataSize) { \ + } else if (len * sizeof(type) <= quantSmallDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ } else { \ CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ @@ -153,8 +153,8 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ const bool isAivNumSupport = ((extraFlag & ExtraFlag::IS_GREATER_THAN_40_AIV) != 0 || \ rankSize * threeStepNum <= maxAivNum); \ if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ - if (rankSize % quickOneshotRankSize == 0 || rankSize > quickOneshotRankSize || \ - (rankSize <= rankSize910a3 || len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ + if (rankSize % quickOneshotRankSize == 1 || rankSize == quickOneshotRankSize || \ + (rankSize <= rankSize910a3 && len * sizeof(type) <= smallDataSize910a3 && isAivNumSupport)) { \ LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ CLASS_OP_LAUNCH(AllReduceHierarchyDoubleRing, type); \ @@ -165,7 +165,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && lcalBlockNum != rankSize && \ - (rankSize == quickOneshotRankSize && len * sizeof(type) > smallDataSize910a3)) { \ + (rankSize == quickOneshotRankSize || len * sizeof(type) > smallDataSize910a3)) { \ if (rankSize == quickOneshotRankSize) { \ LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else if (rankSize % quickOneshotRankSize == 0) { \ @@ -216,17 +216,17 @@ extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A constexpr int32_t a3BigDataSize = 32 * 1024 * 1024; \ constexpr int32_t a3SupportRankSize = 4; \ constexpr int32_t smallRankSize = 8; \ - const bool isDbRing = (rankSize == a3SupportRankSize || rankSize == smallRankSize) && + const bool isDbRing = (rankSize == a3SupportRankSize || rankSize == smallRankSize) && \ (len * sizeof(type) * smallRankSize > cceSmallDataSize && \ - len * sizeof(type) * smallRankSzie <= a3BigDataSize); \ + len * sizeof(type) * smallRankSize <= a3BigDataSize); \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ - LcalReduceScatterBigDataWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ - } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && (rankSize > smallRankSize || isDbRing) { \ + LcalReduceScatterBigDataWrite(ALLREDUCE_ARGS_CALL(type)); \ + } else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && (rankSize > smallRankSize || isDbRing)) { \ if (isDbRing) { \ CLASS_OP_LAUNCH(ReduceScatterHierarchyDoubleRing, type); \ - } else if (len * sizeof(type) < SMALL_DATA_SIZE) { \ + } else if (len * sizeof(type) <= SMALL_DATA_SIZE) { \ CLASS_OP_LAUNCH(ReduceScatter, type); \ } else { \ CLASS_OP_LAUNCH(ReduceScatterBigData91093, type); \ @@ -234,11 +234,12 @@ extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A } else { \ if (rankSize == quickOneshotRankSize && len * sizeof(type) < SIZE_OF_8M) { \ LcalReduceScatterWrite(ALLREDUCE_ARGS_CALL(type)); \ - } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize) || \ + } else if (rankSize > quickOneshotRankSize && 
len * sizeof(type) < cceSmallDataSize){\ LcalReduceScatter(ALLREDUCE_ARGS_CALL(type)); \ } else { \ LcalReduceScatterBigData(ALLREDUCE_ARGS_CALL(type)); \ } \ } \ + }\ } - +#endif -- Gitee From 921051ff7d486d4b10e1ac2fef103a58db9653b8 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 10:34:08 +0800 Subject: [PATCH 065/414] Add new header files for all-to-all and reduce scatter operations to enhance kernel functionality --- comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h | 0 comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h | 0 comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h | 0 .../src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h | 0 .../ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h | 0 .../ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h | 0 comm/lcal/src/ascendc_kernels/CMakeLists.txt | 0 comm/lcal/src/ascendc_kernels/allgather.h | 0 comm/lcal/src/ascendc_kernels/allreduce_one_shot.h | 0 comm/lcal/src/ascendc_kernels/allreduce_quant.h | 0 comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h | 0 comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h | 0 comm/lcal/src/ascendc_kernels/ipc_queue.h | 0 comm/lcal/src/ascendc_kernels/op_def.h | 0 comm/lcal/src/ascendc_kernels/reduce_scatter.h | 0 comm/lcal/src/ascendc_kernels/sync_collectives.h | 0 comm/lcal/src/kernels/coc_dequant_runner.cce | 0 comm/lcal/src/kernels/coc_internal.cce | 0 18 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h create mode 100644 comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h create mode 100644 comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h create mode 100644 comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h create mode 100644 comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h create mode 100644 comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h create mode 100644 comm/lcal/src/ascendc_kernels/CMakeLists.txt create mode 100644 comm/lcal/src/ascendc_kernels/allgather.h create mode 100644 comm/lcal/src/ascendc_kernels/allreduce_one_shot.h create mode 100644 comm/lcal/src/ascendc_kernels/allreduce_quant.h create mode 100644 comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h create mode 100644 comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h create mode 100644 comm/lcal/src/ascendc_kernels/ipc_queue.h create mode 100644 comm/lcal/src/ascendc_kernels/op_def.h create mode 100644 comm/lcal/src/ascendc_kernels/reduce_scatter.h create mode 100644 comm/lcal/src/ascendc_kernels/sync_collectives.h create mode 100644 comm/lcal/src/kernels/coc_dequant_runner.cce create mode 100644 comm/lcal/src/kernels/coc_internal.cce diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h new file mode 100644 index 00000000..e69de29b diff --git 
a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/comm/lcal/src/ascendc_kernels/CMakeLists.txt new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce new file mode 100644 index 00000000..e69de29b diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce new file mode 100644 index 00000000..e69de29b -- Gitee From d97e43e8b2cd407ad65a9e92fb93939b693d00c9 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 14:51:26 +0800 Subject: [PATCH 066/414] Add All2AllHierarchy class implementation for enhanced collective operations --- .../ascendc_kernels/91093/all2all_hierarchy.h | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h index e69de29b..64e54892 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef ALL2ALL_HIERARCHY_H +#define ALL2ALL_HIERARCHY_H + +#include "collectives.h" +#include "sync_collectives.h" +#include "ipc_queue.h" + +using namespace AscendC; + +template +class All2AllHierarchy : protected Collectives { + constexpr static int QUEUE_DEPTH = 2; + constexpr static int32_t STEP_TIMES = 2; + constexpr static int INVALID_RANK_NUM = 0xFFFFFFFF; + constexpr static int INVALID_RANK = 0xFFFFFFFF; + constexpr static const int64_t SIO = 2; + constexpr static int64_t CORE_NUM_PER_STAGE = 16; + constexpr static int64_t MULTI_RANK_SIZE = CORE_NUM_PER_STAGE; + constexpr static int64_t PRODUCER_CORE = 1; + constexpr static int64_t COMSUMER_CORE = 2; + static const int64_t DIE_CHANGE = 1; + +public: + FORCE_INLINE_AICORE All2AllHierarchy(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { + Collectives::Init(KERNELS_ARGS_CALL()); + this->input = (__gm__ T *) input; + this->output = (__gm__ T *) output; + + perRankDataNum = GetDataCount(len, rankSize); + curRankDataNum = perRankDataNum; + InitShare(); + InitCoreGroup(); + InitDataSlice(); + } + FORCE_INLINE_AICORE void Process() + { + if (coreGroup == PRODUCER_CORE) { + Producer(); + } else { + Consumer(); + } + } +private: + FORCE_INLINE_AICORE void InitShare() + { + int64_t queNum = blockNum / STEP_TIMES; + if (rankSize <= CORE_NUM_PER_STAGE) { + queNum = rankSize; + } + if (len < perQueElemLen) { + coreNumPerStage = 1; + } + perQueElemLen = IPC_BUFF_MAX_SIZE / queNum / QUEUE_DEPTH / sizeof(T); + queLen = perQueElemLen * QUEUE_DEPTH; + queSize = queLen * sizeof(T); + } + + FORCE_INLINE_AICORE void InitCoreGroup() + { + coreNumPerRank = 1; + if (len < perQueElemLen) { + coreNumPerRank = 1; + } + coreNumPerStage = coreNumPerRank * rankSize < CORE_NUM_PER_STAGE ? + coreNumPerRank * rankSize : CORE_NUM_PER_STAGE; + rankNumPerCore = CeilDiv(rankSize, coreNumPerStage); + flagNumPerStage = rankSize; + groupCore = (rank / coreNumPerStage) * coreNumPerStage; + if (blockIdx < coreNumPerStage) [ + coreGroup = PRODUCER_CORE; + for (auto i = 0; i < rankNumPerStage; ++i) { + groupCoreIdx[i] = (groupCore + i * coreNumPerRank) % rankSize + blockIdx; + } + ] else if (blockIdx < coreNumPerStage + coreNumPerStage) { + coreGroup = COMSUMER_CORE; + for (auto i = 0; i < rankNumPerStage; ++i) { + int64_t prefix = (groupCore - i * coreNumPerStage) >= 0 ? 
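InitShare carves the fixed IPC buffer into queue entries: IPC_BUFF_MAX_SIZE is divided across queNum queues, each QUEUE_DEPTH entries deep, and each entry is sized in elements of T. The arithmetic, factored out (sketch; the buffer size in the example is illustrative, not the real IPC_BUFF_MAX_SIZE):

    #include <cstdint>

    // Queue-entry sizing as in InitShare(): buffer / queues / depth / element size.
    inline int64_t PerQueElemLen(int64_t ipcBuffMaxSize, int64_t queNum,
                                 int64_t queueDepth, int64_t elemSize) {
        return ipcBuffMaxSize / queNum / queueDepth / elemSize;
    }

    // Example: a 100 MB buffer, 16 queues, depth 2, 2-byte half elements gives
    // PerQueElemLen(100LL << 20, 16, 2, 2) == 1638400 elements per entry.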
+ (groupCore - i * coreNumPerStage) : groupCore + ((rankNumPerCore - 1) * coreNumPerStage); + groupCoreIdx[i] = prefix + blockIdx - coreNumPerStage; + } + } + } + + FORCE_INLINE_AICORE void InitDataSlice() + { + ipcDataNumPreBlock = curRankDataNum; + if (coreGroup == PRODUCER_CORE) { + for (auto i = 0; i < rankNumPerCore; ++i) { + if (groupCoreIdx[i] % SIO = rank % SIO) { + srcInnerQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + (groupCoreIdx[i] % coreNumPerStage) * queSize, queLen, perQueElemLen); + } else { + SrcSioQue[i].Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + + ((groupCoreIdx[i] + (rank - sioRank)) % coreNumPerStage) * queSize, queLen, perQueElemLen); + } + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + } + } else if (coreGroup = CONSUMER_CORE) { + for (auto i = 0; i < rankNumPerCore; ++i) { + computePullRank(groupCoreIdx[i], rank); + if (rank % SIO == 0) { + pullOffset = DIE_CHANGE * groupCoreIdx[i] % SIO; + } else { + pullOffset = groupCoreIdx[i] % SIO - DIE_CHANGE; + } + + pullQue[i].Init(&sync, magic, shareAddrs[pullRank] + IPC_DATA_OFFSET + + (rank % coreNumPerStage) * queSize + pullOffset * queSize, queLen, perQueElemLen); + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + } + } + } + + FORCE_INLINE_AICORE void Producer() + { + for (auto i = 0; i < rankNumPerCore; ++i) { + for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { + Input2IpcSlice(i, sliceIdx); + } + } + } + + FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) + { + inputGt.SetGlobalBuffer((__gm__ T*)input + groupCoreIdx[idx] * ipcDataNumPreBlock, ipcDataNumPreBlock); + copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; + if (copyLen > perQueElemLen) { + copyLen = perQueElemLen; + } else if (copyLen < 0) { + copyLen = 0; + } + if (groupCoreIdx[idx] % SIO == rank % SIO) { + if (idx > 0) { + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), groupCoreIdx[idx - 1] + flagNumPerStage, rank); + } + srcInnerQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); + writeGt = srcInnerQue[idx].EnQue(); + if(copyLen > 0) { + CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx] + (rank - sioRank), sioRank); + } + } + } + FORCE_INLINE_AICORE void Consumer() + { + for (auto i = 0; i < rankNumPerCore; ++i) { + computePullRank(groupCoreIdx[i], rank); + for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { + Ipc2Output(i, sliceIdx); + } + } + } + + FORCE_INLINE_AICORE void computePullRank(int64_t target, int64_t rank) + { + if (rank % SIO == 0) { + pullRank = (target / SIO) * SIO; + } else { + pullRank = (target / SIO) * SIO + DIE_CHANGE; + } + } + + FORCE_INLINE_AICORE void Ipc2Output(int64_t idx, int64_t sliceIdx) + { + outputGt.SetGlobalBuffer((__gm__ T*)output + groupCoreIdx[idx] * ipcDataNumPreBlock, + ipcDataNumPreBlock); + cpOffset = rank % SIO == 0 ? 
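The class runs as two core groups: producers stage local input into the rank's own IPC queues, consumers pull from a peer's queues into local output, and the groups pace each other through per-slice sync flags keyed by (magic, slice sequence, flag index, rank). A host-side model of that handshake, assuming the GM flag operations behave like release/acquire publication (a sketch, not the device implementation):

    #include <atomic>
    #include <cstdint>

    // Model of the SetSyncFlag/WaitSyncFlag pairing between Producer() and
    // Consumer(): the writer publishes an increasing slice sequence number and
    // the reader spins until its expected sequence has been published.
    struct SliceFlag {
        std::atomic<int64_t> seq{-1};
        void Set(int64_t sliceSeq) { seq.store(sliceSeq, std::memory_order_release); }
        void Wait(int64_t sliceSeq) {
            while (seq.load(std::memory_order_acquire) < sliceSeq) { /* spin */ }
        }
    };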
rank + groupCoreIdx[idx] % SIO : + (rank - DIE_CHANGE) + groupCoreIdx[idx] % SIO; + copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; + if (copyLen > perQueElemLen) { + copyLen = perQueElemLen; + } else if (copyLen < 0) { + copyLen = 0; + } + readGt = pullQue[idx].ReadFront(); + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * idx, cpOffset, pullRank); + if (copyLen > 0) { + CpGMPingPong2GM(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); + } + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, cpOffset + flagNumPerStage, pullRank); + } + GlobalTensor inputGt; + GlobalTensor outputGt; + GlobalTensor readGt; + GlobalTensor writeGt; + __gm__ T *input; + __gm__ T *output; + + int atomOp; + IpcQueue srcInnerQue[MULTI_RANK_SIZE]; + IpcQueue SrcSioQue[MULTI_RANK_SIZE]; + IpcQueue pullQue[MULTI_RANK_SIZE]; + int64_t perRankDataNum; + int64_t curRankDataNum; + int64_t ipcDataNumPreBlock; + int64_t pullRank; + int64_t pullOffset; + int64_t sioRank = (rank % 2 == 0 ? rank + 1:rank - 1); + int64_t cpOffset; + int64_t perQueElemLen; + int64_t queLen; + int64_t queSize; + int64_t coreNumPerStage; + int64_t flagNumPerStage; + int64_t coreNumPerRank; + int64_t rankNumPerCore; + int64_t coreGroup; + int64_t groupCoreIdx[MULTI_RANK_SIZE]; + int64_t sliceNum; + int64_t copyLen; + int64_t groupCore; +}; + +#endif // LCCL_ALL2ALL_HIERARCHY_H -- Gitee From 73dba60c234202d458facca2f8f73b9f1752e04e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 15:00:59 +0800 Subject: [PATCH 067/414] Fix header guards, correct variable names, and improve code formatting in All2AllHierarchy class for consistency and clarity --- .../ascendc_kernels/91093/all2all_hierarchy.h | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h index 64e54892..1e99f88c 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef ALL2ALL_HIERARCHY_H -#define ALL2ALL_HIERARCHY_H +#ifndef LCCL_ALL2ALL_HIERARCHY_H +#define LCCL_ALL2ALL_HIERARCHY_H #include "collectives.h" #include "sync_collectives.h" @@ -27,13 +27,14 @@ class All2AllHierarchy : protected Collectives { constexpr static int64_t CORE_NUM_PER_STAGE = 16; constexpr static int64_t MULTI_RANK_SIZE = CORE_NUM_PER_STAGE; constexpr static int64_t PRODUCER_CORE = 1; - constexpr static int64_t COMSUMER_CORE = 2; + constexpr static int64_t CONSUMER_CORE = 2; static const int64_t DIE_CHANGE = 1; public: FORCE_INLINE_AICORE All2AllHierarchy(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} - FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { Collectives::Init(KERNELS_ARGS_CALL()); this->input = (__gm__ T *) input; this->output = (__gm__ T *) output; @@ -60,7 +61,7 @@ private: queNum = rankSize; } if (len < perQueElemLen) { - coreNumPerStage = 1; + coreNumPerRank = 1; } perQueElemLen = IPC_BUFF_MAX_SIZE / queNum / QUEUE_DEPTH / sizeof(T); queLen = perQueElemLen * QUEUE_DEPTH; @@ -78,16 +79,16 @@ private: rankNumPerCore = CeilDiv(rankSize, coreNumPerStage); flagNumPerStage = rankSize; groupCore = (rank / coreNumPerStage) * coreNumPerStage; - if (blockIdx < coreNumPerStage) [ + if (blockIdx < coreNumPerStage) { coreGroup = PRODUCER_CORE; - for (auto i = 0; i < rankNumPerStage; ++i) { - groupCoreIdx[i] = (groupCore + i * coreNumPerRank) % rankSize + blockIdx; + for (auto i = 0; i < rankNumPerCore; ++i) { + groupCoreIdx[i] = (groupCore + i * coreNumPerStage) % rankSize + blockIdx; } - ] else if (blockIdx < coreNumPerStage + coreNumPerStage) { - coreGroup = COMSUMER_CORE; - for (auto i = 0; i < rankNumPerStage; ++i) { + } else if (blockIdx < coreNumPerStage + coreNumPerStage) { + coreGroup = CONSUMER_CORE; + for (auto i = 0; i < rankNumPerCore; ++i) { int64_t prefix = (groupCore - i * coreNumPerStage) >= 0 ? 
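Two of the fixes above are the classic assignment-in-condition bug: coreGroup = CONSUMER_CORE stores the value and tests the result, so the branch is always taken, while groupCoreIdx[i] % SIO = rank % SIO does not even compile (a remainder is not an assignable lvalue). A minimal reproduction of the compilable variant (values hypothetical):

    #include <cstdint>

    int64_t coreGroup = 1;                  // PRODUCER_CORE
    constexpr int64_t CONSUMER_CORE = 2;

    void Dispatch() {
        if (coreGroup = CONSUMER_CORE) { }  // assigns 2; condition is always true
        if (coreGroup == CONSUMER_CORE) { } // the intended comparison
    }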
- (groupCore - i * coreNumPerStage) : groupCore + ((rankNumPerCore - 1) * coreNumPerStage); + (groupCore - i * coreNumPerStage) : groupCore + ((rankNumPerCore - i) * coreNumPerStage); groupCoreIdx[i] = prefix + blockIdx - coreNumPerStage; } } @@ -98,7 +99,7 @@ private: ipcDataNumPreBlock = curRankDataNum; if (coreGroup == PRODUCER_CORE) { for (auto i = 0; i < rankNumPerCore; ++i) { - if (groupCoreIdx[i] % SIO = rank % SIO) { + if (groupCoreIdx[i] % SIO == rank % SIO) { srcInnerQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + (groupCoreIdx[i] % coreNumPerStage) * queSize, queLen, perQueElemLen); } else { @@ -107,7 +108,7 @@ private: } sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); } - } else if (coreGroup = CONSUMER_CORE) { + } else if (coreGroup == CONSUMER_CORE) { for (auto i = 0; i < rankNumPerCore; ++i) { computePullRank(groupCoreIdx[i], rank); if (rank % SIO == 0) { @@ -147,6 +148,17 @@ private: } srcInnerQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); writeGt = srcInnerQue[idx].EnQue(); + if(copyLen > 0) { + CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx] + (rank - sioRank), rank); + } + } else { + if (idx > 0) { + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), + groupCoreIdx[idx - 1] + flagNumPerStage, rank); + } + SrcSioQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); + writeGt = SrcSioQue[idx].EnQue(); if(copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx] + (rank - sioRank), sioRank); @@ -163,7 +175,7 @@ private: } } - FORCE_INLINE_AICORE void computePullRank(int64_t target, int64_t rank) + FORCE_INLINE_AICORE void computePullRank(int64_t& target, int64_t rank) { if (rank % SIO == 0) { pullRank = (target / SIO) * SIO; @@ -187,7 +199,7 @@ private: readGt = pullQue[idx].ReadFront(); sync.WaitSyncFlag(magic, sliceIdx + sliceNum * idx, cpOffset, pullRank); if (copyLen > 0) { - CpGMPingPong2GM(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); + CpGM2GMPingPong(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); } sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, cpOffset + flagNumPerStage, pullRank); } @@ -207,7 +219,7 @@ private: int64_t ipcDataNumPreBlock; int64_t pullRank; int64_t pullOffset; - int64_t sioRank = (rank % 2 == 0 ? rank + 1:rank - 1); + int64_t sioRank = (rank % 2 == 0) ? 
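Throughout these kernels each rank is paired with its SIO neighbor: rank + 1 for even ranks, rank - 1 for odd. With SIO equal to 2, that ternary is simply the rank with its lowest bit flipped, which an explicit helper makes obvious (equivalent form for non-negative ranks; sketch):

    #include <cstdint>

    // Equivalent to (rank % 2 == 0) ? rank + 1 : rank - 1 for rank >= 0.
    inline int64_t SioPeer(int64_t rank) { return rank ^ 1; }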
rank + 1:rank - 1; int64_t cpOffset; int64_t perQueElemLen; int64_t queLen; -- Gitee From a0e644bf52b135bc01495caa0c8a810d1e31efbc Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 15:02:56 +0800 Subject: [PATCH 068/414] Fix synchronization logic in All2AllHierarchy class for correct rank handling and improved data flow --- comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h index 1e99f88c..f01b2b20 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h @@ -150,14 +150,14 @@ private: writeGt = srcInnerQue[idx].EnQue(); if(copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); - sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx] + (rank - sioRank), rank); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx], rank); } } else { if (idx > 0) { sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), - groupCoreIdx[idx - 1] + flagNumPerStage, rank); + groupCoreIdx[idx - 1] + flagNumPerStage + (rank - sioRank), sioRank); } - SrcSioQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); + SrcSioQue[idx].DeQue(sioRank, groupCoreIdx[idx] + (rank - sioRank) + flagNumPerStage); writeGt = SrcSioQue[idx].EnQue(); if(copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); -- Gitee From b5a3a18b93884fa16c2f23e8b40a81795a140b8d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 15:46:10 +0800 Subject: [PATCH 069/414] Add All2AllHierarchySmall class implementation for enhanced collective operations and data handling --- .../91093/all2all_hierarchy_small.h | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h index e69de29b..df6bb7ed 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef LCCL_ALL2ALL_HIERARCHY_H +#define LCCL_ALL2ALL_HIERARCHY_H + +#include "collectives.h" +#include "sync_collectives.h" +#include "ipc_queue.h" + +using namespace AscendC; + +template +class All2AllHierarchySmall : protected Collectives { + constexpr static int QUEUE_DEPTH = 2; + constexpr static int32_t STEP_TIMES = 2; + constexpr static int INVALID_RANK_NUM = 0xFFFFFFFF; + constexpr static int INVALID_RANK = 0xFFFFFFFF; + constexpr static int64_t CORE_NUM_PER_STAGE = 16; + constexpr static int64_t PRODUCER_CORE = 1; + constexpr static int64_t CONSUMER_CORE = 2; + static const int64_t SIO = 2; + +public: + FORCE_INLINE_AICORE All2AllHierarchySmall(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + this->input = (__gm__ T *) input; + this->output = (__gm__ T *) output; + + curRankDataNum = GetDataCount(len, rankSize); + InitShare(); + InitCoreGroup(); + InitDataSlice(); + } + FORCE_INLINE_AICORE void Process() + { + if (coreGroup == PRODUCER_CORE) { + Producer(); + } else { + Consumer(); + } + } +private: + FORCE_INLINE_AICORE void InitShare() + { + coreNumPerStage = CORE_NUM_PER_STAGE; + singleStage = coreNumPerStage / SIO; + perQueElemLen = IPC_BUFF_MAX_SIZE / SIO / singleStage / QUEUE_DEPTH / sizeof(T); + queLen = perQueElemLen * QUEUE_DEPTH; + queSize = queLen * sizeof(T); + queBlockSize = IPC_BUFF_MAX_SIZE / SIO; + } + + FORCE_INLINE_AICORE void InitCoreGroup() + { + if (len < perQueElemLen) { + coreNumPerRank = 1; + } + loopCount = rankSize / SIO; + flagNumPerStage = coreNumPerStage; + if (blockIdx < coreNumPerStage) { + coreGroup = PRODUCER_CORE; + groupCoreIdx = blockIdx; + } else if (blockIdx < coreNumPerStage + coreNumPerStage) { + coreGroup = CONSUMER_CORE; + groupCoreIdx = blockIdx - coreNumPerStage; + } + } + + FORCE_INLINE_AICORE void InitDataSlice() + { + ipcDataNumPreBlock = GetDataCount(curRankDataNum, singleStage); + int64_t ifOffSet = queBlockSize * (rank % SIO); + if (coreGroup == PRODUCER_CORE) { + for (auto i = 0; i < loopCount; ++i) { + if (groupCoreIdx < singelStage) { + srcLocalQue1.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ifOffSet + + group CoreIdx * queSize, queLen, perQueElemLen); + } else { + srcLocalQue1.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + ifOffSet + + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); + } + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + } + } + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + } + + FORCE_INLINE_AICORE void Producer() + { + for (auto i = 0; i < loopCount; ++i) { + srcRank = (rank + i % SIO) % rankSize; + sioSrcRank = (srcRank % SIO == 0) ? srcRank + 1 : srcRank - 1; + srcLocalQue = srcLocalQue1; + srcSioQue = srcSioQue1; + for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { + Input2IpcSlice(i, sliceIdx); + } + } + } + + FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) + { + copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; + if (copyLen > perQueElemLen) { + copyLen = perQueElemLen; + } else if (copyLen < 0) { + copyLen = 0; + } + int64_t flagIdx = groupCoreIdx + (rank % SIO) * singleStage; + if (groupCoreIdx < singleStage) { + if (idx > 0) { + int64_t waitRank = (srcRank - SIO) >= 0 ? 
(srcRank - SIO) : srcRank + ((loopCount - 1) * SIO); + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), flagIdx + (waitRank / SIO) * coreNumPerStage + + flagNumPerStage, rank); + } + inputGt.SetGlobalBuffer((__gm__ T*)input + srcRank * curRankDataNum + groupCoreIdx * ipcDataNumPreBlock, + ipcDataNumPreBlock); + srcLocalQue.DeQue(rank, flagIdx + (srcRank / SIO) * coreNumPerStage + flagNumPerStage); + writeGt = srcLocalQue.EnQue(); + if(copyLen > 0) { + CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, flagIdx, rank); + } + } else { + flagIdx = flagIdx - singleStage; + if (idx > 0) { + int64_t waitRank = (sioSrcRank - SIO) >= 0 ? (sioSrcRank - SIO) : sioSrcRank + ((loopCount - 1) * SIO); + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), flagIdx + (waitRank / SIO) * coreNumPerStage + + flagNumPerStage + (rank - sioRank), sioRank); + } + inputGt.SetGlobalBuffer((__gm__ T*)input + sioSrcRank * curRankDataNum + + (groupCoreIdx - singleStage) * ipcDataNumPreBlock, ipcDataNumPreBlock); + srcSioQue[idx].DeQue(sioRank, flagIdx + (sioSrcRank / SIO) * coreNumPerStage + + flagNumPerStage); + writeGt = srcSioQue.EnQue(); + if(copyLen > 0) { + CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * + perQueElemLen], writeGt, Op::COPYONLY); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, flagIdx, sioRank); + } + } + } + FORCE_INLINE_AICORE void Consumer() + { + for (auto i = 0; i < loopCount; ++i) { + destRank = (rank - i * SIO) >= 0 ? (rank - i * SIO) : rank + ((loopCount - + i) * SIO); + if (groupCoreIdx < singleStage) { + detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + queBlockSize + + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); + } + for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { + Ipc2Output(i, sliceIdx); + } + } + } + + FORCE_INLINE_AICORE void Ipc2Output(int64_t idx, int64_t sliceIdx) + { + outputGt.SetGlobalBuffer((__gm__ T*)output + (destRank / SIO) * SIO * curRankDataNum + + groupCoreIdx * ipcDataNumPreBlock, ipcDataNumPreBlock); + copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; + if (copyLen > perQueElemLen) { + copyLen = perQueElemLen; + } else if (copyLen < 0) { + copyLen = 0; + } + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx, destRank); + if (groupCoreIdx < singleStage) { + readGt = detHccsQue.ReadFront(); + } else { + readGt = detHccsSioQue.ReadFront(); + } + DpGM2GMPingPong(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); + sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx + flagNumPerStage + + (rank / SIO) * coreNumPerStage, destRank); + } + GlobalTensor inputGt; + GlobalTensor outputGt; + GlobalTensor writeGt; + GlobalTensor outputGt; + __gm__ T *input; + __gm__ T *output; + + int atomOp; + IpcQueue srcLocalQue; + IpcQueue SrcSioQue; + IpcQueue detHccsQue; + IpcQueue detHccsSioQue; + IpcQueue srcLocalQue1; + IpcQueue srcSioQue1; + + int64_t loopCount; + int64_t queBlockSize; + int64_t srcRank; + int64_t sioSrcRank; + int64_t destRank; + int64_t singleStage; + int64_t curRankDataNum; + int64_t ipcDataNumPreBlock; + int64_t sioRank = (rank % 2 == 0) ? 
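Input2IpcSlice and Ipc2Output compute the per-slice copy length the same way in both hierarchy kernels: the remainder of the block at this slice index, clamped into [0, perQueElemLen], so the final slice comes out short and any overrun degrades to a zero-length copy. Factored out (sketch):

    #include <algorithm>
    #include <cstdint>

    // The repeated copyLen computation as a helper.
    inline int64_t SliceCopyLen(int64_t totalNum, int64_t perQueElemLen, int64_t sliceIdx) {
        return std::clamp<int64_t>(totalNum - perQueElemLen * sliceIdx, 0, perQueElemLen);
    }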
rank + 1:rank - 1; + int64_t perQueElemLen; + int64_t queLen; + int64_t queSize; + int64_t coreNumPerStage; + int64_t flagNumPerStage; + int64_t coreNumPerRank; + int64_t coreGroup; + int64_t groupCoreIdx; + int64_t sliceNum; + int64_t copyLen; +}; + +#endif // LCCL_ALL2ALL_HIERARCHY_SMALL_H -- Gitee From 32de0a41888945e1ad77658105241bffd951b697 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 16:01:28 +0800 Subject: [PATCH 070/414] Fix header guards, correct variable names, and improve code formatting in All2AllHierarchySmall class for consistency and clarity --- .../91093/all2all_hierarchy_small.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h index df6bb7ed..927a4c49 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#ifndef LCCL_ALL2ALL_HIERARCHY_H -#define LCCL_ALL2ALL_HIERARCHY_H +#ifndef LCCL_ALL2ALL_HIERARCHY_SMALL_H +#define LCCL_ALL2ALL_HIERARCHY_SMALL_H #include "collectives.h" #include "sync_collectives.h" @@ -26,7 +26,7 @@ class All2AllHierarchySmall : protected Collectives { constexpr static int64_t CORE_NUM_PER_STAGE = 16; constexpr static int64_t PRODUCER_CORE = 1; constexpr static int64_t CONSUMER_CORE = 2; - static const int64_t SIO = 2; + constexpr static int64_t SIO = 2; public: FORCE_INLINE_AICORE All2AllHierarchySmall(int rank, int rankSize, uint32_t extraFlag) @@ -83,11 +83,11 @@ private: int64_t ifOffSet = queBlockSize * (rank % SIO); if (coreGroup == PRODUCER_CORE) { for (auto i = 0; i < loopCount; ++i) { - if (groupCoreIdx < singelStage) { + if (groupCoreIdx < singeStage) { srcLocalQue1.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ifOffSet + - group CoreIdx * queSize, queLen, perQueElemLen); + groupCoreIdx * queSize, queLen, perQueElemLen); } else { - srcLocalQue1.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + ifOffSet + + srcSioQue1.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + ifOffSet + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); } sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); @@ -99,7 +99,7 @@ private: FORCE_INLINE_AICORE void Producer() { for (auto i = 0; i < loopCount; ++i) { - srcRank = (rank + i % SIO) % rankSize; + srcRank = (rank + i * SIO) % rankSize; sioSrcRank = (srcRank % SIO == 0) ? srcRank + 1 : srcRank - 1; srcLocalQue = srcLocalQue1; srcSioQue = srcSioQue1; @@ -137,16 +137,14 @@ private: if (idx > 0) { int64_t waitRank = (sioSrcRank - SIO) >= 0 ? 
(sioSrcRank - SIO) : sioSrcRank + ((loopCount - 1) * SIO); sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), flagIdx + (waitRank / SIO) * coreNumPerStage - + flagNumPerStage + (rank - sioRank), sioRank); + + flagNumPerStage, sioRank); } inputGt.SetGlobalBuffer((__gm__ T*)input + sioSrcRank * curRankDataNum + (groupCoreIdx - singleStage) * ipcDataNumPreBlock, ipcDataNumPreBlock); - srcSioQue[idx].DeQue(sioRank, flagIdx + (sioSrcRank / SIO) * coreNumPerStage + - flagNumPerStage); + srcSioQue.DeQue(sioRank, flagIdx + (sioSrcRank / SIO) * coreNumPerStage + flagNumPerStage); writeGt = srcSioQue.EnQue(); if(copyLen > 0) { - CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * - perQueElemLen], writeGt, Op::COPYONLY); + CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, flagIdx, sioRank); } } @@ -154,10 +152,12 @@ private: FORCE_INLINE_AICORE void Consumer() { for (auto i = 0; i < loopCount; ++i) { - destRank = (rank - i * SIO) >= 0 ? (rank - i * SIO) : rank + ((loopCount - - i) * SIO); + destRank = (rank - i * SIO) >= 0 ? (rank - i * SIO) : rank + ((loopCount - i) * SIO); if (groupCoreIdx < singleStage) { - detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + queBlockSize + + detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + + groupCoreIdx * queSize, queLen, perQueElemLen); + } else { + detHccsSioQue.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + queBlockSize + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); } for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { @@ -182,12 +182,12 @@ private: } else { readGt = detHccsSioQue.ReadFront(); } - DpGM2GMPingPong(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); + CpGM2GMPingPong(copyLen * sizeof(T), readGt, outputGt[sliceIdx * perQueElemLen], Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx + flagNumPerStage + (rank / SIO) * coreNumPerStage, destRank); } GlobalTensor inputGt; - GlobalTensor outputGt; + GlobalTensor readGt; GlobalTensor writeGt; GlobalTensor outputGt; __gm__ T *input; @@ -195,7 +195,7 @@ private: int atomOp; IpcQueue srcLocalQue; - IpcQueue SrcSioQue; + IpcQueue srcSioQue; IpcQueue detHccsQue; IpcQueue detHccsSioQue; IpcQueue srcLocalQue1; -- Gitee From 9200d2ba7b29ceb73b03003f46b79fce28506b3d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 16:02:41 +0800 Subject: [PATCH 071/414] Fix typo in condition check and correct initialization parameters in All2AllHierarchySmall class for accurate data handling --- comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h index 927a4c49..db6b9b43 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h @@ -83,7 +83,7 @@ private: int64_t ifOffSet = queBlockSize * (rank % SIO); if (coreGroup == PRODUCER_CORE) { for (auto i = 0; i < loopCount; ++i) { - if (groupCoreIdx < singeStage) { + if (groupCoreIdx < singleStage) { srcLocalQue1.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ifOffSet + groupCoreIdx * queSize, queLen, perQueElemLen); } else { @@ -157,7 +157,7 @@ private: detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + groupCoreIdx * 
queSize, queLen, perQueElemLen); } else { - detHccsSioQue.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + queBlockSize + + detHccsSioQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + queBlockSize + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); } for (auto sliceIdx = 0; sliceIdx < sliceNum; ++sliceIdx) { -- Gitee From 4851e128b7074e5fae094d6a3a0c97f94e44e903 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 16:29:14 +0800 Subject: [PATCH 072/414] Add AllReduceOneShot class implementation for enhanced collective operations and data handling --- .../src/ascendc_kernels/allreduce_one_shot.h | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h index e69de29b..732856bc 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef LCCL_ALLREDUCE_ONE_SHOT_H +#define LCCL_ALLREDUCE_ONE_SHOT_H + +#include "sync_collectives.h" +#include "allreduce_quant.h" + +using namespace AscendC; +template +class AllReduceOneShot : protected AllReduceQuant { + constexpr static T oneCast = (T) 1; + +public: + FORCE_INLINE_AICORE AllReduceOneShot(int rank, int rankSize, uint32_t extraFlag) + : AllReduceQuant(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + if constexpr(!std::is_same_v) { + BuildScaleOffset(scale, scaleCount, offset); + } + atomOp = op; + blockNum = blockNum / rankSize * rankSize; + if (blockIdx >= rankSize) { + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + return; + } + + corePerRank = blockNum / rankSize; + rankiDOfBlock = blockIdx / corePerRank; + + dataDMAPerCore = len / rankSize / corePerRank / scaleNum * scaleNum; + dataReducePerCore = len / corePerRank / scaleNum * scaleNum; + + blockDataNum = dataDMAPerCore; + if (blockIdx == rankSize * corePerRank - 1) { + blockDataNum = len - blockIdx * dataDMAPerCore; + } + + blockReduceNum = dataReducePerCore; + if (blockIdx % corePerRank == corePerRank - 1) { + blockReduceNum = len - blockIdx * corePerRank * dataReducePerCore; + } + + __gm__ U* curRankGm = (__gm__ U*)shareAddrs[rank] + IPC_DATA_OFFSET / sizeof(U); + __gm__ U* peerRankGm = (__gm__ U*)shareAddrs[rankIDOfBlock] + IPC_DATA_OFFSET / sizeof(U); + __gm__ U* inputGm = (__gm__ U*)input; + __gm__ U* outputGm = (__gm__ U*)output; + + srcInputGt.SetGlobalBuffer(inputGm + blockIdx * dataDMAPerCore); + dstIPCGlobal.SetGlobalBuffer(curRankGm + blockIdx * dataDMAPerCore); + copyOutGlobal.SetGlobalBuffer(outputGm + blockIdx * dataDMAPerCore); + srcIPCGlobal.SetGlobalBuffer(peerRankGm + blockIdx % corePerRank * dataReducePerCore); + dstOutputGlobal.SetGlobalBuffer(outputGm + blockIdx % corePerRank * 
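AllReduceOneShot::Init distributes work by first rounding blockNum down to a multiple of rankSize, giving each rank corePerRank cores; per-core DMA chunks are aligned down to the quantization scale granularity, and the final core absorbs whatever remains of len. The split as a standalone sketch (assumes blockNum >= rankSize and scaleNum >= 1):

    #include <cstdint>

    struct OneShotSplit {
        int32_t corePerRank;    // cores serving each rank
        int64_t dataDMAPerCore; // elements per core, aligned to scaleNum
        int64_t lastBlockNum;   // remainder taken by the final core
    };

    // Sketch of the partitioning in AllReduceOneShot::Init().
    inline OneShotSplit SplitOneShot(int64_t len, int32_t rankSize,
                                     int32_t blockNum, int64_t scaleNum) {
        blockNum = blockNum / rankSize * rankSize;    // drop unusable tail cores
        const int32_t corePerRank = blockNum / rankSize;
        const int64_t perCore = len / rankSize / corePerRank / scaleNum * scaleNum;
        return { corePerRank, perCore, len - (blockNum - 1) * perCore };
    }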
dataReducePerCore); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + if (blockIdx >= rankSize) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + } + CpInputToBuffAndOutput(); + sync.SetInnerFlag(magic, 1); + + sync.WaitRankInnerFlag(magic, 1, rank); + sync.WaitRankInnerFlag(magic, 1, rankIDOfBlock); + if (rankIDOfBlock != rank) { + if constexpr (std::is_same_v) { + if (!isEnableScale) { + Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop); + } else if (!isVectorScale){ + CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop, firstScale, offset); + } else { + CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop, scaleGt, scaleNum, offset); + } + } else { + Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop); + } + + } else { + GlobalTensor inputIpcGtTmp; + inputIpcGtTmp.SetGlobalBuffer((__gm__ U*)inputIpcGt.GetPhyAddr()); + Collectives::CpGM2GM(inputIpcGtTmp, inputGt, dataNumPreBlock, COPYONLY); + } + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } + + FORCE_INLINE_AICORE void CpInputToBuffAndOutput() + { + Collectives::CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY); + if constexpr (std::is_same_v) { + if (!isEnableScale) { + Collectives::CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY); + } else if (!isVectorScale){ + CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY, firstScale, offset); + } else { + CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY, scaleGt, scaleNum, offset); + } + } else { + Collectives::CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, -1); + } + } + +protected: + GlobalTensor srcInputGlobal; + GlobalTensor srcIPCGlobal; + GlobalTensor dstIPCGlobal; + GlobalTensor dstOutputGlobal; + GlobalTensor copyOutputGlobal; + + int rankIDOfBlock; + int corePerRank; + int dataDMAPerCore; + int dataReducePerCore; + int blockDataNum; + int blockReduceNum; + int atomOp; + GlobalTensor scaleGt; + int64_t scaleNum = 1; + T firstScale = 1; + T offset = 0; + bool isEnableScale = false; + bool isVectorScale = false; + +private: + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) + { + if (scale != nullptr && offset != nullptr) { + scaleGt.SetGlobalBuffer((__gm__ T*)scale); + this->firstScale = scaleGt.GetValue(0); + this->scaleNum = scaleCount < 1 ? 
1 : scaleCount; + this->offset =* reinterpret_cast<__gm__ T*>(offset); + isVectorScale = scaleCount > 1; + isEnableScale = scaleCount > 0 && !(*(uint16_t *)(&(this->offset)) == 0 && + scaleCount == 1 && *(uint16_t *)(&firstScale) == *(uint16_t *)(&oneCast)); + } + } +}; + +#endif // LCCL_ALLREDUCE_ONE_SHOT_H \ No newline at end of file -- Gitee From b5a8245736478f27b4ac8bc3b209d4b570a2cc8f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 16:34:32 +0800 Subject: [PATCH 073/414] Fix logic errors and improve variable handling in AllReduceOneShot class for accurate data processing --- .../src/ascendc_kernels/allreduce_one_shot.h | 46 ++++++++----------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h index 732856bc..0a3bebb4 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h @@ -31,13 +31,13 @@ public: } atomOp = op; blockNum = blockNum / rankSize * rankSize; - if (blockIdx >= rankSize) { + if (blockIdx >= blockNum) { DumpLcclLogInfo(LogId::INIT, static_cast(op)); return; } corePerRank = blockNum / rankSize; - rankiDOfBlock = blockIdx / corePerRank; + rankIDOfBlock = blockIdx / corePerRank; dataDMAPerCore = len / rankSize / corePerRank / scaleNum * scaleNum; dataReducePerCore = len / corePerRank / scaleNum * scaleNum; @@ -49,17 +49,17 @@ public: blockReduceNum = dataReducePerCore; if (blockIdx % corePerRank == corePerRank - 1) { - blockReduceNum = len - blockIdx * corePerRank * dataReducePerCore; + blockReduceNum = len - blockIdx % corePerRank * dataReducePerCore; } __gm__ U* curRankGm = (__gm__ U*)shareAddrs[rank] + IPC_DATA_OFFSET / sizeof(U); __gm__ U* peerRankGm = (__gm__ U*)shareAddrs[rankIDOfBlock] + IPC_DATA_OFFSET / sizeof(U); - __gm__ U* inputGm = (__gm__ U*)input; - __gm__ U* outputGm = (__gm__ U*)output; + __gm__ U* intputGm = (__gm__ U*)input; + __gm__ T* outputGm = (__gm__ U*)output; - srcInputGt.SetGlobalBuffer(inputGm + blockIdx * dataDMAPerCore); + srcInputGlobal.SetGlobalBuffer(inputGm + blockIdx * dataDMAPerCore); dstIPCGlobal.SetGlobalBuffer(curRankGm + blockIdx * dataDMAPerCore); - copyOutGlobal.SetGlobalBuffer(outputGm + blockIdx * dataDMAPerCore); + copyOutputGlobal.SetGlobalBuffer(outputGm + blockIdx * dataDMAPerCore); srcIPCGlobal.SetGlobalBuffer(peerRankGm + blockIdx % corePerRank * dataReducePerCore); dstOutputGlobal.SetGlobalBuffer(outputGm + blockIdx % corePerRank * dataReducePerCore); DumpLcclLogInfo(LogId::INIT, static_cast(op)); @@ -68,7 +68,7 @@ public: FORCE_INLINE_AICORE void Process() { DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); - if (blockIdx >= rankSize) { + if (blockIdx >= blockNum) { DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); return; } @@ -78,22 +78,16 @@ public: sync.WaitRankInnerFlag(magic, 1, rank); sync.WaitRankInnerFlag(magic, 1, rankIDOfBlock); if (rankIDOfBlock != rank) { - if constexpr (std::is_same_v) { + if constexpr (!std::is_same_v) { if (!isEnableScale) { - Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop); + Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp); } else if (!isVectorScale){ - CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop, firstScale, offset); + CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp, firstScale, offset); } else { - CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop, scaleGt, scaleNum, offset); + 
CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp, scaleGt, scaleNum, offset); } } else { - Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomop); - } - - } else { - GlobalTensor inputIpcGtTmp; - inputIpcGtTmp.SetGlobalBuffer((__gm__ U*)inputIpcGt.GetPhyAddr()); - Collectives::CpGM2GM(inputIpcGtTmp, inputGt, dataNumPreBlock, COPYONLY); + Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp); } } DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); @@ -101,17 +95,17 @@ public: FORCE_INLINE_AICORE void CpInputToBuffAndOutput() { - Collectives::CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY); - if constexpr (std::is_same_v) { + Collectives::CpGM2GM(dstIPCGlobal, srcInputGlobal, blockDataNum, COPYONLY); + if constexpr (!std::is_same_v) { if (!isEnableScale) { - Collectives::CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY); + Collectives::CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY); } else if (!isVectorScale){ - CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY, firstScale, offset); + CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY, firstScale, offset); } else { - CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, COPYONLY, scaleGt, scaleNum, offset); + CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY, scaleGt, scaleNum, offset); } } else { - Collectives::CpGM2GM(copyOutputGt, srcInputGlobal, blockDataNum, -1); + Collectives::CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, -1); } } @@ -140,10 +134,10 @@ private: FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) { if (scale != nullptr && offset != nullptr) { + this->offset =* reinterpret_cast<__gm__ T*>(offset); scaleGt.SetGlobalBuffer((__gm__ T*)scale); this->firstScale = scaleGt.GetValue(0); this->scaleNum = scaleCount < 1 ? 
1 : scaleCount; - this->offset =* reinterpret_cast<__gm__ T*>(offset); isVectorScale = scaleCount > 1; isEnableScale = scaleCount > 0 && !(*(uint16_t *)(&(this->offset)) == 0 && scaleCount == 1 && *(uint16_t *)(&firstScale) == *(uint16_t *)(&oneCast)); -- Gitee From 836d9560a7595572524c562eb61ab6ec6a1dd48d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 16:35:30 +0800 Subject: [PATCH 074/414] Fix type casting issue and improve buffer initialization in AllReduceOneShot class for correct data handling --- comm/lcal/src/ascendc_kernels/allreduce_one_shot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h index 0a3bebb4..259297f1 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h @@ -55,9 +55,9 @@ public: __gm__ U* curRankGm = (__gm__ U*)shareAddrs[rank] + IPC_DATA_OFFSET / sizeof(U); __gm__ U* peerRankGm = (__gm__ U*)shareAddrs[rankIDOfBlock] + IPC_DATA_OFFSET / sizeof(U); __gm__ U* intputGm = (__gm__ U*)input; - __gm__ T* outputGm = (__gm__ U*)output; + __gm__ T* outputGm = (__gm__ T*)output; - srcInputGlobal.SetGlobalBuffer(inputGm + blockIdx * dataDMAPerCore); + srcInputGlobal.SetGlobalBuffer(intputGm + blockIdx * dataDMAPerCore); dstIPCGlobal.SetGlobalBuffer(curRankGm + blockIdx * dataDMAPerCore); copyOutputGlobal.SetGlobalBuffer(outputGm + blockIdx * dataDMAPerCore); srcIPCGlobal.SetGlobalBuffer(peerRankGm + blockIdx % corePerRank * dataReducePerCore); -- Gitee From 1a91e20fc25da773992ad522ae43f282e7d60e95 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 20:15:14 +0800 Subject: [PATCH 075/414] Add initial implementation of communication argument macros and magic setting for enhanced collective operations --- comm/lcal/src/ascendc_kernels/op_def.h | 114 +++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h index e69de29b..178aedfa 100644 --- a/comm/lcal/src/ascendc_kernels/op_def.h +++ b/comm/lcal/src/ascendc_kernels/op_def.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#define GET_COMM_ARGS \
+    GlobalTensor commArgsGm;
+    commArgsGm.SetGlobalBuffer(reinterpret_cast<__gm__ int *>(commArgs), 5); \
+    int rank = commArgsGm.GetValue(0); \
+    int localRank = commArgsGm.GetValue(1); \
+    int rankSize = commArgsGm.GetValue(2); \
+    int localRankSize = commArgsGm.GetValue(3); \
+    uint32_t extraFlag = commArgsGm.GetValue(4); \
+    GM_ADDR dumpAddr = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->dumpAddr; \
+    int32_t lcalBlockNum = GetBlockNum()
+
+#ifdef ENABLE_LCCL_MIX
+#define SET_MAGIC \
+do { \
+    __gm__ CommArgs * commArgsTmp = reinterpret_cast<__gm__ CommArgs *>(commArgs); \
+    PipeBarrier(); \
+    SetAtomicNone(); \
+    SetMaskNormImpl(); \
+    SetSyncBaseAddr(commArgsTmp->fftsVal); \
+    SetVectorMask((uint64_t)-1, (uint64_t)-1); \
+    PipeBarrier(); \
+    LocalTensor localSet; \
+    localSet.address_.logicPos = static_cast(TPosition::VECIN); \
+    GlobalTensor magicGt; \
+    magicGt.SetGlobalBuffer((__gm__ int32_t *)commArgsTmp0>magics); \
+    if (GetBlockIdx() == 0) { \
+        SetAtomicOpType(Op::ADD); \
+        localSet.SetValue(0, 1); \
+        AscendC::SetFlag(EVENT_ID0); \
+        AscendC::WaitFlag(EVENT_ID0); \
+        DataCopyExtParams dataCopyParams(1, sizeof(int32_t), 0, 0, 0); \
+        DataCopyPad(magicGt[rankSize - 1], localSet, dataCopyParams); \
+        AscendC::SetAtomicNont(); \
+        PipeBarrier(); \
+    } \
+    SyncAll(); \
+    DataCopyExtParams dataCopyParams(1, sizeof(int32_t), 0, 0, 0); \
+    DataCopyPadExtParams padParams; \
+    DataCopyPad(localSet, magicGt[rankSize - 1], dataCopyparams, padParams); \
+    AscendC::SetFlag(EVENT_ID0); \
+    AscendC::WaitFlag(EVENT_ID0); \
+    magic = static_cast(localSet.GetValue(0)); \
+    PipeBarrier(); \
+    constexpr int32_t aivNumPerAic = 2; \
+    lcalBlockNum = GetBlockNum() * aivNumPerAic; \
+} while (0)
+#else
+#define SET_MAGIC \
+do {} while (0)
+#endif
+
+#define GET_IPC_MEM_ARGS(type) \
+do { \
+    SET_MAGIC; \
+    GlobalTensor peerMemsAddrGm; \
+    peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CommArgs *>(commArgs))->peerMems[0], \
+        LCAL_MAX_RANK_SIZE); \
+    for (int i = 0; i < rankSize; ++i) { \
+        shareAddrs[i] = (__gm__ type *) (peerMemsAddrGm.GetValue(i) + \
+            (magic % PING_PONG_SIZE) * (IPC_BUFF_MAX_SIZE + IPC_DATA_OFFSET)); \
+    } \
+    AscendC::PipeBarrier(); \
+} while (0) \
+
+#define CLASS_OP_LAUNCH(name, type) \
+do { \
+    name opKernel(rank, rankSize, extraFlag); \
+    opKernel.Init(KERNELS_ARGS_CALL()); \
+    opKernel.Process(); \
+} while (0)
+
+#define CLASS_OP_QUANT_910A5_LAUNCH(name, outputType, addType, inputType) \
+do { \
+    name opKernel(rank, rankSize, extraFlag); \
+    opKernel.Init(KERNELS_ARGS_CALL()); \
+    opKernel.Process(); \
+} while (0)
+
+#define LCCL_TYPE_FUNC(fun) \
+    fun(int,);fun(int8_t,);fun(int16_t,);fun(int64_t,); \
+    fun(float,);fun(float16_t,);fun(bfloat16,)
+
+#ifdef ENABLE_LCCL_MIX
+#define LCCL_TYPE_AIC_FUNC(fun) \
+    fun(int, _mix_aic); fun(int8_t, _mix_aic); fun(int16_t, _mix_aic); fun(int64_t, _mix_aic); \
+    fun(float, _mix_aic); fun(float16_t, _mix_aic); fun(bfloat16, _mix_aic);
+
+#define LCCL_TYPE_AIV_FUNC(fun) \
+    fun(int, _mix_aiv); fun(int8_t, _mix_aiv); fun(int16_t, _mix_aiv); fun(int64_t, _mix_aiv); \
+    fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16, _mix_aiv);
+
+#define LCCL_TYPE_AIC_FUNC(fun) \
+    (void)0
+
+#define LCCL_TYPE_AIV_FUNC(fun) \
+    fun(int, _mix_aiv); fun(int8_t, _mix_aiv); fun(int16_t, _mix_aiv); fun(int64_t, _mix_aiv); \
+    fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16, _mix_aiv);
+#endif
+
+#define LCCL_VADD_910B_TYPE_FUNC(fun) \
+    fun(int);fun(int16_t); \
+    fun(float);fun(float16_t)
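
The LCCL_*_FUNC lists above are X-macros: each one applies a caller-supplied fun(type, suffix) to every supported element type, so a single invocation stamps out one entry point per dtype, and the _mix_aic/_mix_aiv suffixes keep the cube-core and vector-core flavours apart. The short host-side demo below shows the same expansion mechanics and compiles with any C++11 compiler; DEMO_TYPE_FUNC and DECLARE_PRINTER are invented names for illustration only and are not part of op_def.h.

#include <cstdio>

// The list macro: apply fun(type, suffix) once per supported type.
#define DEMO_TYPE_FUNC(fun) \
    fun(int, _mix_aiv); fun(float, _mix_aiv)

// One possible "fun": define a function for each (type, suffix) pair.
#define DECLARE_PRINTER(type, suffix)                              \
    void print_size##suffix##_##type()                             \
    {                                                              \
        std::printf(#type " occupies %zu bytes\n", sizeof(type));  \
    }

DEMO_TYPE_FUNC(DECLARE_PRINTER)

int main()
{
    // Both functions below were generated by the single expansion above.
    print_size_mix_aiv_int();
    print_size_mix_aiv_float();
    return 0;
}

Expanding DEMO_TYPE_FUNC(DECLARE_PRINTER) emits both functions at file scope, which is exactly how one type list can fan out into a whole family of kernel entry points without repeating the body per type.
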
+
+#define LCCL_QUANT_LOW_TYPE_FUNC(fun) \
+    fun(int8_t)
\ No newline at end of file
--
Gitee


From 6524ac4fd558b3afc376d760b68a325a3b4d797c Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Tue, 19 Aug 2025 20:21:35 +0800
Subject: [PATCH 076/414] 1

---
 comm/lcal/src/ascendc_kernels/op_def.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h
index 178aedfa..16fa3055 100644
--- a/comm/lcal/src/ascendc_kernels/op_def.h
+++ b/comm/lcal/src/ascendc_kernels/op_def.h
@@ -8,7 +8,7 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  */
 #define GET_COMM_ARGS \
-    GlobalTensor commArgsGm;
+    GlobalTensor commArgsGm; \
     commArgsGm.SetGlobalBuffer(reinterpret_cast<__gm__ int *>(commArgs), 5); \
     int rank = commArgsGm.GetValue(0); \
     int localRank = commArgsGm.GetValue(1); \
@@ -30,8 +30,9 @@ do { \
     PipeBarrier(); \
     LocalTensor localSet; \
     localSet.address_.logicPos = static_cast(TPosition::VECIN); \
+    localSet.address_.bufferAddr = reinterpret_cast((__ubuf__ int32_t *)96); \
     GlobalTensor magicGt; \
-    magicGt.SetGlobalBuffer((__gm__ int32_t *)commArgsTmp0>magics); \
+    magicGt.SetGlobalBuffer((__gm__ int32_t *)commArgsTmp->magics); \
     if (GetBlockIdx() == 0) { \
         SetAtomicOpType(Op::ADD); \
         localSet.SetValue(0, 1); \
@@ -39,7 +40,7 @@ do { \
     AscendC::WaitFlag(EVENT_ID0); \
     DataCopyExtParams dataCopyParams(1, sizeof(int32_t), 0, 0, 0); \
     DataCopyPad(magicGt[rankSize - 1], localSet, dataCopyParams); \
-    AscendC::SetAtomicNont(); \
+    AscendC::SetAtomicNone(); \
     PipeBarrier(); \
     } \
     SyncAll(); \
@@ -92,18 +93,18 @@
 #ifdef ENABLE_LCCL_MIX
 #define LCCL_TYPE_AIC_FUNC(fun) \
     fun(int, _mix_aic); fun(int8_t, _mix_aic); fun(int16_t, _mix_aic); fun(int64_t, _mix_aic); \
-    fun(float, _mix_aic); fun(float16_t, _mix_aic); fun(bfloat16, _mix_aic);
+    fun(float, _mix_aic); fun(float16_t, _mix_aic); fun(bfloat16_t, _mix_aic);
 
 #define LCCL_TYPE_AIV_FUNC(fun) \
     fun(int, _mix_aiv); fun(int8_t, _mix_aiv); fun(int16_t, _mix_aiv); fun(int64_t, _mix_aiv); \
-    fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16, _mix_aiv);
-
+    fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16_t, _mix_aiv);
+#else
 #define LCCL_TYPE_AIC_FUNC(fun) \
     (void)0
 
 #define LCCL_TYPE_AIV_FUNC(fun) \
-    fun(int, _mix_aiv); fun(int8_t, _mix_aiv); fun(int16_t, _mix_aiv); fun(int64_t, _mix_aiv); \
-    fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16, _mix_aiv);
+    fun(int,); fun(int8_t,); fun(int16_t,); fun(int64_t,); \
+    fun(float,); fun(float16_t,); fun(bfloat16_t,);
 #endif
 
 #define LCCL_VADD_910B_TYPE_FUNC(fun) \
--
Gitee


From 53d7d30b433e0751c5180dcacef82b0d3daa835d Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Tue, 19 Aug 2025 20:24:50 +0800
Subject: [PATCH 077/414] 2

---
 comm/lcal/src/ascendc_kernels/op_def.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h
index 16fa3055..9770c5e0 100644
--- a/comm/lcal/src/ascendc_kernels/op_def.h
+++ b/comm/lcal/src/ascendc_kernels/op_def.h
@@ -46,7 +46,7 @@ do { \
     SyncAll(); \
     DataCopyExtParams dataCopyParams(1, sizeof(int32_t), 0, 0, 0); \
     DataCopyPadExtParams padParams; \
-    DataCopyPad(localSet, magicGt[rankSize - 1], dataCopyparams, padParams); \
+    DataCopyPad(localSet, magicGt[rankSize - 1], dataCopyParams, padParams); \
     AscendC::SetFlag(EVENT_ID0); \
     AscendC::WaitFlag(EVENT_ID0); \
     magic = static_cast(localSet.GetValue(0)); \
     PipeBarrier(); \
@@ -88,23
+88,23 @@ do { \ #define LCCL_TYPE_FUNC(fun) \ fun(int,);fun(int8_t,);fun(int16_t,);fun(int64_t,); \ - fun(float,);fun(float16_t,);fun(bfloat16,) + fun(float,);fun(float16_t,);fun(bfloat16_t,) #ifdef ENABLE_LCCL_MIX #define LCCL_TYPE_AIC_FUNC(fun) \ fun(int, _mix_aic); fun(int8_t, _mix_aic); fun(int16_t, _mix_aic); fun(int64_t, _mix_aic); \ - fun(float, _mix_aic); fun(float16_t, _mix_aic); fun(bfloat16_t, _mix_aic); + fun(float, _mix_aic); fun(float16_t, _mix_aic); fun(bfloat16_t, _mix_aic) #define LCCL_TYPE_AIV_FUNC(fun) \ fun(int, _mix_aiv); fun(int8_t, _mix_aiv); fun(int16_t, _mix_aiv); fun(int64_t, _mix_aiv); \ - fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16_t, _mix_aiv); + fun(float, _mix_aiv); fun(float16_t, _mix_aiv); fun(bfloat16_t, _mix_aiv) #else #define LCCL_TYPE_AIC_FUNC(fun) \ (void)0 #define LCCL_TYPE_AIV_FUNC(fun) \ fun(int,); fun(int8_t,); fun(int16_t,); fun(int64_t,); \ - fun(float,); fun(float16_t,); fun(bfloat16_t,); + fun(float,); fun(float16_t,); fun(bfloat16_t,) #endif #define LCCL_VADD_910B_TYPE_FUNC(fun) \ -- Gitee From 6132d5499f82b1ee19028cbe276fd23f2e005f50 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 19 Aug 2025 21:26:06 +0800 Subject: [PATCH 078/414] 3 --- comm/lcal/src/ascendc_kernels/allgather.h | 86 +++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index e69de29b..85f5661e 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef LCCL_ALLGATHER_H +#define LCCL_ALLGATHER_H + +#include "collectives.h" + +using namespace AscendC; + +constexpr int64_t MEM_DMA_UNIT_SIZE = MEM_DMA_UNIT_INT_NUM * sizeof(int64_t); + +constexpr int64_t STEP1 = 1; + +template + +class AllGather : public Collectives { +public: + FORCE_INLINE_AICORE AllGather(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + globalRank = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->rank; + globalRankSize = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->rankSize; + localRankSize = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->localRankSize; + baseOffsetSize = IPC_DATA_OFFSET; + GetBlockDataCount(len, blockNum, offsetFromInput, countToShare); + offsetToShare = offsetFromInput; + + inputGm.SetGlobalBuffer((__gm__ T*)input + offsetFromInput, countToShare); + if (extraFlag & ExtraFlag::RDMA) { + blockNumPerRank = blockNum / localRankSize; + useCoreNumOutput = blockNumPerRank * rankSize; + } + if (blockIdx >= useCoreNumToOutput) { + return; + } + GetBlockDataCount(len, blockNumPerRank, offsetFromShare, countToOutput); + blockrank = blockIdx / blockNumPerRank; + offsetToOutput = blockRank * len + offsetFromShare; + + if ((extraFlag & ExtraFlag::RDMA) == 0) { + outputGm.SetGlobalBuffer((__gm__ T*)output + offsetToOutput, countToOutput); + } + } + FORCE_INLINE_AICORE void Process() + { + + } + +private: + + FORCE_INLINE_AICORE void GetBlockDataCount() + { + + } + + GlobalTensor inputGm; + GlobalTensor outputGm; + GlobalTensor shareGm; + + int64_t baseOffsetSize; + int64_t offsetFromInput; + int64_t offsetToShare; + int64_t countToShare; + int64_t useCoreNumToOutput; + int64_t blockNumPerRank; + int64_t blockRank; + int64_t offsetFromShare; + int64_t offsetToOutput; + int64_t countToOutPut; + int globalRank; + int globalRankSize; + int localRankSize; +}; + +#endif // LCCL_ALLREDUCE_TWO_SHOT_H \ No newline at end of file -- Gitee From 9bbf5ade2fe872cc30b2899c75b2d254a05c5f1c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 08:33:36 +0800 Subject: [PATCH 079/414] 2 --- comm/lcal/src/ascendc_kernels/allgather.h | 52 +++++++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index 85f5661e..fef2e23e 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -54,16 +54,60 @@ public: } FORCE_INLINE_AICORE void Process() { - + if (extraFlag & ExtraFlag::RDMA) { + shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank % localRankSize] + baseOffsetSize) + + len * globalRank + offsetToShare, countToShare); + if (countToShare > 0) { + CpGM2GMPingPong(countToShare * sizeof(T), inputGm, shareGm, COPYONLY); + } + sync.SetInnerFlag(magic, STEP1); + sync.WaitRankInnerFlag(magic, STEP1, blockRank); + if (blockIdx >= useCoreNumToOutput) { + return; + } + outputGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[globalRank % localRankSize] + baseOffsetSize) + + len * (globalRank / localRankSize) * localRankSize + offsetToOutput, countToOutput); + shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[blockRank] + baseOffsetSize) + + len * (globalRank / localRankSize) * localRankSize + offsetToOutput, countToOutput); + if (countToOutput > 0 && blockRank != rank) { + CpGMPingPong2GM(countToOutput * sizeof(T), shareGm, outputGm, COPYONLY); + } + } else { + 
shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + baseOffsetSize) + offsetToShare, countToShare); + if (countToShare > 0) { + CpGM2GM(shareGm, inputGm, countToShare, COPYONLY); + } + sync.SetInnerFlag(magic, STEP1); + sync.WaitRankInnerFlag(magic, STEP1, blockRank); + if (blockIdx >= useCoreNumToOutput) { + return; + } + shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[blockRank] + baseOffsetSize) + offsetToOutput, countToOutput); + if (countToOutput > 0) { + CpGM2GM(countToOutput * sizeof(T), shareGm, outputGm, COPYONLY); + } + } } private: - FORCE_INLINE_AICORE void GetBlockDataCount() + FORCE_INLINE_AICORE void GetBlockDataCount( + const int64_t dataLen, const int64_t useBlockNum, int64_t& blockDataOffset, int64_t& blockDataCount) { - + blockDataCount = CeilDiv(dataLen, useBlockNum); + blockDataCount = blockDataCount > MEM_DMA_UNIT_SIZE / sizeof(T) ? + blockDataCount : MEM_DMA_UNIT_SIZE / sizeof(T); + blockDataOffset = blockIdx % useBlockNum * blockDataCount; + if (blockDataOffset >= dataLen) { + blockDataOffset = dataLen; + blockDataCount = 0; + return; + } + if (blockDataOffset + blockDataCount > dataLen) { + blockDataCount = dataLen - blockDataOffset; + } } - +private: GlobalTensor inputGm; GlobalTensor outputGm; GlobalTensor shareGm; -- Gitee From 728b0245d4d932ec09b4de85dafd097b904d55ca Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 08:39:49 +0800 Subject: [PATCH 080/414] 3 --- comm/lcal/src/ascendc_kernels/allgather.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index fef2e23e..5696efc6 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -39,13 +39,16 @@ public: inputGm.SetGlobalBuffer((__gm__ T*)input + offsetFromInput, countToShare); if (extraFlag & ExtraFlag::RDMA) { blockNumPerRank = blockNum / localRankSize; - useCoreNumOutput = blockNumPerRank * rankSize; + useCoreNumToOutput = blockNumPerRank * localRankSize; + } else { + blockNumPerRank = blockNum / rankSize; + useCoreNumToOutput = blockNumPerRank * localRankSize; } if (blockIdx >= useCoreNumToOutput) { return; } GetBlockDataCount(len, blockNumPerRank, offsetFromShare, countToOutput); - blockrank = blockIdx / blockNumPerRank; + blockRank = blockIdx / blockNumPerRank; offsetToOutput = blockRank * len + offsetFromShare; if ((extraFlag & ExtraFlag::RDMA) == 0) { @@ -70,7 +73,7 @@ public: shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[blockRank] + baseOffsetSize) + len * (globalRank / localRankSize) * localRankSize + offsetToOutput, countToOutput); if (countToOutput > 0 && blockRank != rank) { - CpGMPingPong2GM(countToOutput * sizeof(T), shareGm, outputGm, COPYONLY); + CpGM2GMPingPong(countToOutput * sizeof(T), shareGm, outputGm, COPYONLY); } } else { shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + baseOffsetSize) + offsetToShare, countToShare); @@ -82,9 +85,10 @@ public: if (blockIdx >= useCoreNumToOutput) { return; } - shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[blockRank] + baseOffsetSize) + offsetToOutput, countToOutput); + shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[blockRank] + baseOffsetSize) + offsetFromShare, + countToOutput); if (countToOutput > 0) { - CpGM2GM(countToOutput * sizeof(T), shareGm, outputGm, COPYONLY); + CpGM2GM(outputGm, shareGm, countToOutput, COPYONLY); } } } @@ -119,9 +123,9 @@ private: int64_t useCoreNumToOutput; int64_t blockNumPerRank; int64_t blockRank; - int64_t offsetFromShare; + 
int64_t offsetFromShare;; int64_t offsetToOutput; - int64_t countToOutPut; + int64_t countToOutput; int globalRank; int globalRankSize; int localRankSize; -- Gitee From eaab2c0829673f8f903a7e3a9f598c4c189f2a72 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 08:40:29 +0800 Subject: [PATCH 081/414] 4 --- comm/lcal/src/ascendc_kernels/allgather.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index 5696efc6..19ccdb25 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -39,7 +39,7 @@ public: inputGm.SetGlobalBuffer((__gm__ T*)input + offsetFromInput, countToShare); if (extraFlag & ExtraFlag::RDMA) { blockNumPerRank = blockNum / localRankSize; - useCoreNumToOutput = blockNumPerRank * localRankSize; + useCoreNumToOutput = blockNumPerRank * rankSize; } else { blockNumPerRank = blockNum / rankSize; useCoreNumToOutput = blockNumPerRank * localRankSize; -- Gitee From a8026fee932e5a24605cc167f49a7efbf4ae3437 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 08:40:59 +0800 Subject: [PATCH 082/414] 5 --- comm/lcal/src/ascendc_kernels/allgather.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index 19ccdb25..72df6582 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -39,10 +39,10 @@ public: inputGm.SetGlobalBuffer((__gm__ T*)input + offsetFromInput, countToShare); if (extraFlag & ExtraFlag::RDMA) { blockNumPerRank = blockNum / localRankSize; - useCoreNumToOutput = blockNumPerRank * rankSize; + useCoreNumToOutput = blockNumPerRank * localRankSize; } else { blockNumPerRank = blockNum / rankSize; - useCoreNumToOutput = blockNumPerRank * localRankSize; + useCoreNumToOutput = blockNumPerRank * rankSize; } if (blockIdx >= useCoreNumToOutput) { return; -- Gitee From 7c4d12c7601fdd407023f42f3f671b6e7c99e000 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 08:51:48 +0800 Subject: [PATCH 083/414] begin --- .../src/ascendc_kernels/91093/allreduce_big_data_sio.h | 9 +++++++++ .../91093/allreduce_hierarchy_double_ring.h | 9 +++++++++ .../91093/reduce_scatter_big_data_91093_4step.h | 9 +++++++++ .../91093/reduce_scatter_hierarchy_double_ring.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/allreduce_quant.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/ipc_queue.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/reduce_scatter.h | 9 +++++++++ comm/lcal/src/ascendc_kernels/sync_collectives.h | 9 +++++++++ 10 files changed, 90 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/ipc_queue.h +++ b/comm/lcal/src/ascendc_kernels/ipc_queue.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index e69de29b..9a893c3a 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file -- Gitee From 402e0b59d437428a48ed28deeea54ed90d07c78b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 10:20:27 +0800 Subject: [PATCH 084/414] 6 --- .../91093/allreduce_big_data_sio.h | 236 +++++++++++++++++- 1 file changed, 235 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h index 9a893c3a..1224fb5b 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h @@ -6,4 +6,238 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ + +#ifndef LCCL_ALLREDUCE_BIG_DATA_SIO_H +#define LCCL_ALLREDUCE_BIG_DATA_SIO_H + +#include "collectives.h" +#include "sync_collectives.h" +#include "ipc_queue.h" +using namespace AscendC; + +template +class AllReduceBigDataSio : protected Collectives { + constexpr static int QUEUE_DEPTH = 4; + +public: + FORCE_INLINE_AICORE AllReduceBigDataSio(int rank, int rankSize, uint32_t extraFlag) + : AllReduceQuant(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + + perStepBlockNum = rankSize; + + ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / rankSize / QUEUE_DEPTH * rankSize * QUEUE_DEPTH; + perQueSize = ipcBuffMaxSizeAligned / rankSize; + perQueNum = perQueSize / sizeof(T); + curBlockSize = perQueSize / QUEUE_DEPTH; + curBlockNum = curBlockSize / sizeof(T); + atomOp = op; + for (int i = 0; i < rankSize; ++i) { + rankList[i] = i; + coreIdxList[i] = PING_PONG_SIZE * rankSize + blockIdx % perStepBlockNum; + } + + peerRank = blockIdx % perStepBlockNum; + perRankDataNum = len, rankSize; + + if (rank % RANK_SIZE_TWO == 0) { + adjRank = rank + 1; + } else { + adjRank = rank - 1; + } + + curRankDataNum = perRankDataNum; + if (blockIdx % perStepBlockNum == rankSize - 1) { + curRankDataNum = len - (rankSize - 1) * perRankDataNum; + } + inputBuffOffsetNum = blockIdx % rankSize * perRankDataNum; + + inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDataNum); + + outputBuffOffsetNum = peerRank * perRankDataNum; + + outputGt.SetGlobalBuffer((__gm__ T*)output + outputBuffOffsetNum, curRankDataNum); + inputIpcGtOffsetNum = perQueSize % (blockIdx % perStepBlockNum); + + if (blockIdx / perStepBlockNum == 0) { + ProducerInit(); + } else if (blockIdx / perStepBlockNum == 1) { + ConsumerInit(); + } else { + PullerInit(); + } + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + if (blockIdx / perStepBlockNum == 0) { + Producer(); + } else if (blockIdx / perStepBlockNum == 1) { + Consumer(); + } else { + Puller(); + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } +private: + FORCE_INLINE_AICORE void Producer() + { + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t remain = curRankDataNum; + int count = 0; + while (count < loopCount) { + inputQue.DeQue(rankList, coreIdxList, rankSize); + GlobalTensor outputGm = inputQue.EnQue(); + int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; + CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], outputGm, COPYONLY); + sync.SetOuterFlag(magic, count); + + if (blockIdx % RANK_SIZE_TWO == rank % RANK_SIZE_TWO) { + sync.WaitOuterFlag(magic, count, rank, blockIdx); + sync.WaitOuterFlag(magic, count, adjRank, blockIdx); + GLobalTensor inputGm = sioAtomSrcQue.ReadFront(); + GlobalTensor outputGm = sioAtomDstQue.EnQue(); + CpGM2GMPingPong(copyNum * sizeof(T), inputGm, outputGm, atomOp); + } + sync.SetInnerFlag(magic, count); + remain = remain - curBlockNum; + count = count + 1; + } + } + + FORCE_INLINE_AICORE void Consumer() + { + int64_t atomLoopCount = CeilDiv(pullRankDataNum, curBlockNum); + int64_t atomRemain = pullRankDataNum; + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t remain = curRankDataNum; + int count = 0; + int64_t maxLoopCount = (loopCount < atomLoopCount) ? 
loopCount : atomLoopCount; + while (count < maxLoopCount) { + if (peerRank != rank && rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO && count != atomLoopCount) { + sync.WaitInnerFlag(magic, count, rank, rank); + sync.WaitInnerFlag(magic, count, peerRank, rank); + + GlobalTensor inputGm = srcQue.ReadFront(); + GlobalTensor outputGm = dstQue.EnQue(); + + int64_t atomCopyNum = (atomRemain < curBlockNum) ? atomRemain : curBlockNum; + CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp); + atomRemain = atomRemain - curBlockNum; + } + sync.SetOuterFlag(magic, count); + if (count == loopCount) { + break; + } + if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { + sync.WaitOneRankPartOuterFlag(magic, count, peerRank, rankSize, rankSize); + if (peerRank != rank) { + GlobalTensor inputGm = pullSrcQue.ReadFront(); + GlobalTensor outputGm = pullDstQue.EnQue(); + int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; + CpGM2GMPingPong(copyNum * sizeof(T), inputGm, outputGm, COPYONLY); + } + sync.SetInnerFlag(magic, count); + } + remain = remain - curBlockNum; + count = count + 1; + } + } + FORCE_INLINE_AICORE void Puller() + { + int64_t loopCount = CeilDiv(pullRankDataNum, curBlockNum); + int64_t remain = curRankDataNum; + int count = 0; + while (count < loopCount) { + if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { + sync.WaitInnerFlag(magic, count, rank, blockIdx - perStepBlockNum); + } else { + sync.WaitOuterFlag(magic, count, adjRank, blockIdx - perStepBlockNum); + } + GlobalTensor inputGm = pullSrcQue.ReadFront(); + int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; + CpGM2GMPingPong(copyNum * sizeof(T), inputGm, outputGt[count * curBlockNum], COPYONLY); + sync.SetInnerFlag(magic, count); + remain = remain - curBlockNum; + count = count + 1; + } + } + + FORCE_INLINE_AICORE void ProducerInit() + { + inputQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); + if (blockIdx % RANK_SIZE_TWO == rank % RANK_SIZE_TWO) { + sioAtomSrcQue.Init(&sync, magic, shareAddrs[adjRank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); + sioAtomDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); + } + } + FORCE_INLINE_AICORE void ConsumerInit() + { + srcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + rank * perQueSize, + perQueNum, curBlockNum); + dstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * perQueSize, + perQueNum, curBlockNum); + if (peerRank != rank && rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { + pullSrcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + + peerRank * perQueSize, perQueNum, curBlockNum); + pullDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + peerRank * perQueSize, perQueNum, curBlockNum); + } + + } + + FORCE_INLINE_AICORE void PullerInit() + { + if (rank % RANKSIZE_TWO == peerRank % RANK_SIZE_TWO) { + pullQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); + } else { + pullQue.Init(&sync, magic, shareAddrs[adjRank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); + } + } +private: + GlobalTensor inputGt; + GlobalTensor outputGt; + + int atomOp; + int64_t ipcBuffMaxSizeAligned; + + int64_t perRankDataNum; + int64_t curRankDataNum; + int64_t peerRank; + int64_t peerRank; + int64_t adjRank; + int64_t pullRankDataNum; + int64_t inputBuffOffsetNum; + int64_t 
outputBuffOffsetNum; + int64_t inputIpcGtOffsetNum; + int64_t curBlockSize; + int64_t perStepBlockNum; + int64_t curBlockNum; + int64_t perQueSize; + int64_t perQueNum; + + IpcQueue inputQue; + IpcQueue srcQue; + IpcQueue dstQue; + IpcQueue pullQue; + IpcQueue sioAtomSrcQue; + IpcQueue sioAtomDstQue; + IpcQueue pullSrcQue; + IpcQueue pullDstQue; + + int rankList[LCAL_MAX_RANK_SIZE]; + int coreIdxList[LCAL_MAX_RANK_SIZE]; +}; + +#endif // LCCL_ALLREDUCE_BIG_DATA_H \ No newline at end of file -- Gitee From 826adb7366f319e5e7621d7d8ccd9d823a2b35c9 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 10:28:38 +0800 Subject: [PATCH 085/414] 8 --- .../91093/allreduce_big_data_sio.h | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h index 1224fb5b..6050b056 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h @@ -22,7 +22,7 @@ class AllReduceBigDataSio : protected Collectives { public: FORCE_INLINE_AICORE AllReduceBigDataSio(int rank, int rankSize, uint32_t extraFlag) - : AllReduceQuant(rank, rankSize, extraFlag) {} + : Collectives(rank, rankSize, extraFlag) {} FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); @@ -42,7 +42,7 @@ public: } peerRank = blockIdx % perStepBlockNum; - perRankDataNum = len, rankSize; + perRankDataNum = len / rankSize; if (rank % RANK_SIZE_TWO == 0) { adjRank = rank + 1; @@ -54,14 +54,18 @@ public: if (blockIdx % perStepBlockNum == rankSize - 1) { curRankDataNum = len - (rankSize - 1) * perRankDataNum; } + pullRankDataNum = perRankDataNum; + if (rank == rankSize - 1) { + pullRankDataNum = len - rank * perRankDataNum; + } inputBuffOffsetNum = blockIdx % rankSize * perRankDataNum; - inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDataNum); + inputGt.SetGlobalBuffer((__gm__ T*)input + inputBuffOffsetNum, curRankDataNum); outputBuffOffsetNum = peerRank * perRankDataNum; outputGt.SetGlobalBuffer((__gm__ T*)output + outputBuffOffsetNum, curRankDataNum); - inputIpcGtOffsetNum = perQueSize % (blockIdx % perStepBlockNum); + inputIpcGtOffsetNum = perQueSize * (blockIdx % perStepBlockNum); if (blockIdx / perStepBlockNum == 0) { ProducerInit(); @@ -101,7 +105,7 @@ private: if (blockIdx % RANK_SIZE_TWO == rank % RANK_SIZE_TWO) { sync.WaitOuterFlag(magic, count, rank, blockIdx); sync.WaitOuterFlag(magic, count, adjRank, blockIdx); - GLobalTensor inputGm = sioAtomSrcQue.ReadFront(); + GlobalTensor inputGm = sioAtomSrcQue.ReadFront(); GlobalTensor outputGm = sioAtomDstQue.EnQue(); CpGM2GMPingPong(copyNum * sizeof(T), inputGm, outputGm, atomOp); } @@ -136,7 +140,7 @@ private: break; } if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { - sync.WaitOneRankPartOuterFlag(magic, count, peerRank, rankSize, rankSize); + sync.WaitOneRankPartOuterFlag(magic, count, peerRank, perStepBlockNum, perStepBlockNum); if (peerRank != rank) { GlobalTensor inputGm = pullSrcQue.ReadFront(); GlobalTensor outputGm = pullDstQue.EnQue(); @@ -151,16 +155,16 @@ private: } FORCE_INLINE_AICORE void Puller() { - int64_t loopCount = CeilDiv(pullRankDataNum, curBlockNum); + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); int64_t remain = curRankDataNum; int count = 0; while (count < loopCount) { if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { sync.WaitInnerFlag(magic, count, rank, 
blockIdx - perStepBlockNum); } else { - sync.WaitOuterFlag(magic, count, adjRank, blockIdx - perStepBlockNum); + sync.WaitInnerFlag(magic, count, adjRank, blockIdx - perStepBlockNum); } - GlobalTensor inputGm = pullSrcQue.ReadFront(); + GlobalTensor inputGm = pullQue.ReadFront(); int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; CpGM2GMPingPong(copyNum * sizeof(T), inputGm, outputGt[count * curBlockNum], COPYONLY); sync.SetInnerFlag(magic, count); @@ -197,7 +201,7 @@ private: FORCE_INLINE_AICORE void PullerInit() { - if (rank % RANKSIZE_TWO == peerRank % RANK_SIZE_TWO) { + if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { pullQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, perQueNum, curBlockNum); } else { @@ -206,7 +210,7 @@ private: } } private: - GlobalTensor inputGt; + GlobalTensor inputGt; GlobalTensor outputGt; int atomOp; -- Gitee From c4351aec9ae0bd71e66a301e38a8049029e69d2d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 10:29:15 +0800 Subject: [PATCH 086/414] 9 --- comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h | 1 - 1 file changed, 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h index 6050b056..c5e5b48a 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h @@ -219,7 +219,6 @@ private: int64_t perRankDataNum; int64_t curRankDataNum; int64_t peerRank; - int64_t peerRank; int64_t adjRank; int64_t pullRankDataNum; int64_t inputBuffOffsetNum; -- Gitee From 224df244ee6d68bc1c9c9e4db42aacee94a22cc2 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 10:52:55 +0800 Subject: [PATCH 087/414] 9 --- comm/lcal/src/ascendc_kernels/ipc_queue.h | 116 +++++++++++++++++++++- 1 file changed, 115 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h index 9a893c3a..64972653 100644 --- a/comm/lcal/src/ascendc_kernels/ipc_queue.h +++ b/comm/lcal/src/ascendc_kernels/ipc_queue.h @@ -6,4 +6,118 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ +#ifdef LCCL_IPC_QUEUE_H +#define LCCL_IPC_QUEUE_H +#include "sync_collectives.h" +using namespace AscendC; + +template +class IpcQueue { +public: + FORCE_INLINE_AICORE IpcQueue() {} + + FORCE_INLINE_AICORE void Init(SyncCollectives *sync, int64_t magic, GM_ADDR workSpace, uint64_t bufferNum, + uint64_t blockNum) + { + this->sync = sync; + this->magic = magic; + depth = bufferNum / blockNum; + front = 0; + rear = 0; + count = 0; + this->blockNum = blockNum; + buff.SetGlobalBuffer((__gm__ T*)workSpace, bufferNum); + blockIdx = GetBlockIdx(); + } + + FORCE_INLINE_AICORE bool Full() + { + if ((rear + 1) % depth == front) { + return true; + } + return false; + } + + FORCE_INLINE_AICORE GlobalTensor EnQue() + { + uint64_t rearOld = rear; + rear = (rear + 1) % depth; + return buff[rearOld * blockNum]; + } + + FORCE_INLINE_AICORE void DeQue(int checkRank, int checkBlock = -1) + { + if (!Full()) { + return; + } + if (checkBlock == -1) { + checkBlock = blockIdx; + } + sync->WaitInnerFlag(magic, front, checkRank, checkBlock); + PipeBarrier(); + int64_t val = sync->GetInnerFlag(checkRank, checkBlock) & EVENT_ID_MASK; + count = val + 1; + front = (val + 1) % depth; + } + + FORCE_INLINE_AICORE void DeQue(int *rankList, int checkCount, int checkBlock = -1) + { + if (!Full()) { + return; + } + if (checkBlock == -1) { + checkBlock = blockIdx; + } + int64_t minIndex = LLONG_MAX; + for (int i = 0; i < checkCount; i++) { + sync->WaitInnerFlag(magic, count, rankList[i], checkBlock); + PipeBarrier(); + + int64_t val = sync->GetInnerFlag(rankList[i], checkBlock) & EVENT_ID_MASK; + if (minIndex > val) { + minIndex = val; + } + } + count = minIndex + 1; + front = (minIndex + 1) % depth; + } + FORCE_INLINE_AICORE void DeQue(int *rankList, int *blockIdxList, int checkBlock) + { + if (!Full()) { + return; + } + + int64_t minIndex = LLONG_MAX; + for (int i = 0; i < checkCount; i++) { + sync->WaitInnerFlag(magic, count, rankList[i], blockIdxList[i]); + PipeBarrier(); + + int64_t val = sync->GetInnerFlag(rankList[i], blockIdxList[i]) & EVENT_ID_MASK; + if (minIndex > val) { + minIndex = val; + } + } + count = minIndex + 1; + front = (minIndex + 1) % depth; + } + + FORCE_INLINE_AICORE GlobalTensor ReadFront() + { + uint64_t frontOld = front; + front = (front + 1) % depth; + return buff[frontOld * blockNum]; + } + +private: + int64_t magic; + int64_t depth; + int64_t front; + int64_t rear; + int64_t count; + int64_t blockNum; + GlobalTensor buff; + SyncCollectives *sync; + int64_t blockIdx; +}; + -- Gitee From 9004c824941aefabfab045a69d6bdb290262ebd0 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 10:59:24 +0800 Subject: [PATCH 088/414] 1 --- comm/lcal/src/ascendc_kernels/ipc_queue.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h index 64972653..2543176f 100644 --- a/comm/lcal/src/ascendc_kernels/ipc_queue.h +++ b/comm/lcal/src/ascendc_kernels/ipc_queue.h @@ -7,7 +7,7 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifdef LCCL_IPC_QUEUE_H +#ifndef LCCL_IPC_QUEUE_H #define LCCL_IPC_QUEUE_H #include "sync_collectives.h" using namespace AscendC; @@ -54,7 +54,7 @@ public: if (checkBlock == -1) { checkBlock = blockIdx; } - sync->WaitInnerFlag(magic, front, checkRank, checkBlock); + sync->WaitInnerFlag(magic, count, checkRank, checkBlock); PipeBarrier(); int64_t val = sync->GetInnerFlag(checkRank, checkBlock) & EVENT_ID_MASK; count = val + 1; @@ -82,7 +82,7 @@ public: count = minIndex + 1; front = (minIndex + 1) % depth; } - FORCE_INLINE_AICORE void DeQue(int *rankList, int *blockIdxList, int checkBlock) + FORCE_INLINE_AICORE void DeQue(int *rankList, int *blockIdxList, int checkCount) { if (!Full()) { return; @@ -111,13 +111,13 @@ public: private: int64_t magic; - int64_t depth; - int64_t front; - int64_t rear; - int64_t count; - int64_t blockNum; + uint64_t depth; + uint64_t front; + uint64_t rear; + uint64_t count; + uint64_t blockNum; GlobalTensor buff; SyncCollectives *sync; - int64_t blockIdx; + int blockIdx; }; -- Gitee From c83885a7f2893a912fadf0c182b879ef70dea31b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 14:44:53 +0800 Subject: [PATCH 089/414] Add CMake configuration for LCCL operations and dependencies --- comm/lcal/src/ascendc_kernels/CMakeLists.txt | 178 +++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/comm/lcal/src/ascendc_kernels/CMakeLists.txt index e69de29b..8838603a 100644 --- a/comm/lcal/src/ascendc_kernels/CMakeLists.txt +++ b/comm/lcal/src/ascendc_kernels/CMakeLists.txt @@ -0,0 +1,178 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# +include(../ascendc.cmake) +include_directories(.) 
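
The object libraries defined below compile the same translation units (lccl_op1.cpp, lccl_op2.cpp) several times over: plain, with -DENABLE_LCCL_DUMP, with -DENABLE_LCCL_MIX, and for mix builds once per core architecture, before driving the CCE linker by hand. The sketch that follows is only a guess at how such a translation unit could tie into the op_def.h dispatch macros from patch 075; it is not the repository's real lccl_op1.cpp, the LCCL_ALLREDUCE_DEF macro, the kernel symbol names, and the shareAddrs/magic locals are invented, the AllReduceBigData template arity is assumed, and the code only makes sense under the Ascend C (CCE) toolchain.

// Illustrative sketch only: a kernel TU wired to the op_def.h macros, so one
// source file yields plain or _mix_aic/_mix_aiv entry points depending on
// which -D flags the build passes. All names here are assumptions.
#include "op_def.h"
#include "allreduce_big_data.h"

#define LCCL_ALLREDUCE_DEF(type, suffix)                                    \
    extern "C" __global__ __aicore__ void LcclAllReduce##suffix##_##type(   \
        KERNELS_ARGS_FUN())                                                 \
    {                                                                       \
        using OpT = AllReduceBigData<type, type>; /* arity assumed */       \
        GET_COMM_ARGS;                                                      \
        __gm__ type *shareAddrs[LCAL_MAX_RANK_SIZE]; /* filled below */     \
        int64_t magic = 0;                        /* set by SET_MAGIC */    \
        GET_IPC_MEM_ARGS(type);                                             \
        CLASS_OP_LAUNCH(OpT, type);                                         \
    }

#ifdef ENABLE_LCCL_MIX
LCCL_TYPE_AIC_FUNC(LCCL_ALLREDUCE_DEF);
LCCL_TYPE_AIV_FUNC(LCCL_ALLREDUCE_DEF);
#else
LCCL_TYPE_FUNC(LCCL_ALLREDUCE_DEF);
#endif

Because the aic and aiv objects then export disjoint _mix_aic/_mix_aiv symbol sets from one source file, the custom link step can merge the two objects into a single image, with --allow-multiple-definition absorbing any symbols the two flavours still share.
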
+option(ENABLE_LCCL_910A5_OP "ENABLE lccl_910A5_op library and compile options" OFF) + +file(GLOB_RECURSE KERNEL_FILES *.cpp) +set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE) + +add_library(normal_lccl_op1_tmp OBJECT + lccl_op1.cpp + sync_collectives.h + collectives.h +) + +add_library(normal_lccl_op2_tmp OBJECT + lccl_op2.cpp + sync_collectives.h + collectives.h +) +target_compile_options(normal_lccl_op1_tmp PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIV_ARCH} +) +target_compile_options(normal_lccl_op2_tmp PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIV_ARCH} +) + +add_custom_target(normal_lccl_op1 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" --allow-multiple-definition +) +add_dependencies(normal_lccl_op1 normal_lccl_op1_tmp) +add_custom_target(normal_lccl_op2 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" --allow-multiple-definition +) +add_dependencies(normal_lccl_op2 normal_lccl_op2_tmp) + +add_library(dump_lccl_op1_tmp_aic OBJECT + lccl_op1.cpp + sync_collectives.h + collectives.h +) +add_library(dump_lccl_op1_tmp_aiv OBJECT + lccl_op1.cpp + sync_collectives.h + collectives.h +) +target_compile_options(dump_lccl_op1_tmp_aic PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_DUMP + -DENABLE_LCCL_MIX +) +target_compile_options(dump_lccl_op1_tmp_aiv PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_DUMP + -DENABLE_LCCL_MIX +) + +add_custom_target(dump_lccl_op1 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/dump_lccl_op1_tmp_aic.dir/lccl_op1.cpp.o" + "CMakeFiles/dump_lccl_op1_tmp_aiv.dir/lccl_op1.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/dump_lccl_op1_tmp_aic.dir/lccl_op1.cpp.o" --allow-multiple-definition +) +add_dependencies(dump_lccl_1_op1 dump_lccl_op1_tmp_aic dump_lccl_op1_tmp_aiv) + +add_library(dump_lccl_op2_tmp_aic OBJECT + lccl_op2.cpp + sync_collectives.h + collectives.h +) +add_library(dump_lccl_op2_tmp_aiv OBJECT + lccl_op2.cpp + sync_collectives.h + collectives.h +) +target_compile_options(dump_lccl_op2_tmp_aic PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_DUMP + -DENABLE_LCCL_MIX +) +target_compile_options(dump_lccl_op2_tmp_aiv PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_DUMP + -DENABLE_LCCL_MIX +) + +add_custom_target(dump_lccl_op2 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/dump_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" + "CMakeFiles/dump_lccl_op2_tmp_aiv.dir/lccl_op2.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/dump_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" --allow-multiple-definition +) +add_dependencies(dump_lccl_1_op2 dump_lccl_op2_tmp_aic dump_lccl_op2_tmp_aiv) + +# 不带dump的mix算子 + +add_library(mix_lccl_op1_tmp_aic OBJECT + lccl_op1.cpp + sync_collectives.h + collectives.h +) +add_library(mix_lccl_op1_tmp_aiv OBJECT + lccl_op1.cpp + sync_collectives.h + collectives.h +) +target_compile_options(mix_lccl_op1_tmp_aic PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_MIX +) +target_compile_options(mix_lccl_op1_tmp_aiv PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + --cce-long-call=true 
+ -DENABLE_LCCL_MIX +) + +add_custom_target(mix_lccl_op1 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/mix_lccl_op1_tmp_aic.dir/lccl_op1.cpp.o" + "CMakeFiles/mix_lccl_op1_tmp_aiv.dir/lccl_op1.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/mix_lccl_op1.dir/lccl_op1.cpp.o" --allow-multiple-definition +) +add_dependencies(mix_lccl_1_op1 mix_lccl_op1_tmp_aic mix_lccl_op1_tmp_aiv) + +add_library(mix_lccl_op2_tmp_aic OBJECT + lccl_op2.cpp + sync_collectives.h + collectives.h +) +add_library(mix_lccl_op2_tmp_aiv OBJECT + lccl_op2.cpp + sync_collectives.h + collectives.h +) +target_compile_options(dump_lccl_op2_tmp_aic PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + -DENABLE_LCCL_MIX +) +target_compile_options(mix_lccl_op2_tmp_aiv PRIVATE + ${CCE_COMPILE_OPTION} + --cce-aicore-arch=${AIC_ARCH} + --cce-long-call=true + -DENABLE_LCCL_MIX +) + +add_custom_target(mix_lccl_op2 + COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 + "CMakeFiles/mix_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" + "CMakeFiles/mix_lccl_op2_tmp_aiv.dir/lccl_op2.cpp.o" + ${SANITIZER_DEPEND_LIBS} + --static -o "CMakeFiles/mix_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" --allow-multiple-definition +) +add_dependencies(mix_lccl_1_op2 mix_lccl_op2_tmp_aic mix_lccl_op2_tmp_aiv) \ No newline at end of file -- Gitee From cf2c6c7c822cc2f166716b8021446f1a6b629173 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 15:01:59 +0800 Subject: [PATCH 090/414] 2 --- comm/lcal/src/ascendc_kernels/CMakeLists.txt | 43 +++++++++++++------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/comm/lcal/src/ascendc_kernels/CMakeLists.txt index 8838603a..268492ac 100644 --- a/comm/lcal/src/ascendc_kernels/CMakeLists.txt +++ b/comm/lcal/src/ascendc_kernels/CMakeLists.txt @@ -14,6 +14,7 @@ option(ENABLE_LCCL_910A5_OP "ENABLE lccl_910A5_op library and compile options" O file(GLOB_RECURSE KERNEL_FILES *.cpp) set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE) +# 常规算子 add_library(normal_lccl_op1_tmp OBJECT lccl_op1.cpp sync_collectives.h @@ -25,10 +26,12 @@ add_library(normal_lccl_op2_tmp OBJECT sync_collectives.h collectives.h ) +# 设置编译选项 target_compile_options(normal_lccl_op1_tmp PRIVATE ${CCE_COMPILE_OPTION} --cce-aicore-arch=${AIV_ARCH} ) +# 设置编译选项 target_compile_options(normal_lccl_op2_tmp PRIVATE ${CCE_COMPILE_OPTION} --cce-aicore-arch=${AIV_ARCH} @@ -36,19 +39,20 @@ target_compile_options(normal_lccl_op2_tmp PRIVATE add_custom_target(normal_lccl_op1 COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 - "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" + "CMakeFiles/normal_lccl_op1.dir/lccl_op1.cpp.o" ${SANITIZER_DEPEND_LIBS} --static -o "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" --allow-multiple-definition ) add_dependencies(normal_lccl_op1 normal_lccl_op1_tmp) add_custom_target(normal_lccl_op2 COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 - "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" + "CMakeFiles/normal_lccl_op2.dir/lccl_op2.cpp.o" ${SANITIZER_DEPEND_LIBS} --static -o "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" --allow-multiple-definition ) add_dependencies(normal_lccl_op2 normal_lccl_op2_tmp) +# 带dum的mix算子 add_library(dump_lccl_op1_tmp_aic OBJECT lccl_op1.cpp sync_collectives.h @@ -61,7 +65,8 @@ add_library(dump_lccl_op1_tmp_aiv OBJECT ) target_compile_options(dump_lccl_op1_tmp_aic PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIC_ARCH} + --cce-aicore-arch=${AIV_ARCH} + 
--cce-long-call=true -DENABLE_LCCL_DUMP -DENABLE_LCCL_MIX ) @@ -77,9 +82,9 @@ add_custom_target(dump_lccl_op1 "CMakeFiles/dump_lccl_op1_tmp_aic.dir/lccl_op1.cpp.o" "CMakeFiles/dump_lccl_op1_tmp_aiv.dir/lccl_op1.cpp.o" ${SANITIZER_DEPEND_LIBS} - --static -o "CMakeFiles/dump_lccl_op1_tmp_aic.dir/lccl_op1.cpp.o" --allow-multiple-definition + --static -o "CMakeFiles/dump_lccl_op1.dir/lccl_op1.cpp.o" --allow-multiple-definition ) -add_dependencies(dump_lccl_1_op1 dump_lccl_op1_tmp_aic dump_lccl_op1_tmp_aiv) +add_dependencies(dump_lccl_op1 dump_lccl_op1_tmp_aic dump_lccl_op1_tmp_aiv) add_library(dump_lccl_op2_tmp_aic OBJECT lccl_op2.cpp @@ -99,7 +104,8 @@ target_compile_options(dump_lccl_op2_tmp_aic PRIVATE ) target_compile_options(dump_lccl_op2_tmp_aiv PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIC_ARCH} + --cce-aicore-arch=${AIV_ARCH} + --cce-long-call=true -DENABLE_LCCL_DUMP -DENABLE_LCCL_MIX ) @@ -109,9 +115,9 @@ add_custom_target(dump_lccl_op2 "CMakeFiles/dump_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" "CMakeFiles/dump_lccl_op2_tmp_aiv.dir/lccl_op2.cpp.o" ${SANITIZER_DEPEND_LIBS} - --static -o "CMakeFiles/dump_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" --allow-multiple-definition + --static -o "CMakeFiles/dump_lccl_op2.dir/lccl_op2.cpp.o" --allow-multiple-definition ) -add_dependencies(dump_lccl_1_op2 dump_lccl_op2_tmp_aic dump_lccl_op2_tmp_aiv) +add_dependencies(dump_lccl_op2 dump_lccl_op2_tmp_aic dump_lccl_op2_tmp_aiv) # 不带dump的mix算子 @@ -132,7 +138,7 @@ target_compile_options(mix_lccl_op1_tmp_aic PRIVATE ) target_compile_options(mix_lccl_op1_tmp_aiv PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIC_ARCH} + --cce-aicore-arch=${AIV_ARCH} --cce-long-call=true -DENABLE_LCCL_MIX ) @@ -144,7 +150,7 @@ add_custom_target(mix_lccl_op1 ${SANITIZER_DEPEND_LIBS} --static -o "CMakeFiles/mix_lccl_op1.dir/lccl_op1.cpp.o" --allow-multiple-definition ) -add_dependencies(mix_lccl_1_op1 mix_lccl_op1_tmp_aic mix_lccl_op1_tmp_aiv) +add_dependencies(mix_lccl_op1 mix_lccl_op1_tmp_aic mix_lccl_op1_tmp_aiv) add_library(mix_lccl_op2_tmp_aic OBJECT lccl_op2.cpp @@ -156,14 +162,14 @@ add_library(mix_lccl_op2_tmp_aiv OBJECT sync_collectives.h collectives.h ) -target_compile_options(dump_lccl_op2_tmp_aic PRIVATE +target_compile_options(mix_lccl_op2_tmp_aic PRIVATE ${CCE_COMPILE_OPTION} --cce-aicore-arch=${AIC_ARCH} -DENABLE_LCCL_MIX ) target_compile_options(mix_lccl_op2_tmp_aiv PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIC_ARCH} + --cce-aicore-arch=${AIV_ARCH} --cce-long-call=true -DENABLE_LCCL_MIX ) @@ -173,6 +179,15 @@ add_custom_target(mix_lccl_op2 "CMakeFiles/mix_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" "CMakeFiles/mix_lccl_op2_tmp_aiv.dir/lccl_op2.cpp.o" ${SANITIZER_DEPEND_LIBS} - --static -o "CMakeFiles/mix_lccl_op2_tmp_aic.dir/lccl_op2.cpp.o" --allow-multiple-definition + --static -o "CMakeFiles/mix_lccl_op2.dir/lccl_op2.cpp.o" --allow-multiple-definition ) -add_dependencies(mix_lccl_1_op2 mix_lccl_op2_tmp_aic mix_lccl_op2_tmp_aiv) \ No newline at end of file +add_dependencies(mix_lccl_op2 mix_lccl_op2_tmp_aic mix_lccl_op2_tmp_aiv) + +add_custom_target(lccl_op + COMMAND echo "generating lccl op ... ENABLE_LCCL_910A5_OP=${ENABLE_LCCL_910A5_OP}" + COMMAND rm -f lccl_op.o + COMMAND find CMakeFiles -name "*.0" ! 
-path "*tmp*" | sort | xargs -I {} sed '1s/^/DDDD/' {} >> lccl_op.o + COMMAND truncate -c -s ${LCAL_1OP_BIN_SIZE} lccl_op.o + COMMAND rm -f ${LCAL_CCE_PATH} +) +add_dependencies(lccl_op dump_lccl_op1 dump_lccl_op2 mix_lccl_op1 mix_lccl_op2 normal_lccl_op1 normal_lccl_op2) -- Gitee From bc2f2148703bb633581d6ca69f4b497c7eb50f51 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 15:04:29 +0800 Subject: [PATCH 091/414] 32 --- comm/lcal/src/ascendc_kernels/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/comm/lcal/src/ascendc_kernels/CMakeLists.txt index 268492ac..873cb6a6 100644 --- a/comm/lcal/src/ascendc_kernels/CMakeLists.txt +++ b/comm/lcal/src/ascendc_kernels/CMakeLists.txt @@ -39,20 +39,20 @@ target_compile_options(normal_lccl_op2_tmp PRIVATE add_custom_target(normal_lccl_op1 COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 - "CMakeFiles/normal_lccl_op1.dir/lccl_op1.cpp.o" + "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" ${SANITIZER_DEPEND_LIBS} - --static -o "CMakeFiles/normal_lccl_op1_tmp.dir/lccl_op1.cpp.o" --allow-multiple-definition + --static -o "CMakeFiles/normal_lccl_op1.dir/lccl_op1.cpp.o" --allow-multiple-definition ) add_dependencies(normal_lccl_op1 normal_lccl_op1_tmp) add_custom_target(normal_lccl_op2 COMMAND ${CMAKE_CCE_LINKER} -m aicorelinux -Ttext=0 - "CMakeFiles/normal_lccl_op2.dir/lccl_op2.cpp.o" + "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" ${SANITIZER_DEPEND_LIBS} - --static -o "CMakeFiles/normal_lccl_op2_tmp.dir/lccl_op2.cpp.o" --allow-multiple-definition + --static -o "CMakeFiles/normal_lccl_op2.dir/lccl_op2.cpp.o" --allow-multiple-definition ) add_dependencies(normal_lccl_op2 normal_lccl_op2_tmp) -# 带dum的mix算子 +# 带dump的mix算子 add_library(dump_lccl_op1_tmp_aic OBJECT lccl_op1.cpp sync_collectives.h @@ -65,14 +65,14 @@ add_library(dump_lccl_op1_tmp_aiv OBJECT ) target_compile_options(dump_lccl_op1_tmp_aic PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIV_ARCH} - --cce-long-call=true + --cce-aicore-arch=${AIC_ARCH} -DENABLE_LCCL_DUMP -DENABLE_LCCL_MIX ) target_compile_options(dump_lccl_op1_tmp_aiv PRIVATE ${CCE_COMPILE_OPTION} - --cce-aicore-arch=${AIC_ARCH} + --cce-aicore-arch=${AIV_ARCH} + --cce-long-call=true -DENABLE_LCCL_DUMP -DENABLE_LCCL_MIX ) @@ -186,7 +186,7 @@ add_dependencies(mix_lccl_op2 mix_lccl_op2_tmp_aic mix_lccl_op2_tmp_aiv) add_custom_target(lccl_op COMMAND echo "generating lccl op ... ENABLE_LCCL_910A5_OP=${ENABLE_LCCL_910A5_OP}" COMMAND rm -f lccl_op.o - COMMAND find CMakeFiles -name "*.0" ! -path "*tmp*" | sort | xargs -I {} sed '1s/^/DDDD/' {} >> lccl_op.o + COMMAND find CMakeFiles -name "*.o" ! 
-path "*tmp*" | sort | xargs -I {} sed '1s/^/DDDD/' {} >> lccl_op.o COMMAND truncate -c -s ${LCAL_1OP_BIN_SIZE} lccl_op.o COMMAND rm -f ${LCAL_CCE_PATH} ) -- Gitee From 595947eb22a12f659effaf19acd3bf4f3b99f901 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 16:17:14 +0800 Subject: [PATCH 092/414] 3 --- .../lcal/src/ascendc_kernels/reduce_scatter.h | 86 ++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index 9a893c3a..905b4d76 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -6,4 +6,88 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. - */ \ No newline at end of file + */ + +#ifndef LCCL_REDUCE_SCATTER_H +#define LCCL_REDUCE_SCATTER_H + +#include "sync_collectives.h" +#include "collectives.h" +using namespace AscendC; + +template +class ReduceScatter : protected Collectives { +public: + FORCE_INLINE_AICORE ReduceScatter(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + atomOp = op; + DMANumMax = BLOCK_SIZE / sizeof(T); + corePerRank = blockNum / rankSize; + rankIDOfBlock = blockIdx / corePerRank; + dataDMAPerCore = CeilDiv(len, corePerRank); + blockIdxOfLen = blockIdx % corePerRank; + if (blockIdxOfLen == corePerRank - 1) { + blockDataNum = len - blockIdxOfLen * dataDMAPerCore; + } else { + blockDataNum = dataDMAPerCore; + } + inputOffset = rankIDOfBlock * len + (blockIdx % corePerRank) * dataDMAPerCore; + outputOffset = dataDMAPerCore * (blockIdx % corePerRank); + dstIpcDataOffset = IPC_DATA_OFFSET / sizeof(T) + rankIDOfBlock * len + outputOffset; + srcIpcDataOffset = IPC_DATA_OFFSET / sizeof(T) + rank * len + outputOffset; + srcInputGlobal.SetGlobalBuffer((__gm__ T*)input + inputOffset, blockDataNum); + if ((extraFlag & ExtraFlag::RDMA) == ExtraFlag::RDMA) { + dstOutputGlobal.SetGlobalBuffer((__gm__ T*)shareAddrs[rank] + srcIpcDataOffset, blockDataNum); + } else { + dstOutputGlobal.SetGlobalBuffer((__gm__ T*)output + outputOffset, blockDataNum); + } + dstIPCGlobal.SetGlobalBuffer((__gm__ T*)shareAddrs[rank] + dstIpcDataOffset, blockDataNum); + srcIPCGlobal.SetGlobalBuffer((__gm__ T*)shareAddrs[rankIDOfBlock] + srcIpcDataOffset, blockDataNum); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + CpInputToBuffAndOutput(); + sync.SetInnerFlag(magic, 1); + sync.WaitRankInnerFlag(magic, 1, rank); + sync.WaitRankInnerFlag(magic, 1, rankIDOfBlock, rank * corePerRank + blockIdx % corePerRank); + if (rankIDOfBlock != rank) { + CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockDataNum, atomOp); + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } + + FORCE_INLINE_AICORE void CpInputToBuffAndOutput() + { + Collectives::CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); + if ((extraFlag & ExtraFlag::RDMA) == ExtraFlag::RDMA) { + if ((blockIdx >- rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { + CpGM2GM(dstIPCGlobal, 
srcInputGlobal, blockDataNum, -1); + } + } + } + +protected: + GlobalTensor srcInputGlobal; + GlobalTensor srcIPCGlobal; + GlobalTensor dstIPCGlobal; + GlobalTensor dstOutputGlobal; + int blockIdxOfLen; + int DMANumMax; + int rankIDOfBlock; + int corePerRank; + int inputOffset; + int outputOffset; + int srcIpcDataOffset; + int dstIpcDataOffset; + int dataDMAPerCore; + int blockDataNum; + int atomOp; +}; +#endif // LCCL_REDUCE_SCATTER_H \ No newline at end of file -- Gitee From e5c6f4c5063c57ae9850de767e8ef128786589b5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 16:26:54 +0800 Subject: [PATCH 093/414] 2 --- comm/lcal/src/ascendc_kernels/reduce_scatter.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index 905b4d76..538aec82 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -15,7 +15,7 @@ #include "collectives.h" using namespace AscendC; -template +template class ReduceScatter : protected Collectives { public: FORCE_INLINE_AICORE ReduceScatter(int rank, int rankSize, uint32_t extraFlag) @@ -56,7 +56,7 @@ public: CpInputToBuffAndOutput(); sync.SetInnerFlag(magic, 1); sync.WaitRankInnerFlag(magic, 1, rank); - sync.WaitRankInnerFlag(magic, 1, rankIDOfBlock, rank * corePerRank + blockIdx % corePerRank); + sync.WaitInnerFlag(magic, 1, rankIDOfBlock, rank * corePerRank + blockIdx % corePerRank); if (rankIDOfBlock != rank) { CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockDataNum, atomOp); } @@ -65,10 +65,10 @@ public: FORCE_INLINE_AICORE void CpInputToBuffAndOutput() { - Collectives::CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); - if ((extraFlag & ExtraFlag::RDMA) == ExtraFlag::RDMA) { - if ((blockIdx >- rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { - CpGM2GM(dstIPCGlobal, srcInputGlobal, blockDataNum, -1); + CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); + if ((extraFlag & ExtraFlag::RDMA) != ExtraFlag::RDMA) { + if ((blockIdx >= rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { + CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); } } } -- Gitee From a33af0a7b9b7c8dba8f3bd8bb462ee8a43b3ca06 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 20 Aug 2025 16:27:59 +0800 Subject: [PATCH 094/414] 3 --- comm/lcal/src/ascendc_kernels/reduce_scatter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index 538aec82..264ec36a 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -15,7 +15,7 @@ #include "collectives.h" using namespace AscendC; -template +template class ReduceScatter : protected Collectives { public: FORCE_INLINE_AICORE ReduceScatter(int rank, int rankSize, uint32_t extraFlag) @@ -65,10 +65,10 @@ public: FORCE_INLINE_AICORE void CpInputToBuffAndOutput() { - CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); + CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); if ((extraFlag & ExtraFlag::RDMA) != ExtraFlag::RDMA) { if ((blockIdx >= rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { - CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); + CpGM2GM(dstIPCGlobal, srcInputGlobal, blockDataNum, -1); } } } -- Gitee From caacdd7a51c62cee29d5313e0e8477dedf120c09 Mon Sep 17 00:00:00 2001 
From: LiuHaoyu Date: Wed, 20 Aug 2025 16:29:08 +0800 Subject: [PATCH 095/414] 2 --- comm/lcal/src/ascendc_kernels/reduce_scatter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index 264ec36a..309ffc44 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -65,10 +65,10 @@ public: FORCE_INLINE_AICORE void CpInputToBuffAndOutput() { - CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); + CpGM2GM(dstIPCGlobal, srcInputGlobal, blockDataNum, -1); if ((extraFlag & ExtraFlag::RDMA) != ExtraFlag::RDMA) { if ((blockIdx >= rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { - CpGM2GM(dstIPCGlobal, srcInputGlobal, blockDataNum, -1); + CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); } } } -- Gitee From 56108330c0df580ad557b50d90b2775fb8101cd6 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 21 Aug 2025 16:59:23 +0800 Subject: [PATCH 096/414] 1 --- .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 322 +++++++++++++++++- 1 file changed, 321 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index 9a893c3a..9988aa49 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -6,4 +6,324 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. - */ \ No newline at end of file + */ + +#ifndef LCCL_DATACOPY_GM2GM_H +#define LCCL_DATACOPY_GM2GM_H +#include +#include "comm_args.h" + +using namespace AscendD; +using namespace Lcal; + +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t TILE_NUM = 2; +constexpr int32_t BLOCK_SIZE = UB_SINGLE_DMA_SIZE_MAX / TILE_NUM / BUFFER_NUM; + +template +FORCE_INLINE_AICORE void SetAtomicOpType(int op) +{ + switch (op) { + case ADD: + AscendC::SetAtomicAdd(); + break; + + case MUL: + break; + case MAX: + AscendC::SetAtomicMax(); + break; + case MIN: + AscendC::SetAtomicMin(); + return; + default: + AscendC::SetAtomicNone(); + ; + } +} + +template +FORCE_INLINE_AICORE void CpUB2GM(__gm__ T *gmAddr, __ubuf__ T *ubAddr, uint32_t size) +{ + LocalTensor ubTensor; + GlobalTensofr gmTensor; + DataCopyExtParams.dataCopyParams(1, size, 0, 0, 0); + ubTensor.address_.logicPos = static_cast(TPosition::VECIN); + ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); + gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); + DataCopyPadExtParams(ubTensor, gmTensor, dataCopyExtParams, padParams); +} + +template +FORCE_INLINE_AICORE void CpGM2UB(__ubuf__ T *ubAddr, __gm__ T *gmAddr, uint32_t size) +{ + LocalTensor ubTensor; + GlobalTensofr gmTensor; + DataCopyExtParams.dataCopyParams(1, size, 0, 0, 0); + ubTensor.address_.logicPos = static_cast(TPosition::VECIN); + ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); + gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); + DataCopyPadExtParams padParams; + DataCopyPadExtParams(gmTensor, ubTensor, dataCopyExtParams, padParams); +} + +template +FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint32_t calCount) +{ + LocalTensor dstTensor; + LocalTensor srcTensor; + TBufAddr 
srcAddr, dstAddr; + srcAddr.bufferAddr = reinterpret_cast(src); + dstAddr.bufferAddr = reinterpret_cast(dst); + srcTensor.SetAddr(srcAddr); + dstTensor.SetAddr(dstAddr); + DataCopyPadExtParams(dstTensor, srcTensor, calCount); +} +template +__aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const LocalTensor &srcLocal, + const uint32_t size) +{ + if(size % UB_ALIGN_SIZE == 0) { + DataCopy(dstGlobal, srcLocal, size / sizeof(T)); + } else { + DataCopyExtParams copyParams{1, size, 0, 0, 0}; + DataCopyPad(dstGlobal, srcLocal, copyParams); + } +} + +template +__aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const GlobalTensor &srcLocal, + const uint32_t size) +{ + if(size % UB_ALIGN_SIZE == 0) { + DataCopy(dstGlobal, srcLocal, size / sizeof(T)); + } else { + DataCopyExtParams copyParams{1, size, 0, 0, 0}; + DataCopyPadExtParams padParams{true, 0, 1, 0}; + DataCopyPad(dstGlobal, srcLocal, copyParams); + } +} + +template +class DataCopyGm2Gm { + constexpr static int32_t UB_HEAD_OFFSET = 64; + constexpr static int32_t BLOCK_SIZE_PIECE = BLOCK_SIZE / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; + constexpr static int32_t INPUT_BLOCK_SIZE = std::is_save_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); + constexpr static int32_t OUTPUT_BLOCK_SIZE = std::is_save_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); +public: + FORCE_INLINE_AICORE DataCopyGM2GM() {} + FORCE_INLINE_AICORE void Init(const GlobalTensor& outputGt, const GlobalTensor& inputGt, + const uint32_t calCount, int op) + { + inputGm = inputGt.GetPhyAddr(); + outputGm = outputGt.GetPhyAddr(); + inputUB = (__ubuf__ U*)(UB_HEAD_OFFSET); + if constexpr (std::is_same_v) { + outputUB = (__ubuf__ T*)inputUB; + } else { + outputUB = (__ubuf__ T*)(UB_HEAD_OFFSET + INPUT_BLOCK_SIZE); + } + this->op = op; + dataSizeRemain = calCount * sizeof(T); + } + + FORCE_INLINE_AICORE void Process() + { + SetAtomic(op); + int64_t i = 0; + while (dataSizeRemain >= OUTPUT_BLOCK_SIZE) { + CpGM2UB(inputUB, (__gm__ U*)inputGm + i * INPUT_BLOCK_SIZE / sizeof(U), INPUT_BLOCK_SIZE); + Ascend::SetFlag(EVENT_ID0); + Ascend::WaitFlag(EVENT_ID0); + if constexpr (!std::is_same_v) { + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, INPUT_BLOCK_SIZE / sizeof(U)); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + } + CpUB2GM(outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, + OUTPUT_BLOCK_SIZE); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + i += 1; + dataSizeRemain -= OUTPUT_BLOCK_SIZE; + } + if (dataSizeRemain > 0) { + CpGM2UB(inputUB, (__gm__ U*)inputGm + i * INPUT_BLOCK_SIZE / sizeof(U), + dataSizeRemain / sizeof(T) * sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if constexpr (!std::is_same_v) { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, dataSizeRemain / sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + } + CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, + dataSizeRemain); + PipeBarrier(); + } + UnsetAtomic(op); + } + + FORCE_INLINE_AICORE void Process(T scale, T offset) + { + SetAtomic(op); + int64_t i = 0; + int64_t batchDataNum = OUTPUT_BLOCK_SIZE / sizeof(T); + while (dataSizeRemain > 0) { + int64_t curProcessNum = (dataSizeRemain > OUTPUT_BLOCK_SIZE ? 
OUTPUT_BLOCK_SIZE : dataSizeRemain) / + sizeof(T); + CpGM2UB(inputUB, (__gm__ U*)inputGm + i * batchDataNum, curProcessNum * sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if constexpr (!std::is_same_v) { + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, curProcessNum); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + } + CpUB2GM((__gm__ T*)outputGm + i * batchDataNum, (__ubuf__ T*)outputUB, curProcessNum * sizeof(T)); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + i += 1; + dataSizeRemain -= OUTPUT_BLOCK_SIZE; + } + UnsetAtomic(op); + } + + FORCE_INLINE_AICORE void Process(const GlobalTensor& scaleGT, int64_t scaleCount, T offset) + { + if (scaleCount > UB_SINGLE_DMA_SIZE_MAX / sizeof(T) + sizeof(U) + sizeof(T) / ALIGN_SIZE + ALIGN_SIZE) { + ProcessForBigScale(scaleGT, scaleCount, offset); + } else { + ProcessForSmallScale(scaleGT, scaleCount, offset); + } + } +private: + FORCE_INLINE_AICORE void UnsetAtomic(int op) + { + if (op != -1) { + AscendC::SetAtomicNone(); + } + PipeBarrier(); + } + + FORCE_INLINE_AICORE void SetAtomic(int op) + { + PipeBarrier(); + if (op != -1) { +#ifdef __DAV_C220_VEC__ + SetAtomicOpType(op); +#endif + } + PipeBarrier(); + } + + FORCE_INLINE_AICORE void ProcessForSmallScale(const GLobalTensor& scaleGT, int64_t scaleCount, T offset) + { + SetAtomic(op); + constexpr int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * + ALIGN_SIZE; + const int32_t batchDataNum = blockPieceNum / scaleCount * scaleCount; + const int32_t inputBlockSize = batchDataNum * sizeof(U); + const int32_t outputBlockSize = batchDataNum * sizeof(T); + scaleUB = (__ubuf__ T*)(UB_HEAD_OFFSET); + outputUB = (__ubuf__ T*)(scaleUB + blockPieceNum); + inputUB = (__ubuf__ U*)(outputUB + blockPieceNum); + __gm__ T *scale = const_cast<__gm__ T*>(scaleGT.GetPhyAddr()); + + CpGM2UB((__ubuf__ T*)scaleUB, scale, scaleCount * sizeof(T)); + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); + + int64_t repeatTimes = (dataSizeRemain > outputBlockSize ? outputBlockSize : dataSizeRemain) / sizeof(T) / + scaleCount; + int64_t mulVal = 2; + for (int64_t i = 1; i < repeatTimes; i *= mulVal) { + PipeBarrier(); + CopyUB2UB(scaleUB + i * scaleCount, scaleUB, (repeatTimes > i * mulVal ? i : repeatTimes - i) * scaleCount); + } + int64_t i = 0; + while (dataSizeRemain > 0) { + int64_t curProcessNum = (dataSizeRemain > outputBlockSize ? 
outputBlockSize : dataSizeRemain) / sizeof(T); + CpGM2UB(inputUB, (__gm__ U*)inputGm + i * batchDataNum, curProcessNum * sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, curProcessNum); + PipeBarrier(); + AddsImpl(outputUB, outputUB, offset, curProcessNum); + PipeBarrier(); + MulImpl(outputUB, outputUB, scaleUB, curProcessNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T*)outputGm + i * batchDataNum, (__ubuf__ T*)outputUB, curProcessNum * sizeof(T)); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + i += 1; + dataSizeRemain -= outputBlockSize; + } + UnsetAtomic(op); + } + + FORCE_INLINE_AICORE void ProcessForBigScale(const GLobalTensor& scaleGT, int64_t scaleCount, T offset) + { + SetAtomic(op); + constexpr int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * + ALIGN_SIZE; + const int32_t inputBlockSize = blockPieceNum * sizeof(U); + const int32_t outputBlockSize = batchPieceNum * sizeof(T); + const int32_t dataNumPerBatch = outputBlockSize / sizeof(T); + const int32_t scaleBatchNum = (scaleCount + dataNumPerBatch - 1) / dataNumPerBatch; + + scaleUB = (__ubuf__ T*)(UB_HEAD_OFFSET); + outputUB = (__ubuf__ T*)(scaleUB + outputBlockSize / sizeof(T)); + inputUB = (__ubuf__ U*)(outputUB + outputBlockSize / sizeof(T)); + __gm__ T *scale = const_cast<__gm__ T*>(scaleGT.GetPhyAddr()); + + CpGM2UB((__ubuf__ T*)scaleUB, scale, scaleCount * sizeof(T)); + int64_t i = 0; + int32_t curDataNum = 0; + int32_t processedNum = 0; + while (dataSizeRemain > 0) { + if (i % scaleBatchNum == scaleBatchNum - 1) { + curDataNum = scaleCount - i % scaleBatchNum * dataNumPerBatch + } else { + curDataNum = dataNumPerBatch; + } + CpGM2UB(inputUB, (__gm__ U*)inputGm + processedNum, curDataNum * sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpGM2UB(scaleUB, scale + i % scaleBatchNum * dataNumPerBatch, curDataNum * sizeof(U)); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, curDataNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AddsImpl(outputUB, outputUB, offset, curProcessNum); + PipeBarrier(); + MulImpl(outputUB, outputUB, scaleUB, curProcessNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T*)outputGm + processedNum, (__ubuf__ T*)outputUB, curProcessNum * sizeof(T)); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + i += 1; + dataSizeRemain -= curDataNum * sizeof(T); + processedNum += curDataNum; + } + UnsetAtomic(op); + } +private: + int64_t dataSizeRemain = 0; + __ubuf__ U* inputUB = nullptr; + __ubuf__ T* outputUB = nullptr; + __ubuf__ T* scaleUB = nullptr; + const __gm__ U* inputGm = nullptr; + const __gm__ T* outputGm = nullptr; + int op = 0; +}; +#endif // LCCL_DATACOPY_GM2GM_H + + -- Gitee From 324181e591507366f717afff8e675aa75353d539 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 21 Aug 2025 17:15:45 +0800 Subject: [PATCH 097/414] 2 --- .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 88 ++++++++++--------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index 9988aa49..2a5be85b 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -13,7 +13,7 @@ #include #include "comm_args.h" -using namespace AscendD; +using namespace AscendC; using namespace Lcal; 
constexpr int32_t BUFFER_NUM = 1; @@ -35,7 +35,7 @@ FORCE_INLINE_AICORE void SetAtomicOpType(int op) break; case MIN: AscendC::SetAtomicMin(); - return; + break; default: AscendC::SetAtomicNone(); ; @@ -46,38 +46,39 @@ template FORCE_INLINE_AICORE void CpUB2GM(__gm__ T *gmAddr, __ubuf__ T *ubAddr, uint32_t size) { LocalTensor ubTensor; - GlobalTensofr gmTensor; - DataCopyExtParams.dataCopyParams(1, size, 0, 0, 0); + GlobalTensor gmTensor; + DataCopyExtParams dataCopyParams(1, size, 0, 0, 0); ubTensor.address_.logicPos = static_cast(TPosition::VECIN); ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); - DataCopyPadExtParams(ubTensor, gmTensor, dataCopyExtParams, padParams); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); } template FORCE_INLINE_AICORE void CpGM2UB(__ubuf__ T *ubAddr, __gm__ T *gmAddr, uint32_t size) { LocalTensor ubTensor; - GlobalTensofr gmTensor; - DataCopyExtParams.dataCopyParams(1, size, 0, 0, 0); + GlobalTensor gmTensor; + DataCopyExtParams dataCopyParams(1, size, 0, 0, 0); ubTensor.address_.logicPos = static_cast(TPosition::VECIN); ubTensor.address_.bufferAddr = reinterpret_cast(ubAddr); gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(gmAddr)); - DataCopyPadExtParams padParams; - DataCopyPadExtParams(gmTensor, ubTensor, dataCopyExtParams, padParams); + DataCopyPadExtParams padParams; + DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); } + template FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint32_t calCount) { - LocalTensor dstTensor; - LocalTensor srcTensor; - TBufAddr srcAddr, dstAddr; + LocalTensor srcTensor; + LocalTensor dstTensor; + TBuffAddr srcAddr, dstAddr; srcAddr.bufferAddr = reinterpret_cast(src); dstAddr.bufferAddr = reinterpret_cast(dst); srcTensor.SetAddr(srcAddr); dstTensor.SetAddr(dstAddr); - DataCopyPadExtParams(dstTensor, srcTensor, calCount); + DataCopyPad(dstTensor, srcTensor, calCount); } template __aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const LocalTensor &srcLocal, @@ -92,24 +93,24 @@ __aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const Loca } template -__aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const GlobalTensor &srcLocal, +__aicore__ inline void DataCopyWrap(const LocalTensor &dstLobal, const GlobalTensor &srcGlobal, const uint32_t size) { if(size % UB_ALIGN_SIZE == 0) { - DataCopy(dstGlobal, srcLocal, size / sizeof(T)); + DataCopy(dstLocal, srcGlobal, size / sizeof(T)); } else { DataCopyExtParams copyParams{1, size, 0, 0, 0}; DataCopyPadExtParams padParams{true, 0, 1, 0}; - DataCopyPad(dstGlobal, srcLocal, copyParams); + DataCopyPad(dstLocal, srcGlobal, copyParams, padParams); } } template -class DataCopyGm2Gm { +class DataCopyGM2GM { constexpr static int32_t UB_HEAD_OFFSET = 64; constexpr static int32_t BLOCK_SIZE_PIECE = BLOCK_SIZE / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; - constexpr static int32_t INPUT_BLOCK_SIZE = std::is_save_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); - constexpr static int32_t OUTPUT_BLOCK_SIZE = std::is_save_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); + constexpr static int32_t INPUT_BLOCK_SIZE = std::is_same_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); + constexpr static int32_t OUTPUT_BLOCK_SIZE = std::is_same_v ? 
BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); public: FORCE_INLINE_AICORE DataCopyGM2GM() {} FORCE_INLINE_AICORE void Init(const GlobalTensor& outputGt, const GlobalTensor& inputGt, @@ -133,16 +134,16 @@ public: int64_t i = 0; while (dataSizeRemain >= OUTPUT_BLOCK_SIZE) { CpGM2UB(inputUB, (__gm__ U*)inputGm + i * INPUT_BLOCK_SIZE / sizeof(U), INPUT_BLOCK_SIZE); - Ascend::SetFlag(EVENT_ID0); - Ascend::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); if constexpr (!std::is_same_v) { - AscendC::SetFlag(EVENT_ID1); - AscendC::WaitFlag(EVENT_ID1); - CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, INPUT_BLOCK_SIZE / sizeof(U)); - AscendC::SetFlag(EVENT_ID1); - AscendC::WaitFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, INPUT_BLOCK_SIZE / sizeof(U)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); } - CpUB2GM(outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, + CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, OUTPUT_BLOCK_SIZE); AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); @@ -152,12 +153,12 @@ public: if (dataSizeRemain > 0) { CpGM2UB(inputUB, (__gm__ U*)inputGm + i * INPUT_BLOCK_SIZE / sizeof(U), dataSizeRemain / sizeof(T) * sizeof(U)); - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); if constexpr (!std::is_same_v) { AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, dataSizeRemain / sizeof(U)); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, dataSizeRemain / sizeof(U)); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } @@ -180,11 +181,15 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); if constexpr (!std::is_same_v) { - AscendC::SetFlag(EVENT_ID1); - AscendC::WaitFlag(EVENT_ID1); - CestImpl(outputUB, inputUB, RoundMode::CAST_NONE, curProcessNum); - AscendC::SetFlag(EVENT_ID1); - AscendC::WaitFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, curProcessNum); + PipeBarrier(); + AddsImpl(outputUB, outputUB, offset, curPorcessNum); + PipeBarrier(); + MulsImpl(outputUB, outputUB, scale, curPorcessNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); } CpUB2GM((__gm__ T*)outputGm + i * batchDataNum, (__ubuf__ T*)outputUB, curProcessNum * sizeof(T)); AscendC::SetFlag(EVENT_ID1); @@ -197,7 +202,7 @@ public: FORCE_INLINE_AICORE void Process(const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { - if (scaleCount > UB_SINGLE_DMA_SIZE_MAX / sizeof(T) + sizeof(U) + sizeof(T) / ALIGN_SIZE + ALIGN_SIZE) { + if (scaleCount > UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(U) + sizeof(T)) / ALIGN_SIZE * ALIGN_SIZE) { ProcessForBigScale(scaleGT, scaleCount, offset); } else { ProcessForSmallScale(scaleGT, scaleCount, offset); @@ -223,7 +228,7 @@ private: PipeBarrier(); } - FORCE_INLINE_AICORE void ProcessForSmallScale(const GLobalTensor& scaleGT, int64_t scaleCount, T offset) + FORCE_INLINE_AICORE void ProcessForSmallScale(const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { SetAtomic(op); constexpr int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * @@ -269,13 +274,13 @@ private: UnsetAtomic(op); } - FORCE_INLINE_AICORE void ProcessForBigScale(const GLobalTensor& scaleGT, int64_t 
scaleCount, T offset) + FORCE_INLINE_AICORE void ProcessForBigScale(const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { SetAtomic(op); - constexpr int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * + const int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; const int32_t inputBlockSize = blockPieceNum * sizeof(U); - const int32_t outputBlockSize = batchPieceNum * sizeof(T); + const int32_t outputBlockSize = blockPieceNum * sizeof(T); const int32_t dataNumPerBatch = outputBlockSize / sizeof(T); const int32_t scaleBatchNum = (scaleCount + dataNumPerBatch - 1) / dataNumPerBatch; @@ -284,7 +289,6 @@ private: inputUB = (__ubuf__ U*)(outputUB + outputBlockSize / sizeof(T)); __gm__ T *scale = const_cast<__gm__ T*>(scaleGT.GetPhyAddr()); - CpGM2UB((__ubuf__ T*)scaleUB, scale, scaleCount * sizeof(T)); int64_t i = 0; int32_t curDataNum = 0; int32_t processedNum = 0; @@ -322,7 +326,7 @@ private: __ubuf__ T* scaleUB = nullptr; const __gm__ U* inputGm = nullptr; const __gm__ T* outputGm = nullptr; - int op = 0; + int op; }; #endif // LCCL_DATACOPY_GM2GM_H -- Gitee From 192d1d46cfd4eebe01f19b97be60e2cf1c5c61b2 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 21 Aug 2025 17:18:19 +0800 Subject: [PATCH 098/414] 3 --- .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index 2a5be85b..a5d3a34c 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -78,7 +78,7 @@ FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint3 dstAddr.bufferAddr = reinterpret_cast(dst); srcTensor.SetAddr(srcAddr); dstTensor.SetAddr(dstAddr); - DataCopyPad(dstTensor, srcTensor, calCount); + DataCopy(dstTensor, srcTensor, calCount); } template __aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const LocalTensor &srcLocal, @@ -93,7 +93,7 @@ __aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const Loca } template -__aicore__ inline void DataCopyWrap(const LocalTensor &dstLobal, const GlobalTensor &srcGlobal, +__aicore__ inline void DataCopyWrap(const LocalTensor &dstLocal, const GlobalTensor &srcGlobal, const uint32_t size) { if(size % UB_ALIGN_SIZE == 0) { @@ -110,7 +110,7 @@ class DataCopyGM2GM { constexpr static int32_t UB_HEAD_OFFSET = 64; constexpr static int32_t BLOCK_SIZE_PIECE = BLOCK_SIZE / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; constexpr static int32_t INPUT_BLOCK_SIZE = std::is_same_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); - constexpr static int32_t OUTPUT_BLOCK_SIZE = std::is_same_v ? BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(U); + constexpr static int32_t OUTPUT_BLOCK_SIZE = std::is_same_v ? 
BLOCK_SIZE : BLOCK_SIZE_PIECE * sizeof(T); public: FORCE_INLINE_AICORE DataCopyGM2GM() {} FORCE_INLINE_AICORE void Init(const GlobalTensor& outputGt, const GlobalTensor& inputGt, @@ -158,7 +158,7 @@ public: if constexpr (!std::is_same_v) { AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, dataSizeRemain / sizeof(U)); + CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, dataSizeRemain / sizeof(T)); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } @@ -185,9 +185,9 @@ public: AscendC::WaitFlag(EVENT_ID0); CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, curProcessNum); PipeBarrier(); - AddsImpl(outputUB, outputUB, offset, curPorcessNum); + AddsImpl(outputUB, outputUB, offset, curProcessNum); PipeBarrier(); - MulsImpl(outputUB, outputUB, scale, curPorcessNum); + MulsImpl(outputUB, outputUB, scale, curProcessNum); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } @@ -277,7 +277,7 @@ private: FORCE_INLINE_AICORE void ProcessForBigScale(const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { SetAtomic(op); - const int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(T) + sizeof(U)) / ALIGN_SIZE * + const int32_t blockPieceNum = UB_SINGLE_DMA_SIZE_MAX / (sizeof(T) + sizeof(U) + sizeof(T)) / ALIGN_SIZE * ALIGN_SIZE; const int32_t inputBlockSize = blockPieceNum * sizeof(U); const int32_t outputBlockSize = blockPieceNum * sizeof(T); @@ -301,16 +301,16 @@ private: CpGM2UB(inputUB, (__gm__ U*)inputGm + processedNum, curDataNum * sizeof(U)); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpGM2UB(scaleUB, scale + i % scaleBatchNum * dataNumPerBatch, curDataNum * sizeof(U)); + CpGM2UB(scaleUB, scale + i % scaleBatchNum * dataNumPerBatch, curDataNum * sizeof(T)); CastImpl(outputUB, inputUB, RoundMode::CAST_NONE, curDataNum); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - AddsImpl(outputUB, outputUB, offset, curProcessNum); + AddsImpl(outputUB, outputUB, offset, curDataNum); PipeBarrier(); - MulImpl(outputUB, outputUB, scaleUB, curProcessNum); + MulImpl(outputUB, outputUB, scaleUB, curDataNum); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpUB2GM((__gm__ T*)outputGm + processedNum, (__ubuf__ T*)outputUB, curProcessNum * sizeof(T)); + CpUB2GM((__gm__ T*)outputGm + processedNum, (__ubuf__ T*)outputUB, curDataNum * sizeof(T)); AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); i += 1; -- Gitee From 55b6609cb8a6b68400f6ffaf980e41f953103697 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 21 Aug 2025 17:18:45 +0800 Subject: [PATCH 099/414] 4 --- comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index a5d3a34c..c09cce2a 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -294,7 +294,7 @@ private: int32_t processedNum = 0; while (dataSizeRemain > 0) { if (i % scaleBatchNum == scaleBatchNum - 1) { - curDataNum = scaleCount - i % scaleBatchNum * dataNumPerBatch + curDataNum = scaleCount - i % scaleBatchNum * dataNumPerBatch; } else { curDataNum = dataNumPerBatch; } -- Gitee From a45246a874a700fde423018355a850ab19fcfdca Mon Sep 17 00:00:00 2001 From: Denver Date: Fri, 22 Aug 2025 11:40:06 +0800 Subject: [PATCH 100/414] test --- comm/lcal/src/test.cpp | 1 + 1 file changed, 1 insertion(+) create mode 100644 comm/lcal/src/test.cpp diff --git 
a/comm/lcal/src/test.cpp b/comm/lcal/src/test.cpp
new file mode 100644
index 00000000..75fa785d
--- /dev/null
+++ b/comm/lcal/src/test.cpp
@@ -0,0 +1 @@
+// test
\ No newline at end of file
-- Gitee

From 5c6968ee34c805aff3a5ae4178a79675aa5c2aac Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Fri, 22 Aug 2025 11:47:14 +0800
Subject: [PATCH 101/414] 3

---
 .../ascendc_kernels/datacopy_gm2gm_delay.h | 125 +++++++++++++++++-
 1 file changed, 124 insertions(+), 1 deletion(-)

diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
index 9a893c3a..48ac5e04 100644
--- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
+++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
@@ -6,4 +6,127 @@
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
- */
\ No newline at end of file
+ */
+#ifndef LCCL_DATACOPY_GM2GM_DELAY_H
+#define LCCL_DATACOPY_GM2GM_DELAY_H
+#include "datacopy_gm2gm.h"
+
+using namespace AscendC;
+using namespace Lcal;
+
+template
+class DataCopyGM2GMDelay {
+    constexpr static int64_t THREE_NUM = 3;
+    constexpr static int64_t FOUR_NUM = 4;
+    constexpr static int64_t WORK_OFFSET = 8192;
+    constexpr static int64_t WORK_BLOCK_NUM = WORK_OFFSET / sizeof(T);
+    constexpr static int64_t UB_HEAD_OFFSET = WORK_OFFSET * 2;
+    constexpr static int64_t SCALE_SIZE = 32;
+    constexpr static int64_t SCALE_NUM = SCALE_SIZE / sizeof(T);
+    constexpr static int64_t SINGLE_SCALE_SIZE = 2;
+    constexpr static int64_t BLOCK_NUM = (UB_SINGLE_DMA_SIZE_MAX - WORK_OFFSET * 2 - SCALE_SIZE * 4) / 2 /
+        (sizeof(U) + sizeof(T)) / ALIGN_SIZE * ALIGN_SIZE;
+    constexpr static int64_t IN_BLOCKSIZE = BLOCK_NUM * sizeof(U);
+
+public:
+    FORCE_INLINE_AICORE DataCopyGM2GMDelay() {}
+
+    FORCE_INLINE_AICORE void Init(GlobalTensor& outputGt, GlobalTensor (&inputGt)[8],
+        GlobalTensor (&inputScaleGt)[8], const uint32_t calNum, int rankCount, GlobalTensor& outScaleGt,
+        TBuf tbuf)
+    {
+        for (int index = 0; index < rankCount; index++) {
+            this->inputGt[index] = inputGt[index];
+            this->inputScaleGt[index] = inputScaleGt[index];
+        }
+        this->outputGt = outputGt;
+        this->outScaleGt = outScaleGt;
+        inTensor[0] = tbuf.GetWithOffset(BLOCK_NUM, 0);
+        inTensor[1] = tbuf.GetWithOffset(BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * THREE_NUM);
+        singleScaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE);
+        singleScaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM +
+            IN_BLOCKSIZE * FOUR_NUM);
+        singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE);
+        singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM +
+            IN_BLOCKSIZE * FOUR_NUM);
+        workUBTensor[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM);
+        workUBTensor[1] = tbuf.GetWithOffset(WORK_BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * FOUR_NUM +
+            IN_BLOCKSIZE * FOUR_NUM);
+        outputUBTensor[0] = tbuf.GetWithOffset(BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM + WORK_OFFSET);
+        outputUBTensor[1] = tbuf.GetWithOffset(BLOCK_NUM, WORK_OFFSET * HALF_NUM + SCALE_SIZE * FOUR_NUM +
+            IN_BLOCKSIZE * FOUR_NUM);
+        this->rankCount = rankCount;
+        totalDataSize = calNum * sizeof(U);
+        this->calNum = calNum;
+        this->rankId = rankId;
+    }
+
+    FORCE_INLINE_AICORE void PreProcess()
+    {
+        for (int index = 0; index < rankCount; index++) {
+            DataCopyWrap(scaleUUBTensor[0][index * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE);
+            pipe_barrier(PIPE_ALL);
+            DataCopyWrap(scaleUUBTensor[1][index * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE);
+            pipe_barrier(PIPE_ALL);
+        }
+        for (int index = 0; index < rankCount; index++) {
+            scaleUBTensor[0][index].SetValue(0, scaleUBTensor[0].GetValue(index * SCALE_SIZE / sizeof(T)));
+            pipe_barrier(PIPE_ALL);
+            scaleUBTensor[1][index].SetValue(0, scaleUBTensor[1].GetValue(index * SCALE_SIZE / sizeof(T)));
+            pipe_barrier(PIPE_ALL);
+            outputUBTensor[0][index].SetValue(0, 1);
+            AscendC::PipeBarrier();
+        }
+        Div(scaleUBTensor[1], outputUBTensor[0], scaleUBTensor[1], rankCount);
+        AscendC::PipeBarrier();
+        ReduceMin(singleScaleUBTensor[0], singleScaleUUBTensor[0],
+            workUBTensor[1][WORK_BLOCK_NUM / HALF_NUM], rankCount, false);
+        pipe_barrier(PIPE_ALL);
+        DataCopyWrap(outScaleGt, singleScaleUUBTensor[0], sizeof(T));
+        AscendC::PipeBarrier();
+    }
+
+
+    FORCE_INLINE_AICORE void LoopUncastAndMul(int idx, int index, event_t eventId)
+    {
+        PipeBarrier();
+        T scalarValue = scaleUBTensor[1].GetValue(index);
+        PipeBarrier__ubuf__ U* inputUB = nullptr;
+        __ubuf__ T* outputUB = nullptr;
+
+    }
+private:
+    template
+    FORCE_INLINE_AICORE T1 CeilDiv(T1 a, T2 b)
+    {
+        if (b == 0) {
+            return 0;
+        }
+        return (a + b - 1) / b;
+    }
+
+private:
+    int64_t totalDataSize = 0;
+    int rankCount;
+    int perRankNumRemain;
+    int calNum;
+    int rankId;
+    int numLayer;
+
+    LocalTensor inTensor[2];
+    LocalTensor singleScaleUUBTensor[2];
+    LocalTensor singleScaleUBTensor[2];
+    LocalTensor scaleUUBTensor[2];
+    LocalTensor scaleUBTensor[2];
+    LocalTensor workUBTensor[2];
+    LocalTensor outputUBTensor[2];
+
+    GlobalTensor outputGt;
+    GlobalTensor inputGt[8];
+    GlobalTensor inputScaleGt[8];
+    GlobalTensor outScaleGt;
+};
+
+#endif // LCCL_DATACOPY_GM2GM_DELAY_H
+
-- Gitee

From c8b8e9b5e6dd89865c8fbc9ee497ab419d9133ec Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Fri, 22 Aug 2025 16:02:23 +0800
Subject: [PATCH 102/414] 2

---
 .../ascendc_kernels/datacopy_gm2gm_delay.h | 116 +++++++++++++++++-
 1 file changed, 114 insertions(+), 2 deletions(-)

diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
index 48ac5e04..bbd96217 100644
--- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
+++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h
@@ -92,9 +92,121 @@
     {
         PipeBarrier();
         T scalarValue = scaleUBTensor[1].GetValue(index);
-        PipeBarrier__ubuf__ U* inputUB = nullptr;
-        __ubuf__ T* outputUB = nullptr;
+        PipeBarrier();
+        int32_t perRankNum;
+        PipeBarrier();
+        for (int j = 0; perRankNumRemain > 0; j++) {
+            PipeBarrier();
+            perRankNum - perRankNumRemain >= WORK_BLOCK_NUM ? WORK_BLOCKNUM : perRankNumRemain;
+            PipeBarrier();
+            int32_t perRankNum = CeilDiv(perRankNumRemain, rankCount);
+            perRankNumRemain -= perRankNum;
+            PipeBarrier();
+            AscendC::SetFlag(eventId);
+            AscendC::WaitFlag(eventId);
+            PipeBarrier();
+            Cast((idx & 1) ? workUBTensor[0] : workUBTensor[1], (idx & 1) ? inTensor[0][j *
+                WORK_BLOCK_NUM] : inTensor[1][j * WORK_BLOCK_NUM], RoundMode::CAST_NONE, perRankNum);
+            PipeBarrier();
+            if (index == 0) {
+                Muls((idx & 1) ? outputUBTensor[0][j * WORK_BLOCK_NUM] : outputUBTensor[1][j *
+                    WORK_BLOCK_NUM], (idx & 1) ? workUBTensor[0] : workUBTensor[1], scalarValue, perRankNum);
+            } else {
+                Axpy((idx & 1) ?
outputUBTensor[0][j * WORK_BLOCK_NUM] : outputUBTensor[1][j * + WORK_BLOCK_NUM], (idx & 1) ? workUBTensor[0] : workUBTensor[1], scalarValue, perRankNum); + } + PipeBarrier(); + } + } + FORCE_INLINE_AICORE void Mte3Process(int idx, int index, int calCount, event_t eventId) + { + if (index == (rankCount - 1)) { + if constexpr (std::is_same_v) { + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? + outputUBTensor[0] : outputUBTensor[1], calCount * sizeof(V)); + } + if constexpr (std::is_same_v) { + PipeBarrier(); + T scaleValue = singleScaleUBTensor[0].GetValue(0); + PipeBarrier(); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + PipeBarrier(); + Muls((idx & 1) ? outputUBTensor[0] : outputUBTensor[1], (idx & 1) ? + outputUBTensor[0] : outputUBTensor[1], scaleValue, calCount); + PipeBarrier(); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? + outputUBTensor[0] : outputUBTensor[1], calCount * sizeof(V)); + } + } + } + + FORCE_INLINE_AICORE int GetSize(int idx, int numOfPiece) + { + int size; + if (idx < (numOfPiece - 1)) { + size = IN_BLOCKSIZE; + } else if (idx == (numOfPiece - 1)) { + size = totalDataSize - (numOfPiece - 1) * IN_BLOCKSIZE; + } else { + size = 0; + } + return size; + } + + FORCE_INLINE_AICORE void Process() + { + PreProcess(); + int numofPiece = CeilDiv(calNum, BLOCK_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; i < numOfPiece; i += HALF_NUM) { + for (int index = 0; index < rankCount; index++) { + for (int k = 0; k < HALF_NUM; k++) { + int idx = i + k; + int size = GetSize(idx, numOfPiece); + int32_t calCount = size / sizeof(U); + perRankNumRemain = calCount; + event_t eventId = (idx & 1) ? EVENT_ID0 : EVENT_ID1; + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(eventId1); + AscendC::WaitFlag(eventId0); + DataCopyWrap((idx & 1) ? inTensor[0] : inTensor[1], inputGt[index][BLOCK_NUM * idx], size); + AscendC::SetFlag(eventId1); + AscendC::WaitFlag(eventId1); + AscendC::SetFlag(eventId1); + AscendC::WaitFlag(eventId1); + LoopUncastAndMul(idx, index, eventId); + Mte3Process(idx, index, eventId); + AscendC::SetFlag(eventId1); + AscendC::SetFlag(eventId1); + AscendC::SetFlag(eventId1); + } + } + } + + + AscendC::WaitFlag(eventId0); + AscendC::WaitFlag(eventId1); + AscendC::WaitFlag(eventId0); + AscendC::WaitFlag(eventId1); + AscendC::WaitFlag(eventId0); + AscendC::WaitFlag(eventId1); } private: template -- Gitee From 52513cc8a482f0996fd790dc35cc200df0e32141 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:04:13 +0800 Subject: [PATCH 103/414] 3 --- .../ascendc_kernels/datacopy_gm2gm_delay.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index bbd96217..4b950c08 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -183,30 +183,30 @@ public: event_t eventId = (idx & 1) ? 
EVENT_ID0 : EVENT_ID1; AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - AscendC::WaitFlag(EVENT_ID0); - AscendC::WaitFlag(eventId1); - AscendC::WaitFlag(eventId0); + AscendC::WaitFlag(eventId); + AscendC::WaitFlag(eventId); + AscendC::WaitFlag(eventId); DataCopyWrap((idx & 1) ? inTensor[0] : inTensor[1], inputGt[index][BLOCK_NUM * idx], size); - AscendC::SetFlag(eventId1); - AscendC::WaitFlag(eventId1); - AscendC::SetFlag(eventId1); - AscendC::WaitFlag(eventId1); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); LoopUncastAndMul(idx, index, eventId); Mte3Process(idx, index, eventId); - AscendC::SetFlag(eventId1); - AscendC::SetFlag(eventId1); - AscendC::SetFlag(eventId1); + AscendC::SetFlag(eventId); + AscendC::SetFlag(eventId); + AscendC::SetFlag(eventId); } } } - AscendC::WaitFlag(eventId0); - AscendC::WaitFlag(eventId1); - AscendC::WaitFlag(eventId0); - AscendC::WaitFlag(eventId1); - AscendC::WaitFlag(eventId0); - AscendC::WaitFlag(eventId1); + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); } private: template -- Gitee From 85628f36494777ff7f5726fa873a204373a0fc95 Mon Sep 17 00:00:00 2001 From: Denver Date: Fri, 22 Aug 2025 16:05:39 +0800 Subject: [PATCH 104/414] add lcoc.cpp --- src/lcoc.cpp | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 src/lcoc.cpp diff --git a/src/lcoc.cpp b/src/lcoc.cpp new file mode 100644 index 00000000..7f1e2e5b --- /dev/null +++ b/src/lcoc.cpp @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lcal_internal.h" +#include "mki/utils/log/log.h" +#include "profiling/report_timing.h" +#include "runtime/rt_ffts.h" + +using namespace std; +using namespace chrono; +namespace Lcal { +bool CheckLcalComm(const LcalComm *lcalComm) +{ + if (lcalComm == nullptr) { + MKI_LOG(ERROR) << "The lcalComm is nullptr!"; + return false; + } + + auto rank = lcalComm->GetRank(); + auto rankSize = lcalComm->GetRankSize(); + auto coreNum = lcalComm->GetPhysicalInfo().coreNum; + std::vector> paramCheckList = { + {"rankSize", rankSize, PARAM_CHECK_MIN_VALUE_ONE, LCAL_MAX_RANK_SIZE}, + {"rank", rank, PARAM_CHECK_MIN_VALUE_ZERO, rankSize - 1}, + {"coreNum", coreNum, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + }; + return CheckParamScopeList(paramCheckList); +} + +bool CheckLcalType(LcalType lcalType) +{ + if (lcalType < LcalType::PURE_MATMUL || lcalType >= LcalType::LCAL_TYPE_MAX) { + MKI_LOG(ERROR) << "The lcalType:" << int(lcalType) + << "must be in [" << int(LcalType::PURE_MATMUL) << ", " << int(LcalType::LCAL_TYPE_MAX) << ")!"; + return false; + } + return true; +} + +bool Check2DTPType(LcalType lcalType) +{ + return lcalType == LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; +} + +bool CheckCoCParamDesc(LcalType lcalType, const CoCParamDesc ¶mDesc) +{ + if (COC_TYPE2ELE_SIZE.find(paramDesc.dataTypeDesc) == COC_TYPE2ELE_SIZE.end()) { + MKI_LOG(ERROR) << "The dataTypeDesc:" << paramDesc.dataTypeDesc << " is not support yet!"; + return false; + } + if (paramDesc.op != HCCL_REDUCE_SUM) { + MKI_LOG(ERROR) << "The ReduceOp:" << paramDesc.op << "is not support yet!"; + return false; + } + + auto batchSize = paramDesc.mmInfo.batchSize; + auto m = paramDesc.mmInfo.m; + auto n = paramDesc.mmInfo.n; + auto k = paramDesc.mmInfo.k; + 
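/*
 * The checks below are table-driven: each entry is a named value with an
 * inclusive [min, max] range, validated by one helper. A minimal sketch of
 * the assumed shape of CheckParamScopeList (the entry type is hypothetical):
 *
 *   struct ParamScope { const char *name; int64_t value; int64_t min; int64_t max; };
 *
 *   bool CheckParamScopeList(const std::vector<ParamScope> &list)
 *   {
 *       for (const auto &p : list) {
 *           if (p.value < p.min || p.value > p.max) {
 *               MKI_LOG(ERROR) << "The " << p.name << ":" << p.value
 *                              << " must be in [" << p.min << ", " << p.max << "]!";
 *               return false;
 *           }
 *       }
 *       return true;
 *   }
 */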
std::vector> paramCheckList = { + {"batchSize", batchSize, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MIN_VALUE_ONE}, + {"m", m, INPUT_PARAM_DEFAULT_VALUE, MAX_M_VALUE}, + {"n", n, PARAM_CHECK_MIN_VALUE_ONE, MAX_N_VALUE}, + {"k", k, PARAM_CHECK_MIN_VALUE_ONE, MAX_K_VALUE}, + }; + if (Check2DTPType(lcalType)) { + auto agDim = paramDesc.twoDimTPInfo.agDim; + auto rsDim = paramDesc.twoDimTPInfo.rsDim; + paramCheckList.emplace_back("agDim", agDim, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE); + paramCheckList.emplace_back("rsDim", rsDim, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE); + } + + return CheckParamScope(paramCheckList); +} + +bool Lcoc::CheckInputParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc) const +{ + if (!CheckLcalComm(comm_)) { + return false; + } + if (!CheckLcalType(lcalType)) { + return false; + } + if (!CheckCoCTiling(tiling)) { + return false; + } + if (!CheckCoCParamDesc(lcalType, paramDesc)) { + return false; + } + return true; +} + +void Lcoc::SetTaskParam(LcalType lcalType, const CoCParamDesc ¶mDesc, const LcalComm &comm) +{ + taskParam_.rank = comm.GetRank(); + taskParam_.rankSize = comm.GetRankSize(); + taskParam_.blockDim = comm.GetPhysicalInfo().coreNum; + taskParam_.chipName = comm.GetPhysicalInfo().chipName; + taskParam_.cocParamDesc = paramDesc; + taskParam_.lcalType = lcalType; + taskParam_.bufferSize = comm.GetBufferSize(); +} + +void Lcoc::SetLcocParam(LcalType lcalType, const CoCParamDesc ¶mDesc) +{ + SetTaskParam(lcalType, paramDesc, *comm_); + tilingSuccess_ = false; +} + +CoCTilingFunc *CreateCoCTilingFunc(LcalType lcalType) +{ + bool isDeterministic = false; + const char *lcocDeterministic = Mki::GetEnv("LCCL_DETERMINISTIC"); + std::string lcocDeterministicStr = lcocDeterministic == nullptr ? 
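/*
 * Tiling selection honors the LCCL_DETERMINISTIC environment variable: "1" or
 * "true" picks the deterministic MatmulAllReduce tiling functor, anything
 * else the default one. Equivalent standalone check, assuming Mki::GetEnv
 * behaves like std::getenv:
 *
 *   const char *v = std::getenv("LCCL_DETERMINISTIC");
 *   bool deterministic =
 *       v != nullptr && (std::string(v) == "1" || std::string(v) == "true");
 */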
"" : lcocDeterministic; + if (lcocDeterministicStr == "1" || lcocDeterministicStr == "true") { + isDeterministic = true; + } + CoCTilingFunc *pTilingFunc = nullptr; + switch (lcalType) { + case LcalType::MATMUL_ALL_REDUCE: + if (isDeterministic) { + pTilingFunc = new (std::nothrow) CoCMatmulAllReduceDeterTilingFunc(); + } else { + pTilingFunc = new (std::nothrow) CoCMatmulAllReduceTilingFunc(); + } + break; + case LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER: + pTilingFunc = new (std::nothrow) CoCAllgatherMatmulReduceScatterTilingFunc(); + break; + default: + pTilingFunc = new (std::nothrow) CoCTilingFunc(); + } + return pTilingFunc; +} + +Lcoc::~Lcoc() {} + +Lcoc::Lcoc(LcalComm *comm) : comm_(comm) {} + +Lcoc::Lcoc(LcalComm &comm) : comm_(&comm) {} + +int Lcoc::SetParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc) +{ + if (!CheckInputParam(lcalType, tiling, paramDesc)) { + return LCAL_ERROR_PARA_CHECK_FAIL; + } + SetLcocParam(lcalType, paramDesc); + CoCTilingFunc *pTilingFunc = CreateCoCTilingFunc(lcalType); + if (pTilingFunc == nullptr) { + PrintErrorLog(lcalType, "Create CoCTilingFunc Failed!"); + return LCAL_ERROR_INTERNAL; + } + CoCTilingData tilingData = pTilingFunc->GenerateTiling(taskParam_, tiling); + bool tilingCheckRes = pTilingFunc->CheckTiling(taskParam_); + if (!tilingCheckRes) { + PrintErrorLog(lcalType, "Tiling check failed!"); + delete pTilingFunc; + pTilingFunc = nullptr; + return LCAL_ERROR_INTERNAL; + } + tiling_ = tilingData; + tilingSuccess_ = true; + delete pTilingFunc; + pTilingFunc = nullptr; + return LCAL_SUCCESS; +} + +int Lcoc::LaunchOperator(CoCInputPkg &inputPkg, CoCOutputPkg &outputPkg, void *workspace, aclrtStream stream) +{ + CoCKernelArgs args; + int error = args.SetFFTSAddr(); + if (error != LCAL_SUCCESS) { + return error; + } + auto paramDesc = taskParam_.cocParamDesc; + args.SetInputPkgArgs(inputPkg); + args.SetOutputPkgArgs(outputPkg); + args.SetWorkspacePtrArg(workspace); + args.SetParamDescArgs(paramDesc); + args.SetCommArgs(*comm_); + args.SetCoTilingDataArgs(tiling_); + MKI_LOG(DEBUG) << "[" << LCAL_TYPE2NAME.at(taskParam_.lcalType) << "]:" << args.ParamToString(); + return ComputeOverComm(taskParam_.lcalType, args, COC_TYPE2HCCL_TYPE.at(paramDesc.dataTypeDesc), stream); +} + +bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, cosnt CoCOutputPkg &outputPkg, LcalType lcalType) const +{ + if (!tilingSuccess_) { + std::string str = "Tiling error. 
Please check whether the 'Lcoc::SetParam' method has been called, " + "or verify if the tiling parameter is valid."; + PrintErrorLog(lcalType, str); + return false; + } + if (taskParam_.lcalType != lcalType) { + std::string str = "lcalType of Lcoc::SetParam doesn't match launch function."; + PrintErrorLog(lcalType, str); + return false; + } + if (COC_TYPE2HCCL_TYPE.find(taskParam_.cocParamDesc.dataTypeDesc) == COC_TYPE2HCCL_TYPE.end()) { + std::string str = "invalid dataTypeDesc"; + PrintErrorLog(lcalType, str); + return false; + } + if (inputPkg.matrixA == nullptr || inputPkg.matrixB == nullptr) { + std::string str = "inputPkg.matrixA or inputPkg.matrixB is nullptr"; + PrintErrorLog(lcalType, str); + return false; + } + return true; +} + +int Lcoc:MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) +{ + LcalType lcalType = LcalType::MATMUL_ALL_REDUCE; + if (!CheckBasic(inputPkg, outputPkg, lcalType)) { + return LCAL_ERROR_PARA_CHECK_FAIL; + } + ReportTiming report("LcocMatmulAllReduce", true); + return LaunchOperator(inputPkg, outputPkg, workspace, stream); +} + +LcalComm *Lcoc::GetComm() +{ + return comm_; +} + +MatMulInfo &Lcoc::GetMatMulInfo() +{ + return taskParam_.cocParamDesc.mmInfo; +} + +void Lcoc::GetTiling(CoCTiling &tiling) +{ + tiling = tiling_; +} + +bool IsMatrixAligned(const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign) +{ + return (transpose ? m : n) % nElemAlign == 0; +} + +int64_t Lcoc::GetWorkspaceSize() +{ + LcalType lcaltype = taskParam_.lcalType; + auto cocParamDesc = taskParam_.cocParamDesc; + bool isDeterministic = (GetComm()->GetCommArgs()->extraFlag & ExtraFlag::DETERMINISTIC) != 0; + CoCDataTypeDesc dataType = cocParamDesc.dataTypeDesc; + const MatMulInfo &mmInfo = cocParamDesc.mmInfo; + const QuantInfo &quantInfo = cocParamDesc.quantInfo; + const MoeInfo& moeInfo = cocParamDesc.moeInfo; + bool hasQuant = quantInfo.quantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED; + bool hasDequant = quantInfo.dequantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED; + int32_t eleSize = COC_TYPE2ELE_SIZE.at(dataType); + int32_t nElemAlign = Lcal::ALIGN_BYTES / eleSize; + int32_t mAlign = AlignUp(mmInfo.m, nElemAlign); + int32_t nAlign = AlignUp(mmInfo.n, nElemAlign); + int32_t kAlign = AlignUp(mmInfo.k, nElemAlign); + int32_t maxOutputSize = moeInfo.maxOutputSize; + + bool hasAAlign = hasQuant || (!IsMatrixAligned(mmInfo.m, mmInfo.k, mmInfo.transA, nElemAlign) && mmInfo.m != 1); + + bool hasBAlign = (!mmInfo.weightNz) && ((hasDequant && !mmInfo.isInt8) + || (!IsMatrixAligned(mmInfo.k, mmInfo.n, mmInfo.transB, nElemAlign))); + + int32_t accumRankSize = 0; + + bool hasAccum = dataType == COCDataTypeDesc::INT8INT8_INT32_BF16; + bool hasDequantParam = (quantInfo.dequantGranularity == QuantGranularity::PER_TOKEN || + quantInfo.dequantGranularity == QuantGranularity::PER_TENSOR); + bool hasFormatDequantScale = (quantInfo.dequantGranularity == QuantGranularity::PER_CHANNEL); + bool isMoe = false; + + bool isAlltoallVc = false; + + uint64_t dequantWorkSpaceSize = GetDequantWorkSpaceSize(lcalType, tiling_.withSerialMode, mmInfo.m, mmInfo.n, + tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim, maxOutputSize); + LcalWorkspaceInfo lcalWorkSpaceInfo = GetLcalWorkspaceInfo(0, mmInfo.batchSize, mmInfo.m, mmInfo.k, + mmInfo.n, mAlign, kAlign, nAlign, mmInfo.transA, mmInfo.transB, eleSize, hasAAlign, hasBAlign, + accumRankSize, hasAccum, 
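/*
 * All workspace sizing starts by rounding each matrix dimension up to a
 * 32-byte element boundary. Sketch of the assumed round-up helper plus a
 * worked example:
 *
 *   int32_t AlignUp(int32_t x, int32_t align)
 *   {
 *       return (x + align - 1) / align * align;
 *   }
 *   // fp16: eleSize = 2, so nElemAlign = 32 / 2 = 16 elements
 *   // m = 300  ->  mAlign = (300 + 15) / 16 * 16 = 304
 */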
dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic, + isMoe, isAlltoallVc, moeInfo.EP, moeInfo.local_expert_nums, maxOutputSize); + + MKI_LOG(DEBUG) << "[Lcoc Workspace]: " << "m=" << mmInfo.m << ",k=" << mmInfo.k << ", n=" << mmInfo.n + << ", mAlign=" << mAlign << ", kAlign=" << kAlign << ", nAlign=" << nAlign << ", transA=" << mmInfo.transA + << ", transB=" << mmInfo.transB << ", eleSize=" << eleSize << ", hasAAlign=" << hasAAlign + << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize << ", hasAccum=" << hasAccum + << ", dequantWorkSapceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam + << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic + << ", isMoe=" << isMoe << ", isAlltoallVc=" << isAlltoallVc << ", moeInfo.EP=" << static_cast(moeInfo.EP) + << ", maxOutputSize=" << maxOutputSize << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize; + return lcalWorkSpaceInfo.workspaceSize; +} + +} + -- Gitee From 9ba6faf89251f4c2d6ad23f991d25d12905c1084 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:06:23 +0800 Subject: [PATCH 105/414] 5 --- .../ascendc_kernels/datacopy_gm2gm_delay.h | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index 4b950c08..22e2e2ae 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -102,8 +102,8 @@ public: int32_t perRankNum = CeilDiv(perRankNumRemain, rankCount); perRankNumRemain -= perRankNum; PipeBarrier(); - AscendC::SetFlag(eventId); - AscendC::WaitFlag(eventId); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); PipeBarrier(); Cast((idx & 1) ? workUBTensor[0] : workUBTensor[1], (idx & 1) ? inTensor[0][j * WORK_BLOCK_NUM] : inTensor[1][j * WORK_BLOCK_NUM], RoundMode::CAST_NONE, perRankNum); @@ -145,7 +145,7 @@ public: AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? 
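/*
 * In the final form of this branch the fp16 accumulator is not written out
 * directly: it is rescaled by the single global scale (Muls) and cast back to
 * the 8-bit type, staged in inTensor, before the MTE3 copy, which is why the
 * hunk below redirects the copy source from outputUBTensor to inTensor.
 * Scalar model of the requantization (an interpretation; names illustrative):
 *
 *   int8_t Requant(float acc, float globalScale)
 *   {
 *       return (int8_t)(acc * globalScale);   // Muls, then Cast
 *   }
 */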
- outputUBTensor[0] : outputUBTensor[1], calCount * sizeof(V)); + inTensor[0] : inTensor[1], calCount * sizeof(V)); } } } @@ -166,7 +166,7 @@ public: FORCE_INLINE_AICORE void Process() { PreProcess(); - int numofPiece = CeilDiv(calNum, BLOCK_NUM); + int numOfPiece = CeilDiv(calNum, BLOCK_NUM); AscendC::SetFlag(EVENT_ID0); AscendC::SetFlag(EVENT_ID1); AscendC::SetFlag(EVENT_ID0); @@ -192,7 +192,7 @@ public: AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); LoopUncastAndMul(idx, index, eventId); - Mte3Process(idx, index, eventId); + Mte3Process(idx, index, calCount, eventId); AscendC::SetFlag(eventId); AscendC::SetFlag(eventId); AscendC::SetFlag(eventId); @@ -228,16 +228,16 @@ private: LocalTensor inTensor[2]; LocalTensor singleScaleUUBTensor[2]; - LocalTensor singleScaleUBTensor[2]; + LocalTensor singleScaleUBTensor[2]; LocalTensor scaleUUBTensor[2]; - LocalTensor scaleUBTensor[2]; - LocalTensor workUBTensor[2]; - LocalTensor outputUBTensor[2]; + LocalTensor scaleUBTensor[2]; + LocalTensor workUBTensor[2]; + LocalTensor outputUBTensor[2]; GlobalTensor outputGt; - GLobalTensor inputGt[8]; - GLobalTensor inputScaleGt[8]; - GLobalTensor outScaleGt; + GlobalTensor inputGt[8]; + GlobalTensor inputScaleGt[8]; + GlobalTensor outScaleGt; }; #endif // LCCL_DATACOPY_GM2GM_DELAYH -- Gitee From 3078c43d5a7d5689dd274b191ea9c3bac2e2b83d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:08:47 +0800 Subject: [PATCH 106/414] 6 --- .../ascendc_kernels/datacopy_gm2gm_delay.h | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index 22e2e2ae..13562cd6 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -18,19 +18,18 @@ template class DataCopyGM2GMDelay { constexpr static int64_t THREE_NUM = 3; constexpr static int64_t FOUR_NUM = 4; - constexpr static int64_t WORK_OFFSET = 8192; - constexpr static int64_t WORK_BLOCK_NUM = WORK_OFFSET / sizeof(T); - constexpr static int64_t UB_HEAD_OFFSET = WORK_OFFSET * 2; - constexpr static int64_t SCALE_SIZE = 32; - constexpr static int64_t SCALE_NUM = SCALE_SIZE / sizeof(T); - constexpr static int64_t SINGLE_SCALE_SIZE = 2; - constexpr static int64_t BLOCK_NUM = (UB_SINGLE_DMA_SIZE_MAX - WORK_OFFSET * 2 - SCALE_SIZE * 4) / 2 / + constexpr static int32_t WORK_OFFSET = 8192; + constexpr static int32_t WORK_BLOCK_NUM = WORK_OFFSET / sizeof(T); + constexpr static int32_t UB_HEAD_OFFSET = WORK_OFFSET * 2; + constexpr static int32_t SCALE_SIZE = 32; + constexpr static int32_t SCALE_NUM = SCALE_SIZE / sizeof(T); + constexpr static int32_t SINGLE_SCALE_SIZE = 2; + constexpr static int32_t BLOCK_NUM = (UB_SINGLE_DMA_SIZE_MAX - WORK_OFFSET * 2 - SCALE_SIZE * 4) / 2 / (sizeof(U) + sizeof(T)) / ALIGN_SIZE * ALIGN_SIZE; - constexpr static int64_t IN_BLOCKSIZE = BLOCK_NUM * sizeof(U); -} + constexpr static int32_t IN_BLOCKSIZE = BLOCK_NUM * sizeof(U); public: - FOECE_INLINE_AICORE DataCopyGM2GMDelay() {} + FORCE_INLINE_AICORE DataCopyGM2GMDelay() {} FORCE_INLINE_AICORE void Init(GlobalTensor& outputGt, GlobalTensor (&inputGt)[8], GlobalTensor (&inputScaleGt)[8], const uint32_t calNum, int rankCount, GlobalTensor& outScaleGt, @@ -50,11 +49,11 @@ public: singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * 
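/*
 * These constants carve one Unified Buffer into double-buffered input, work,
 * output and scale regions. Worked example of the BLOCK_NUM formula, assuming
 * UB_SINGLE_DMA_SIZE_MAX = 196608 (192 KiB), U = int8_t, T = half and
 * ALIGN_SIZE = 32:
 *
 *   usable       = 196608 - 8192 * 2 - 32 * 4 = 180096 bytes
 *   BLOCK_NUM    = 180096 / 2 / (1 + 2) = 30016 elements  (multiple of 32)
 *   IN_BLOCKSIZE = 30016 * sizeof(int8_t) = 30016 bytes per piece
 */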
FOUR_NUM); - workUBTENSOR[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM); - workUBTENSOR[1] = tbuf.GetWithOffset(WORK_BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * FOUR_NUM + + workUBTensor[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM); + workUBTensor[1] = tbuf.GetWithOffset(WORK_BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * FOUR_NUM + IN_BLOCKSIZE * FOUR_NUM); - outputUBTENSOR[0] = tbuf.GetWithOffset(BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM + WORK_OFFSET); - outputUBTENSOR[1] = tbuf.GetWithOffset(BLOCK_NUM, WORK_OFFSET * HALF_NUM + SCALE_SIZE * FOUR_NUM + + outputUBTensor[0] = tbuf.GetWithOffset(BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM + WORK_OFFSET); + outputUBTensor[1] = tbuf.GetWithOffset(BLOCK_NUM, WORK_OFFSET * HALF_NUM + SCALE_SIZE * FOUR_NUM + IN_BLOCKSIZE * FOUR_NUM); this->rankCount = rankCount; totalDataSize = calNum * sizeof(U); @@ -65,9 +64,9 @@ public: FORCE_INLINE_AICORE void PreProcess() { for (int index = 0; index < rankCount; index++) { - DataCopyWrap(scaleUUBTENSOR[0][indedx * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE); + DataCopyWrap(scaleUUBTensor[0][index * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE); pipe_barrier(PIPE_ALL); - DataCopyWrap(scaleUUBTensor[1][index * SCALE / sizeof(U)], inputScaleGt[index], SCALE_SIZE); + DataCopyWrap(scaleUUBTensor[1][index * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE); pipe_barrier(PIPE_ALL); } for (int index = 0; index < rankCount; index++) { -- Gitee From 84ea0c7974b85214dfdfd09247635f58c0e951ad Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:11:25 +0800 Subject: [PATCH 107/414] 7 --- comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index 13562cd6..5463f446 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -46,8 +46,8 @@ public: singleScaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); singleScaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * FOUR_NUM); - singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); - singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + + scaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); + scaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * FOUR_NUM); workUBTensor[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM); workUBTensor[1] = tbuf.GetWithOffset(WORK_BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * FOUR_NUM + @@ -79,7 +79,7 @@ public: } Div(scaleUBTensor[1], outputUBTensor[0], scaleUBTensor[1], rankCount); AscendC::PipeBarrier(); - ReduceMin(singleScaleUBTensor[0], singleScaleUUBTensor[0], + ReduceMin(singleScaleUBTensor[0], scaleUBTensor[0], workUBTensor[1][WORK_BLOCK_NUM / HALF_NUM], rankCount, false); pipe_barrier(PIPE_ALL); DataCopyWrap(outScaleGt, singleScaleUUBTensor[0], sizeof(T)); @@ -96,7 +96,7 @@ public: PipeBarrier(); for (int j = 0; perRankNumRemain > 0; j++) { PipeBarrier(); - perRankNum - perRankNumRemain >= WORK_BLOCK_NUM ? WORK_BLOCKNUM : perRankNumRemain; + perRankNum = perRankNumRemain >= WORK_BLOCK_NUM ? 
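/*
 * After this fix LoopUncastAndMul is a plain chunked loop: each pass handles
 * min(perRankNumRemain, WORK_BLOCK_NUM) elements until the piece is drained.
 * Equivalent skeleton:
 *
 *   int32_t remain = calCount;
 *   for (int j = 0; remain > 0; ++j) {
 *       int32_t cur = (remain >= WORK_BLOCK_NUM) ? WORK_BLOCK_NUM : remain;
 *       // cast + scale cur elements at offset j * WORK_BLOCK_NUM
 *       remain -= cur;
 *   }
 */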
WORK_BLOCKNUM : perRankNumRemain; PipeBarrier(); int32_t perRankNum = CeilDiv(perRankNumRemain, rankCount); perRankNumRemain -= perRankNum; -- Gitee From 0443c0ff92aaaa2025199eb97130306a4e2c23d6 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:14:43 +0800 Subject: [PATCH 108/414] 9 --- comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index 5463f446..5e624c6e 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -46,6 +46,9 @@ public: singleScaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); singleScaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * FOUR_NUM); + singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); + singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + + IN_BLOCKSIZE * FOUR_NUM); scaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); scaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * FOUR_NUM); @@ -96,9 +99,9 @@ public: PipeBarrier(); for (int j = 0; perRankNumRemain > 0; j++) { PipeBarrier(); - perRankNum = perRankNumRemain >= WORK_BLOCK_NUM ? WORK_BLOCKNUM : perRankNumRemain; + perRankNum = perRankNumRemain >= WORK_BLOCK_NUM ? WORK_BLOCK_NUM : perRankNumRemain; PipeBarrier(); - int32_t perRankNum = CeilDiv(perRankNumRemain, rankCount); + perRankNumRemain -= perRankNum; PipeBarrier(); AscendC::SetFlag(eventId); @@ -139,6 +142,8 @@ public: Muls((idx & 1) ? outputUBTensor[0] : outputUBTensor[1], (idx & 1) ? outputUBTensor[0] : outputUBTensor[1], scaleValue, calCount); PipeBarrier(); + Cast((idx & 1) ? inTensor[0] : inTensor[1], (idx & 1) ? 
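/*
 * Patch 108 adds this Cast back to the 8-bit input type after the final Muls,
 * and patch 109 follows up with the missing PIPE_V barrier: Cast reads the
 * buffer Muls writes, and both run on the vector pipe, so the dependency must
 * be ordered explicitly. The chain being enforced is simply:
 *
 *   Muls(acc, acc, scaleValue, n);   // V pipe: writes acc
 *   PipeBarrier();                   // order back-to-back V ops
 *   Cast(out8, acc, mode, n);        // V pipe: reads acc
 */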
+ outputUBTensor[0] : outputUBTensor[1], RoundMode::CAST_NONE, calCount); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); AscendC::SetFlag(eventId); -- Gitee From 05072df8406dd38240eb72f547f824f167bab298 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 16:16:47 +0800 Subject: [PATCH 109/414] 1 --- comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index 5e624c6e..a0419016 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -46,9 +46,12 @@ public: singleScaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); singleScaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * FOUR_NUM); - singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); - singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + + singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); + singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * FOUR_NUM); + scaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); + scaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + + IN_BLOCKSIZE * FOUR_NUM); scaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); scaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * FOUR_NUM); @@ -144,6 +147,7 @@ public: PipeBarrier(); Cast((idx & 1) ? inTensor[0] : inTensor[1], (idx & 1) ? outputUBTensor[0] : outputUBTensor[1], RoundMode::CAST_NONE, calCount); + PipeBarrier(); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); AscendC::SetFlag(eventId); -- Gitee From de8d9f91167781fc5adee9a95ebed1b36c1f9eba Mon Sep 17 00:00:00 2001 From: Denver Date: Fri, 22 Aug 2025 16:25:26 +0800 Subject: [PATCH 110/414] fix some error --- src/lcoc.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/lcoc.cpp b/src/lcoc.cpp index 7f1e2e5b..2dd022e5 100644 --- a/src/lcoc.cpp +++ b/src/lcoc.cpp @@ -10,6 +10,7 @@ #include #include "lcal_internal.h" #include "mki/utils/log/log.h" +#include "mki/utils/env/env.h" #include "profiling/report_timing.h" #include "runtime/rt_ffts.h" @@ -77,7 +78,7 @@ bool CheckCoCParamDesc(LcalType lcalType, const CoCParamDesc ¶mDesc) paramCheckList.emplace_back("rsDim", rsDim, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE); } - return CheckParamScope(paramCheckList); + return CheckParamScopeList(paramCheckList); } bool Lcoc::CheckInputParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc) const @@ -185,12 +186,12 @@ int Lcoc::LaunchOperator(CoCInputPkg &inputPkg, CoCOutputPkg &outputPkg, void *w args.SetWorkspacePtrArg(workspace); args.SetParamDescArgs(paramDesc); args.SetCommArgs(*comm_); - args.SetCoTilingDataArgs(tiling_); + args.SetCoCTilingDataArgs(tiling_); MKI_LOG(DEBUG) << "[" << LCAL_TYPE2NAME.at(taskParam_.lcalType) << "]:" << args.ParamToString(); return ComputeOverComm(taskParam_.lcalType, args, COC_TYPE2HCCL_TYPE.at(paramDesc.dataTypeDesc), stream); } -bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, cosnt CoCOutputPkg &outputPkg, LcalType lcalType) const +bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg, 
LcalType lcalType) const { if (!tilingSuccess_) { std::string str = "Tiling error. Please check whether the 'Lcoc::SetParam' method has been called, " @@ -216,7 +217,7 @@ bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, cosnt CoCOutputPkg &outputPkg return true; } -int Lcoc:MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) +int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) { LcalType lcalType = LcalType::MATMUL_ALL_REDUCE; if (!CheckBasic(inputPkg, outputPkg, lcalType)) { @@ -226,6 +227,16 @@ int Lcoc:MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *wor return LaunchOperator(inputPkg, outputPkg, workspace, stream); } +int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) +{ + LcalType lcalType = LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; + if (!CheckBasic(inputPkg, outputPkg, lcalType)) { + return LCAL_ERROR_PARA_CHECK_FAIL; + } + ReportTiming report("LcocAllGatherMatmulReduceScatter", true); + return LaunchOperator(inputPkg, outputPkg, workspace, stream); +} + LcalComm *Lcoc::GetComm() { return comm_; @@ -290,9 +301,10 @@ int64_t Lcoc::GetWorkspaceSize() << ", mAlign=" << mAlign << ", kAlign=" << kAlign << ", nAlign=" << nAlign << ", transA=" << mmInfo.transA << ", transB=" << mmInfo.transB << ", eleSize=" << eleSize << ", hasAAlign=" << hasAAlign << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize << ", hasAccum=" << hasAccum - << ", dequantWorkSapceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam + << ", dequantWorkSpaceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic << ", isMoe=" << isMoe << ", isAlltoallVc=" << isAlltoallVc << ", moeInfo.EP=" << static_cast(moeInfo.EP) + << ", moeInfo.local_expert_nums=" << moeInfo.local_expert_nums << ", maxOutputSize=" << maxOutputSize << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize; return lcalWorkSpaceInfo.workspaceSize; } -- Gitee From a45872a516df0abe60e0a585e0c48c020b3320fc Mon Sep 17 00:00:00 2001 From: Denver Date: Fri, 22 Aug 2025 20:14:22 +0800 Subject: [PATCH 111/414] add class BasePadder --- src/kernels/coc_preprocessor.cce | 249 +++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 src/kernels/coc_preprocessor.cce diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce new file mode 100644 index 00000000..d99969c1 --- /dev/null +++ b/src/kernels/coc_preprocessor.cce @@ -0,0 +1,249 @@ +#ifndef __COC_PREPROCESSOR__ +#define __COC_PREPROCESSOR__ + +#ifdef __DAV_C220_VEC__ + +#include +#include "coc_internal.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class BasePadder { +public: + class LoopIter { + public: + inline __aicore__ LoopIter(int32_t batch_size, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) : + batch_size(batch_size), n_rows(n_rows), n_cols(n_cols), n_cols_aligned(n_cols_aligned) + { + int32_t align_core_num = get_block_num() * get_subblockdim(); + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); + int32_t n_rows_per_core_base = n_rows / align_core_num; + int32_t n_rows_remainder = n_rows % align_core_num; + int32_t row_offset_base = align_core_idx * n_rows_per_core_base; + if (align_core_idx < n_rows_remainder) { + n_rows_this_core = 
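/*
 * LoopIter splits rows across all vector cores, spreading the remainder one
 * row each to the lowest-indexed cores, so slices differ by at most one row.
 * Standalone sketch of the same partition:
 *
 *   void Partition(int rows, int cores, int idx, int &myRows, int &myOffset)
 *   {
 *       int base = rows / cores;
 *       int rem = rows % cores;
 *       myRows = base + (idx < rem ? 1 : 0);
 *       myOffset = idx * base + (idx < rem ? idx : rem);
 *   }
 */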
n_rows_per_core_base + 1;
+                row_offset_this_core = row_offset_base + align_core_idx;
+            } else {
+                n_rows_this_core = n_rows_per_core_base;
+                row_offset_this_core = row_offset_base + n_rows_remainder;
+            }
+            n_cols_this_core = n_cols;
+            col_offset_this_core = 0;
+
+            src_core_offset = 1LL * row_offset_this_core * n_cols;
+            dst_core_offset = 1LL * row_offset_this_core * n_cols_aligned;
+        }
+
+        inline __aicore__ void InitBatchLoop()
+        {
+            batch_idx = 0;
+
+            src_batch_offset = 0;
+            dst_batch_offset = 0;
+        }
+
+        inline __aicore__ bool EndBatchLoop() const
+        {
+            return batch_idx == batch_size;
+        }
+
+        inline __aicore__ void NextBatchLoop()
+        {
+            ++batch_idx;
+            if (EndBatchLoop()) {
+                return;
+            }
+
+            src_batch_offset = batch_idx * n_rows * n_cols;
+            dst_batch_offset = batch_idx * n_rows * n_cols_aligned;
+        }
+
+        inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop)
+        {
+            this->max_rows_per_loop = max_rows_per_loop;
+            n_rows_complete = 0;
+            src_row_loop_offset = 0;
+            dst_row_loop_offset = 0;
+
+            n_rows_this_loop = (n_rows_this_core < max_rows_per_loop) ? n_rows_this_core : max_rows_per_loop;
+        }
+
+        inline __aicore__ bool EndRowLoop() const
+        {
+            return n_rows_complete == n_rows_this_core;
+        }
+
+        inline __aicore__ void NextRowLoop()
+        {
+            n_rows_complete += n_rows_this_loop;
+            if (EndRowLoop()) {
+                return;
+            }
+
+            if (n_rows_complete + n_rows_this_loop > n_rows_this_core) {
+                n_rows_this_loop = n_rows_this_core - n_rows_complete;
+            }
+            src_row_loop_offset = n_rows_complete * n_cols;
+            dst_row_loop_offset = n_rows_complete * n_cols_aligned;
+        }
+
+        inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) {
+            this->max_cols_per_loop = max_cols_per_loop;
+            n_cols_complete = 0;
+            col_loop_offset = 0;
+
+            n_cols_this_loop = (n_cols < max_cols_per_loop) ? 
n_cols : max_cols_per_loop;
+        }
+
+        inline __aicore__ bool EndColLoop() const
+        {
+            return n_cols_complete == n_cols_this_core;
+        }
+
+        inline __aicore__ void NextColLoop()
+        {
+            n_cols_complete += n_cols_this_loop;
+            if (EndColLoop()) {
+                return;
+            }
+
+            if (n_cols_complete + n_cols_this_loop > n_cols_this_core) {
+                n_cols_this_loop = n_cols_this_core - n_cols_complete;
+            }
+            col_loop_offset = n_cols_complete;
+        }
+
+        inline __aicore__ int64_t src_offset() const
+        {
+            return src_core_offset + src_batch_offset + src_row_loop_offset + col_loop_offset;
+        }
+
+        inline __aicore__ int64_t dst_offset() const
+        {
+            return dst_core_offset + dst_batch_offset + dst_row_loop_offset + col_loop_offset;
+        }
+
+        int32_t batch_size;
+        int32_t n_rows;
+        int32_t n_cols;
+        int32_t n_cols_aligned;
+
+        int32_t n_rows_this_core;
+        int32_t n_cols_this_core;
+        int32_t row_offset_this_core;
+        int32_t col_offset_this_core;
+
+        int32_t max_rows_per_loop;
+        int32_t max_cols_per_loop;
+
+        int32_t batch_idx;
+        int32_t n_rows_complete;
+        int32_t n_cols_complete;
+
+        int32_t n_rows_this_loop;
+        int32_t n_cols_this_loop;
+
+        int64_t src_core_offset;
+        int64_t dst_core_offset;
+        int64_t src_batch_offset;
+        int64_t dst_batch_offset;
+        int64_t src_row_loop_offset;
+        int64_t dst_row_loop_offset;
+        int64_t col_loop_offset;
+    };
+
+    __aicore__ explicit BasePadder() = default;
+
+    inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info,
+        int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b)
+    {
+        this->gm_a = reinterpret_cast<__gm__ LhsDtype *>(gm_a);
+        this->gm_b = reinterpret_cast<__gm__ RhsDtype *>(gm_b);
+
+        this->batch_size = batch_size;
+        this->m = m;
+        this->k = k;
+        this->n = n;
+        this->trans_a = trans_a;
+        this->trans_b = trans_b;
+
+        this->m_align = m_align;
+        this->k_align = k_align;
+        this->n_align = n_align;
+
+        this->aligned_a = aligned_a;
+        this->aligned_b = aligned_b;
+
+        gm_a_align = reinterpret_cast<__gm__ MmadDtype *>(workspace_info.gm_a_align ? workspace_info.gm_a_align : gm_a);
+        gm_b_align = reinterpret_cast<__gm__ MmadDtype *>(workspace_info.gm_b_align ? workspace_info.gm_b_align : gm_b);
+    }
+
+protected:
+    inline __aicore__ void PadMatrix(__gm__ MmadDtype *gm_dst, __gm__ MmadDtype *gm_src,
+        int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned)
+    {
+        LoopIter it(batch_size, n_rows, n_cols, n_cols_aligned);
+
+        const int32_t MAX_LEN = Block32B::AlignDown(MAX_UB_BUFF / sizeof(MmadDtype));
+        int32_t n_cols_round = Block32B::AlignUp(n_cols);
+        int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1;
+        int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
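/*
 * PadMatrix derives its tile shape from whether one 32B-rounded row fits in
 * UB: if it does, as many whole rows as fit are moved per DMA; otherwise each
 * row is walked in MAX_LEN-column chunks. The gap arguments then express the
 * re-layout as strided copies:
 *
 *   src gap = n_cols         - n_cols_this_loop   // packed source rows
 *   dst gap = n_cols_aligned - n_cols_this_loop   // padded destination rows
 */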
n_cols : MAX_LEN; + + auto ub_base = reinterpret_cast<__ubuf__ MmadDtype *>((uintptr_t)0); + + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + CopyGmToUbufAlign(ub_base, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + CopyUbufToGmAlign(dst, ub_base, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + } + } + } + + inline __aicore__ void Barrier() + { + FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID); + WaitEvent(AIV_FINISH_ALIGN_FLAG_ID); + + FFTSCrossCoreSync(2, AIC_WAIT_AIV_FINISH_FLAG_ID); + PipeBarrier(); + } + + __gm__ LhsDtype *__restrict__ gm_a{ nullptr }; + __gm__ RhsDtype *__restrict__ gm_b{ nullptr }; + __gm__ MmadDtype *__restrict__ gm_a_align{ nullptr }; + __gm__ MmadDtype *__restrict__ gm_b_align{ nullptr }; + + int32_t batch_size; + + int32_t m_align; + int32_t n_align; + int32_t k_align; + + int32_t m; + int32_t n; + int32_t k; + + bool trans_a; + bool trans_b; + + int32_t aligned_a; + int32_t aligned_b; + + LcalWorkspaceInfo workspace_info; +}; + -- Gitee From c7881b0e87d5cb6a4ae9d8e52f45f949d8658095 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 22 Aug 2025 22:24:01 +0800 Subject: [PATCH 112/414] 1 --- comm/lcal/src/ascendc_kernels/ipc_queue.h | 2 +- .../src/ascendc_kernels/sync_collectives.h | 301 +++++++++++++++++- 2 files changed, 301 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h index 2543176f..068232d1 100644 --- a/comm/lcal/src/ascendc_kernels/ipc_queue.h +++ b/comm/lcal/src/ascendc_kernels/ipc_queue.h @@ -120,4 +120,4 @@ private: SyncCollectives *sync; int blockIdx; }; - +#endif // LCCL_IPC_QUEUE_H diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index 9a893c3a..b21b1047 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -6,4 +6,303 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ + +#ifndef LCCL_SYNC_H +#define LCCL_SYNC_H + +#include "comm_args.h" + +using namespace AscendC; +using namespace Lcal; + +constexpr int64_t FLAG_UNIT_INT_NUM = 4; +constexpr int64_t SYNC_UNIT_SIZE = FLAG_UNIT_INT_NUM * sizeof(int64_t); +constexpr int64_t MAGIC_OFFSET = 32; +constexpr int64_t MAGIC_MASK = ~((1LL << MAGIC_OFFSET) - 1); +#ifdef ENABLE_LCCL_MIX +constexpr int32_t LCAL_BLOCK_NUM_MULTI = 2; +#else +constexpr int32_t LCAL_BLOCK_NUM_MULTI = 1; +#endif + +class SyncCollectives { +public: + __aicore__ inline SyncCollectives() {} + + __aicore__ inline void Init(int rank, int rankSize, GM_ADDR *shareAddrs) + { + this->rank = rank; + this->rankSize = rankSize; + this->shareAddrs = shareAddrs; + this->blockIdx = GetBlockIdx(); + this->blockNum = GetBlockNum() * LCAL_BLOCK_NUM_MULTI; + segmentCount = GetBlockNum() * LCAL_BLOCK_NUM_MULTI * FLAG_UNIT_INT_NUM; + localSyncAddr = (__gm__ int64_t*)(shareAddrs[rank]); + basicSyncAddr = (__gm__ int64_t*)(shareAddrs[rank] + GetBlockIdx() * FLAG_UNIT_INT_NUM); + blockOuterSyncAddr = (__gm__ int64_t*)(shareAddrs[rank] + segmentCount + GetBlockIdx() * FLAG_UNIT_INT_NUM); + TPipe pipe; + pipe.InitBuffer(tBuf, GetBlockNum() * SYNC_UNIT_SIZE); + } + + __aicore__ inline void SetSyncFlag(int32_t magic, int32_t value, int32_t eventID) + { + int64_t v = MergeMagicWithValue(magic, value); + SetFlag(localSyncAddr + eventID * FLAG_UNIT_INT_NUM, v); + } + + __aicore__ inline void WaitSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank, + int32_t breakCycle = 0) + { + int64_t v = MergeMagicWithValue(magic, value); + WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, 1, v, breakCycle); + } + + __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID) + { + int64_t v = MergeMagicWithValue(magic, eventID); + SetFlag(basicSyncAddr, value); + } + __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) + { + int64_t v = MergeMagicWithValue(magic, eventID); + SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, value); + } + + __aicore__ inline void WaitInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t waitBlock) + { + int64_t value = MergeMagicWithValue(magic, eventID); + WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + eventID * FLAG_UNIT_INT_NUM, 1, value); + } + + __aicore__ inline void WaitInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); + } + + __aicore__ inline void CheckRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); + } + + __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID) + { + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag(blockOuterSyncAddr, value); + } + + __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID) + { + __gm__ int64_t *flagAddr = GetOuterFlagAddr(setRank, setBlock); + int64_t value = MergeMagicWithValue(magic, eventID); + SetFlag(flagAddr, value); + } + + __aicore__ inline void WaitOuterFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t waitBlock) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr = GetOuterFlagAddr(waitRank, waitBlock); + WaitOneRankPartFlag(flagAddr, 
1, value); + } + + __aicore__ inline void WaitOneRankOuterFlag(int32_t magic, int32_t eventID, int64_t rank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + flagAddr = GetOuterFlagAddr(rank, 0); + WaitOneRankPartFlag(flagAddr, blockNum, value); + } + __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID, int64_t rank) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + for (auto r = 0; r < rankSize; ++r) { + waitRank = (rank + r) % rankSize; + flagAddr = GetOuterFlagAddr(waitRank, blockNum); + WaitOneRankPartFlag(flagAddr, flagNum, value); + } + } + + __aicore__ inline bool CheckAllRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, + int64_t flagNum) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + for (auto r = 0; r < rankSize; ++r) { + waitRank = (rank + r) % rankSize; + flagAddr = GetOuterFlagAddr(waitRank, startBlock); + if (!CheckOneRankPartFlag(flagAddr, flagNum, value)) { + return false; + } + } + return true; + } + + __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID) + { + WaitAllRankPartOuterFlag(magic, eventID, 0, blockNum); + } + + __aicore__ inline void CheckAllRankOuterFlag(int32_t magic, int32_t eventID) + { + return CheckAllRankPartOuterFlag(magic, eventID, 0, blockNum); + } + + __aicore__ inline void SetFlag(__gm__ int64_t* setAddr, int64_t setValue) + { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + GlobalTensor globalSet; + globalSet.SetGlobalBuffer(setAddr, FLAG_UNIT_INT_NUM); + LocalTensor localSet = tBuf.GetWithOffset(1, 0); + localSet.SetValue(0, setValue); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataCopy(globalSet, localSet, FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + tBuf.FreeTensor(localSet); + } + + __aicore__ inline void WaitFlag(__gm__ int64_t* waitAddr, int64_t waitValue) + { + WaitOneRankPartFlag(waitAddr, 1, waitValue); + } + + __aicore__ inline int64_t GetFlag(__gm__ int64_t* waitAddr) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(1, 0); + DataCopy(localWait, globalWait, FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + int64_t res = localWait.GetValue(0); + tBuf.FreeTensor(localWait); + return res; + } + + + __aicore__ inline void WaitOneRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t waitRank, + int64_t startBlock, int64_t flagNum) + { + int64_t value = MergeMagicWithValue(magic, eventID); + __gm__ int64_t *flagAddr; + flagAddr = GetOuterFlagAddr(waitRank, startBlock); + WaitOneRankPartFlag(flagAddr, flagNum, value); + } + + __aicore__ inline int64_t GetInnerFlag(int64_t waitRank, int64_t waitBlock) + { + return GetFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + waitBlock * FLAG_UNIT_INT_NUM); + } + + __aicore__ inline int64_t GetOuterFlag(int64_t waitRank, int64_t waitBlock) + { + return GetFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + segmentCount + waitBlock * FLAG_UNIT_INT_NUM); + } + +private: + __aicore__ inline int64_t MergeMagicWithValue(int32_t magic, int32_t value) + { + return (static_cast(magic) << 32) | MAGIC_OFFSET | static_casat(value); + } + + __aicore__ inline __gm__ int64_t* GetInnerFlagAddr(int64_t flagRank, int64_t flagBlock) + { + return (__gm__ int64_t*)(shareAddrs[flagRank]) + 
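/*
 * Layout of each rank's shared flag segment: blockNum "inner" flags first,
 * then blockNum "outer" flags, each flag padded to 32 bytes
 * (FLAG_UNIT_INT_NUM int64s) so every core owns a disjoint aligned slot.
 * Address arithmetic in int64 units:
 *
 *   inner(rank, block) = shareAddrs[rank] + block * FLAG_UNIT_INT_NUM
 *   outer(rank, block) = shareAddrs[rank] + segmentCount + block * FLAG_UNIT_INT_NUM
 *   // segmentCount = blockNum * FLAG_UNIT_INT_NUM
 */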
flagBlock * FLAG_UNIT_INT_NUM; + } + + __aicore__ inline __gm__ int64_t* GetOuterFlagAddr(int64_t flagRank, int64_t flagBlock) + { + return (__gm__ int64_t*)(shareAddrs[flagRank]) + segment + flagBlock * FLAG_UNIT_INT_NUM; + } + + __aicore__ inline void WaitOneRankPartFlag(__gm__ int64_t* waitAddr, int64_t flagNum, int64_t checkValue, + int32_t breakCycle = 0) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAGUNIT_INT_NUM, 0); + bool isSync = true; + do { + if (breakCycle > 0) { + int64_t systemCycleBefore = AscendC::GetSystemCycle(); + int64_t systemCycleAfter = AscendC::GetSystemCycle(); + while (systemCycleAfter - systemCycleBefore < breakCycle) { + systemCycleAfter = AscendC::GetSystemCycle(); + }; + } + DataCopy(localWait, globalWait, flagNum * FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + isSync = true; + for (auto i = 0; i < flagNum; ++i) { + int64_t v = localWait.GetValue(i * FLAG_UNIT_INT_NUM); + if ((v & MAGIC_MASK) != (checkValue & MAGIC_MASK) || v < checkValue) { + isSync = false; + break; + } + } + } while (!isSync); + tBuf.FreeTensor(localWait); + } + + __aicore__ inline void WaitOneRankAllFlag(__gm__ int64_t* waitAddr, int64_t checkValue) + { + WaitOneRankPartFlag(waitAddr, blockNum, checkValue); + } + + __aicore__ inline bool CheckOneRankPartFlag(__gm__ int64_t* waitAddr, int64_t flagNum, int64_t checkValue) + { + GlobalTensor globalWait; + globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAGUNIT_INT_NUM, 0); + DataCopy(localWait, globalWait, flagNum * FLAG_UNIT_INT_NUM); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + bool isSync = true; + for (auto i = 0; i < flagNum; ++i) { + int64_t v = localWait.GetValue(i * FLAG_UNIT_INT_NUM); + if ((v & MAGIC_MASK) != (checkValue & MAGIC_MASK) || v < checkValue) { + isSync = false; + break; + } + } + tBuf.FreeTensor(localWait); + return isSync; + } + + __aicore__ inline bool CheckOneRankAllFlag(__gm__ int64_t* waitAddr, int64_t checkValue) + { + return CheckOneRankPartFlag(waitAddr, blockNum, checkValue); + } + + int rank; + int rankSize; + int blockIdx; + int blockNum; + GM_ADDR *shareAddrs; + int64_t segmentCount; + __gm__ int64_t* localSyncAddr; + __gm__ int64_t* basicSyncAddr; + __gm__ int64_t* blockOuterSyncAddr; + TBuf tBuf; +}; + +#endif // LCCL_SYNC_H + + + + + +} + + + + -- Gitee From 961b14225aae40a27129d623f27fa6c69510954e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 08:57:17 +0800 Subject: [PATCH 113/414] 2 --- .../src/ascendc_kernels/sync_collectives.h | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index b21b1047..22a6ad9c 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -39,8 +39,8 @@ public: this->blockNum = GetBlockNum() * LCAL_BLOCK_NUM_MULTI; segmentCount = GetBlockNum() * LCAL_BLOCK_NUM_MULTI * FLAG_UNIT_INT_NUM; localSyncAddr = (__gm__ int64_t*)(shareAddrs[rank]); - basicSyncAddr = (__gm__ int64_t*)(shareAddrs[rank] + GetBlockIdx() * FLAG_UNIT_INT_NUM); - blockOuterSyncAddr = (__gm__ int64_t*)(shareAddrs[rank] + segmentCount + GetBlockIdx() * FLAG_UNIT_INT_NUM); + basicSyncAddr = (__gm__ int64_t*)(shareAddrs[rank]) + GetBlockIdx() * FLAG_UNIT_INT_NUM; + 
blockOuterSyncAddr = (__gm__ int64_t*)(shareAddrs[rank]) + segmentCount + GetBlockIdx() * FLAG_UNIT_INT_NUM; TPipe pipe; pipe.InitBuffer(tBuf, GetBlockNum() * SYNC_UNIT_SIZE); } @@ -51,6 +51,17 @@ public: SetFlag(localSyncAddr + eventID * FLAG_UNIT_INT_NUM, v); } + __aicore__ inline void SetSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank) + { + int64_t v = MergeMagicWithValue(magic, value); + SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, v); + } + + __aicore__ inline void CalEventIdByMulBlockNum(int32_t blockMultiplier, int32_t targetcoreID) + { + return (blockMultiplier * blockNum) + targetCoreId; + } + __aicore__ inline void WaitSyncFlag(int32_t magic, int32_t value, int32_t eventID, int32_t rank, int32_t breakCycle = 0) { @@ -60,12 +71,12 @@ public: __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID) { - int64_t v = MergeMagicWithValue(magic, eventID); + int64_t value = MergeMagicWithValue(magic, eventID); SetFlag(basicSyncAddr, value); } __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) { - int64_t v = MergeMagicWithValue(magic, eventID); + int64_t value = MergeMagicWithValue(magic, eventID); SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, value); } @@ -228,7 +239,7 @@ private: { GlobalTensor globalWait; globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); - LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAGUNIT_INT_NUM, 0); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAG_UNIT_INT_NUM, 0); bool isSync = true; do { if (breakCycle > 0) { @@ -262,7 +273,7 @@ private: { GlobalTensor globalWait; globalWait.SetGlobalBuffer(waitAddr, flagNum * FLAG_UNIT_INT_NUM); - LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAGUNIT_INT_NUM, 0); + LocalTensor localWait = tBuf.GetWithOffset(flagNum * FLAG_UNIT_INT_NUM, 0); DataCopy(localWait, globalWait, flagNum * FLAG_UNIT_INT_NUM); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -295,14 +306,4 @@ private: TBuf tBuf; }; -#endif // LCCL_SYNC_H - - - - - -} - - - - +#endif // LCCL_SYNC _H \ No newline at end of file -- Gitee From 1ec10ba5da26ec9cd1d17b02c8e9ab5a81118d10 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 09:00:06 +0800 Subject: [PATCH 114/414] 4 --- comm/lcal/src/ascendc_kernels/sync_collectives.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index 22a6ad9c..5fdd7838 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -57,7 +57,7 @@ public: SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, v); } - __aicore__ inline void CalEventIdByMulBlockNum(int32_t blockMultiplier, int32_t targetcoreID) + __aicore__ inline int32_t CalEventIdByMulBlockNum(int32_t blockMultiplier, int32_t targetCoreId) { return (blockMultiplier * blockNum) + targetCoreId; } @@ -77,25 +77,25 @@ public: __aicore__ inline void SetInnerFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) { int64_t value = MergeMagicWithValue(magic, eventID); - SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, value); + SetFlag((__gm__ int64_t*)(shareAddrs[setRank]) + setBlock * FLAG_UNIT_INT_NUM, value); } __aicore__ inline void WaitInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t waitBlock) { int64_t 
value = MergeMagicWithValue(magic, eventID); - WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + eventID * FLAG_UNIT_INT_NUM, 1, value); + WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + waitBlock * FLAG_UNIT_INT_NUM, 1, value); } - __aicore__ inline void WaitInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + __aicore__ inline void WaitRankFlag(int32_t magic, int32_t eventID, int64_t waitRank) { int64_t value = MergeMagicWithValue(magic, eventID); - WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); + WaitOneRankAllFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); } - __aicore__ inline void CheckRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) + __aicore__ inline bool CheckRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) { int64_t value = MergeMagicWithValue(magic, eventID); - WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); + return CheckOneRankAllFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); } __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID) -- Gitee From 445a886b24a297e91936532eec060a1503b29d6a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 09:04:31 +0800 Subject: [PATCH 115/414] 8 --- comm/lcal/src/ascendc_kernels/sync_collectives.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index 5fdd7838..2b294415 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -86,7 +86,7 @@ public: WaitOneRankPartFlag((__gm__ int64_t*)(shareAddrs[waitRank]) + waitBlock * FLAG_UNIT_INT_NUM, 1, value); } - __aicore__ inline void WaitRankFlag(int32_t magic, int32_t eventID, int64_t waitRank) + __aicore__ inline void WaitRankInnerFlag(int32_t magic, int32_t eventID, int64_t waitRank) { int64_t value = MergeMagicWithValue(magic, eventID); WaitOneRankAllFlag((__gm__ int64_t*)(shareAddrs[waitRank]), value); @@ -104,7 +104,7 @@ public: SetFlag(blockOuterSyncAddr, value); } - __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID) + __aicore__ inline void SetOuterFlag(int32_t magic, int32_t eventID, int64_t setRank, int64_t setBlock) { __gm__ int64_t *flagAddr = GetOuterFlagAddr(setRank, setBlock); int64_t value = MergeMagicWithValue(magic, eventID); @@ -125,13 +125,14 @@ public: flagAddr = GetOuterFlagAddr(rank, 0); WaitOneRankPartFlag(flagAddr, blockNum, value); } - __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID, int64_t rank) + __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, int64_t flagNum) { int64_t value = MergeMagicWithValue(magic, eventID); __gm__ int64_t *flagAddr; + int wairtRank; for (auto r = 0; r < rankSize; ++r) { waitRank = (rank + r) % rankSize; - flagAddr = GetOuterFlagAddr(waitRank, blockNum); + flagAddr = GetOuterFlagAddr(waitRank, startBlock); WaitOneRankPartFlag(flagAddr, flagNum, value); } } @@ -141,6 +142,7 @@ public: { int64_t value = MergeMagicWithValue(magic, eventID); __gm__ int64_t *flagAddr; + int waitRank; for (auto r = 0; r < rankSize; ++r) { waitRank = (rank + r) % rankSize; flagAddr = GetOuterFlagAddr(waitRank, startBlock); @@ -156,7 +158,7 @@ public: WaitAllRankPartOuterFlag(magic, eventID, 0, blockNum); } - __aicore__ inline void CheckAllRankOuterFlag(int32_t magic, int32_t eventID) + __aicore__ inline bool CheckAllRankOuterFlag(int32_t magic, 
int32_t eventID) { return CheckAllRankPartOuterFlag(magic, eventID, 0, blockNum); } @@ -221,7 +223,7 @@ public: private: __aicore__ inline int64_t MergeMagicWithValue(int32_t magic, int32_t value) { - return (static_cast(magic) << 32) | MAGIC_OFFSET | static_casat(value); + return (static_cast(magic) | MAGIC_OFFSET) | static_cast(value); } __aicore__ inline __gm__ int64_t* GetInnerFlagAddr(int64_t flagRank, int64_t flagBlock) @@ -231,7 +233,7 @@ private: __aicore__ inline __gm__ int64_t* GetOuterFlagAddr(int64_t flagRank, int64_t flagBlock) { - return (__gm__ int64_t*)(shareAddrs[flagRank]) + segment + flagBlock * FLAG_UNIT_INT_NUM; + return (__gm__ int64_t*)(shareAddrs[flagRank]) + segmentCount + flagBlock * FLAG_UNIT_INT_NUM; } __aicore__ inline void WaitOneRankPartFlag(__gm__ int64_t* waitAddr, int64_t flagNum, int64_t checkValue, -- Gitee From 238ee5385d8c3bcd4eadb0198696d945d7c9f23b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 09:05:25 +0800 Subject: [PATCH 116/414] 4 --- comm/lcal/src/ascendc_kernels/sync_collectives.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index 2b294415..ac8a1558 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -125,11 +125,11 @@ public: flagAddr = GetOuterFlagAddr(rank, 0); WaitOneRankPartFlag(flagAddr, blockNum, value); } - __aicore__ inline void WaitAllRankOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, int64_t flagNum) + __aicore__ inline void WaitAllRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, int64_t flagNum) { int64_t value = MergeMagicWithValue(magic, eventID); __gm__ int64_t *flagAddr; - int wairtRank; + int waitRank; for (auto r = 0; r < rankSize; ++r) { waitRank = (rank + r) % rankSize; flagAddr = GetOuterFlagAddr(waitRank, startBlock); @@ -223,7 +223,7 @@ public: private: __aicore__ inline int64_t MergeMagicWithValue(int32_t magic, int32_t value) { - return (static_cast(magic) | MAGIC_OFFSET) | static_cast(value); + return (static_cast(magic) << MAGIC_OFFSET) | static_cast(value); } __aicore__ inline __gm__ int64_t* GetInnerFlagAddr(int64_t flagRank, int64_t flagBlock) -- Gitee From 431c35c868e3bbe8a1cf6ef8992ff7b811f1e823 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 10:27:12 +0800 Subject: [PATCH 117/414] add class DequantPadder --- src/kernels/coc_preprocessor.cce | 273 +++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index d99969c1..141a50b1 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -247,3 +247,276 @@ protected: LcalWorkspaceInfo workspace_info; }; +template +class Padder : public BasePadder { +public: + __aicore__ explicit Padder() = default; + + inline __aicore__ void Run(int32_t expert_per_rank = 1) + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_b ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitEvent(EVENT_ID1); + + if (this->aligned_b) { + int n_rows = this->trans_b ? this->n : this->k; + int n_cols = this->trans_b ? this->k : this->n; + int n_cols_aligned = this->trans_b ? 
this->k_align : this->n_align; + + this->PadMatrix(this->gm_b_align, this->gm_b, n_rows * expert_per_rank, n_cols, n_cols_aligned); + } + + this->Barrier(); + } +}; + +class FormatOffset { +public: + static constexpr int32_t max_len = 49152; + static inline __aicore__ void Loop(__gm__ int32_t *dst, int32_t offset, int32_t len) + { + static const ub_offset = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); + + int32_t repeat_num = Block256B::Count(len); + int32_t loop_num = DicCeil(repeat_num, repeat_num); + uint8_t repeat_this_loop = static_cast(repeat); + for (int32_t loop_idx = 0; loop_idx < loop_num; ++loop_idx) { + if (loop_idx == loop_num - 1) { + repeat_this_loop = repeat_num - loop_idx * repeat; + } + VectorDup(ub_offset + loop_idx * repeat * Block256B::size, offset, repeat_this_loop, 1, 8); + } + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + CopyUbufToGmAlign(dst, ub_offset, 1, len, 0); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + +private: + static constexpr uint8_t repeat = 255; +}; + + +template<> +class Padder : public BasePadder { +public: + __aicore__ explicit Padder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_offset = nullptr, + QuantGranularity dequant_granularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); + + if (gm_dequant_offset != nullptr && dequant_granularity == QuantGranularity::PER_TENSOR) { + offset = *reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset); + gm_format_dequant_offset = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_dequant_offset); + need_format_dequant_offset = true; + } + } + + inline __aicore__ void Run(int32_t expert_per_rank = 1) + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + + if (this->aligned_b) { + int n_rows = this->trans_b ? this->n : this->k; + int n_cols = this->trans_b ? this->k : this->n; + int n_cols_aligned = this->trans_b ? 
this->k_align : this->n_align; + + this->PadMatrix(this->gm_b_align, this->gm_b, n_rows * expert_per_rank, n_cols, n_cols_aligned); + } + + if (need_format_dequant_offset) { + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID1); + FormatOffset(); + } + + this->Barrier(); + } + +private: + inline __aicore__ void FormatOffset() + { + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); + int32_t align_core_num = get_block_num() * get_subblockdim(); + + int32_t len = FormatOffset::max_len; + int32_t loop_num = DivCeil(n, len); + for (int32_t i = align_core_idx; i < loop_num; i += align_core_num) { + int32_t n_complete = i * len; + if (n_complete + len > n) { + len = n - n_complete; + } + FormatOffset::Loop(gm_format_dequant_offset + n_complete, offset, len); + } + } + + __gm__ int32_t *gm_format_dequant_offset; + int32_t offset; + bool need_format_dequant_offset{ false }; +}; + + +template +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_offset, __gm__ uint8_t *gm_dequant_offset) + {} + inline __aicore__ void Run() {} +}; + + +template <> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); + + scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale); + if (gm_dequant_offset) { + offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + has_offset = true; + } + } + + inline __aicore__ void Run() + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + + int n_rows = this->trans_b ? this->n : this->k; + int n_cols = this->trans_b ? this->k : this->n; + int n_cols_aligned = this->trans_b ? this->k_align : this->n_align; + + DequantAndPadMatrix(this->gm_b_align, this->gm_b, n_rows, n_cols, n_cols_aligned); + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrix(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = Block256B::AlignDown(MAX_UB_BUFF / (sizeof(int8_t) + sizeof(half))); + int32_t n_cols_aligned = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_vconv = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_muls = reinterpret_cast<__ubuf__ half *>((uintptr_t)(MAX_LEN * sizeof(int8_t))); + + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitBatchLoop(max_cols_per_loop); !it.EndBatchLoop(); it.NextBatchLoop()) { + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + CopyGmToUbufAlign(ub_vconv, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + + int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * + (sizeof(half) / sizeof(int8_t)); + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; + int32_t repeat_times = DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + uint8_t repeat = REPEAT_PER_LOOP; + for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete += repeat) { + if (n_repeat_complete + repeat > repeat_times) { + repeat = repeat_times - n_repeat_complete; + } + Vconv(ub_muls + n_repeat_complete * Block256B::size, + ub_vconv + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 4); + } + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + if (has_offset) { + PipeBarrier(); + + repeat = REPEAT_PER_LOOP; + for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; + n_repeat_complete += repeat) { + if (n_repeat_complete + repeat > repeat_times) { + repeat = repeat_times - n_repeat_complete; + } + Vadds(ub_muls + n_repeat_complete * Block256B::size, + ub_muls + n_repeat_complete * Block256B::size, offset, 1, 1, 8, 8); + } + } + + PipeBarrier(); + + repeat = REPEAT_PER_LOOP; + for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete + repeat) { + if (n_repeat_complete + repeat > repeat_times) { + repeat = repeat_times - n_repeat_complete; + } + Vmuls(ub_muls + n_repeat_complete * Block256B::size, + ub_muls + n_repeat_complete * Block256B::size, scale, 1, 1, 8, 8); + } + + int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); + + SetFlag(EVENT_ID0); + WaitEvent(EVENT_ID0); + + CopyUbufToGmAlign(dst, ub_muls, it.n_rows_this_loop, dst_gap, ubuf_gap); + + SetFlag(EVENT_ID0); + WaitEvent(EVENT_ID0); + } + } + } + } + + half scale; + half offset; + bool has_offset{ false }; +}; + -- Gitee From e9ef398de3e474ee5014e178d417b394197c4413 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 10:37:52 +0800 Subject: [PATCH 118/414] fix some error --- src/kernels/coc_preprocessor.cce | 55 ++++++++++++++++---------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 141a50b1..1860d00d 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -37,7 +37,7 @@ public: inline __aicore__ void InitBatchLoop() { - batch idx = 0; + batch_idx = 0; src_batch_offset = 0; dst_batch_offset = 0; @@ -45,7 +45,7 @@ public: inline __aicore__ bool EndBatchLoop() const { - return batch_idx = batch_size; + return batch_idx == batch_size; } inline __aicore__ void NextBatchLoop() @@ -88,7 +88,8 @@ public: dst_row_loop_offset = n_rows_complete * n_cols_aligned; } - inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) { + inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) + { 
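// [Editor's note, annotation rather than a diff line: InitColLoop resets the
// column-tiling cursor for one row band. Callers size max_cols_per_loop from
// unified-buffer capacity: when the 32B-rounded row fits (n_cols_round <=
// MAX_LEN) the whole row is a single column chunk; otherwise the row is
// walked MAX_LEN elements per pass while n_cols_complete tracks progress.]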
this->max_cols_per_loop = max_cols_per_loop; n_cols_complete = 0; col_loop_offset = 0; @@ -145,7 +146,7 @@ public: int32_t n_cols_this_loop; int64_t src_core_offset; - int64_t src_core_offset; + int64_t dst_core_offset; int64_t src_batch_offset; int64_t dst_batch_offset; int64_t src_row_loop_offset; @@ -155,7 +156,7 @@ public: __aicore__ explicit BasePadder() = default; - inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b) { this->gm_a = reinterpret_cast<__gm__ LhsDtype *>(gm_a); @@ -207,8 +208,8 @@ protected: CopyUbufToGmAlign(dst, ub_base, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap); - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); } } } @@ -219,7 +220,7 @@ protected: FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID); WaitEvent(AIV_FINISH_ALIGN_FLAG_ID); - FFTSCrossCoreSync(2, AIC_WAIT_AIV_FINISH_FLAG_ID); + FFTSCrossCoreSync(2, AIC_WAIT_AIV_FINISH_ALIGN_FLAG_ID); PipeBarrier(); } @@ -256,14 +257,14 @@ public: { if (this->aligned_a) { int n_rows = this->trans_a ? this->k : this->m; - int n_cols = this->trans_b ? this->m : this->k; + int n_cols = this->trans_a ? this->m : this->k; int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); } SetFlag(EVENT_ID1); - WaitEvent(EVENT_ID1); + WaitFLag(EVENT_ID1); if (this->aligned_b) { int n_rows = this->trans_b ? this->n : this->k; @@ -282,10 +283,10 @@ public: static constexpr int32_t max_len = 49152; static inline __aicore__ void Loop(__gm__ int32_t *dst, int32_t offset, int32_t len) { - static const ub_offset = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); + static const auto ub_offset = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); int32_t repeat_num = Block256B::Count(len); - int32_t loop_num = DicCeil(repeat_num, repeat_num); + int32_t loop_num = DivCeil(repeat_num, repeat_num); uint8_t repeat_this_loop = static_cast(repeat); for (int32_t loop_idx = 0; loop_idx < loop_num; ++loop_idx) { if (loop_idx == loop_num - 1) { @@ -321,7 +322,7 @@ public: if (gm_dequant_offset != nullptr && dequant_granularity == QuantGranularity::PER_TENSOR) { offset = *reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset); - gm_format_dequant_offset = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_dequant_offset); + gm_format_dequant_offset = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_dequant_param); need_format_dequant_offset = true; } } @@ -336,8 +337,8 @@ public: this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); } - SetFlag(EVENT_ID1); - WaitFlag(EVENT_ID1); + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); if (this->aligned_b) { int n_rows = this->trans_b ? 
this->n : this->k; @@ -349,7 +350,7 @@ public: if (need_format_dequant_offset) { SetFlag(EVENT_ID1); - SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); FormatOffset(); } @@ -385,7 +386,7 @@ public: __aicore__ explicit DequantPadder() = default; inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, - __gm__ uint8_t *gm_dequant_offset, __gm__ uint8_t *gm_dequant_offset) + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) {} inline __aicore__ void Run() {} }; @@ -439,7 +440,7 @@ private: LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); const int32_t MAX_LEN = Block256B::AlignDown(MAX_UB_BUFF / (sizeof(int8_t) + sizeof(half))); - int32_t n_cols_aligned = Block32B::AlignUp(n_cols); + int32_t n_cols_round = Block32B::AlignUp(n_cols); int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; @@ -447,10 +448,10 @@ private: auto ub_muls = reinterpret_cast<__ubuf__ half *>((uintptr_t)(MAX_LEN * sizeof(int8_t))); for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { - for (it.InitBatchLoop(max_cols_per_loop); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; - for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextColLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); @@ -470,7 +471,7 @@ private: repeat = repeat_times - n_repeat_complete; } Vconv(ub_muls + n_repeat_complete * Block256B::size, - ub_vconv + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 4); + ub_vconv + n_repeat_complete * Block256B::size, offset, 1, 1, 8, 4); } SetFlag(EVENT_ID0); @@ -493,23 +494,23 @@ private: PipeBarrier(); repeat = REPEAT_PER_LOOP; - for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete + repeat) { + for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete += repeat) { if (n_repeat_complete + repeat > repeat_times) { repeat = repeat_times - n_repeat_complete; } Vmuls(ub_muls + n_repeat_complete * Block256B::size, - ub_muls + n_repeat_complete * Block256B::size, scale, 1, 1, 8, 8); + ub_muls + n_repeat_complete * Block256B::size, scale, repeat, 1, 1, 8, 8); } int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); SetFlag(EVENT_ID0); - WaitEvent(EVENT_ID0); + WaitFLag(EVENT_ID0); - CopyUbufToGmAlign(dst, ub_muls, it.n_rows_this_loop, dst_gap, ubuf_gap); + CopyUbufToGmAlign(dst, ub_muls, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); - SetFlag(EVENT_ID0); - WaitEvent(EVENT_ID0); + SetFlag(EVENT_ID0); + WaitFLag(EVENT_ID0); } } } -- Gitee From 9a428998867d397c6e0510a5dfb23ded09741ca2 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 10:41:05 +0800 Subject: [PATCH 119/414] fix some error again --- src/kernels/coc_preprocessor.cce | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 1860d00d..efe040f1 100644 --- a/src/kernels/coc_preprocessor.cce +++ 
b/src/kernels/coc_preprocessor.cce @@ -286,7 +286,7 @@ public: static const auto ub_offset = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); int32_t repeat_num = Block256B::Count(len); - int32_t loop_num = DivCeil(repeat_num, repeat_num); + int32_t loop_num = DivCeil(repeat_num, repeat); uint8_t repeat_this_loop = static_cast(repeat); for (int32_t loop_idx = 0; loop_idx < loop_num; ++loop_idx) { if (loop_idx == loop_num - 1) { @@ -471,7 +471,7 @@ private: repeat = repeat_times - n_repeat_complete; } Vconv(ub_muls + n_repeat_complete * Block256B::size, - ub_vconv + n_repeat_complete * Block256B::size, offset, 1, 1, 8, 4); + ub_vconv + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 4); } SetFlag(EVENT_ID0); @@ -487,7 +487,7 @@ private: repeat = repeat_times - n_repeat_complete; } Vadds(ub_muls + n_repeat_complete * Block256B::size, - ub_muls + n_repeat_complete * Block256B::size, offset, 1, 1, 8, 8); + ub_muls + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 8); } } -- Gitee From 26109df906f479d6e2e051f30e7f4c6edcdb6d8f Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 10:42:33 +0800 Subject: [PATCH 120/414] fix some error again 2 --- src/kernels/coc_preprocessor.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index efe040f1..ffc01609 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -487,7 +487,7 @@ private: repeat = repeat_times - n_repeat_complete; } Vadds(ub_muls + n_repeat_complete * Block256B::size, - ub_muls + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 8); + ub_muls + n_repeat_complete * Block256B::size, offset, repeat, 1, 1, 8, 8); } } -- Gitee From 2ad015f527250c344688639d5c1d76798ec3eb7d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 11:42:53 +0800 Subject: [PATCH 121/414] 3 --- .../src/ascendc_kernels/allreduce_quant.h | 191 +++++++++++++++++- 1 file changed, 190 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index 9a893c3a..ed8a5a22 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -6,4 +6,193 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ +#ifndef LCCL_ALLREDUCE_QUANT_H +#define LCCL_ALLREDUCE_QUANT_H +#include "collectives.h" +using namespace AscendC; + +class AllReduceQuant : public Collectives { + constexpr static int32_t UB_HEAD_OFFSET = 96; + constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; +public: + FORCE_INLINE_AICORE AllReuceQuant(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + + template + FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GLobalTensor& inputGT, + const uint32_t calCount, int op, T scale, T offset) + { + DataCopyGM2GM cpKernel; + cpKernel.Init(outputGT, inputGT, calCount, op); + cpKernel.Process(scale, offset); + } + + template + FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GLobalTensor& inputGT, + const uint32_t calCount, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) + { + DataCopyGM2GM cpKernel; + cpKernel.Init(outputGT, inputGT, calCount, op); + cpKernel.Process(scaleGT, scaleCount, offset); + } + + template + FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, + const GlobalTensor& outputGT, int op, T scale, T offset) + { + constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; + constexpr int32_t ubAlignNum = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; + constexpr int32_t inputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(U); + constexpr int32_t outputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(T); + __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); + __gm__ T *output = const_cast<__gm__ U *>(outputGT.GetPhyAddr()); + __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET), (__ubuf__ U*)(UB_MID_OFFSET)}; + __ubuf__ U* outputUB[2] = {(__ubuf__ U*)(inputUB[0] + inputUbBlockSize / sizeof(U), + (__ubuf__ T*)(inputUB[1] + inputUbBlockSize / sizeof(U))}; + __ub__ T* targetOutputUB = nullptr; + int inputOffsetNum = 0; + int outputOffsetNum = 0; + + SetAtomic(op); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = dataSizeRemain > outputUbBlockSize ? outputUbBlockSize : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + targetOutpuUB = (i & 1) ? outputUB[0] : outputUB[1]; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + inputOffsetNum, size / sizeof(T) * sizeof(U)); + SetWaitEvent(eventId); + CastImpl(targetOutputUB, (i & 1) ? 
inputUB[0] : inputUB[1], RoundMode::CAST_NONE, size / sizeof(T)); + PipeBarrier(); + AddsImpl(targetOutputUB, targetOutputUB, offset, size / sizeof(T)); + PipeBarrier(); + MulsImpl(targetOutputUB, targetOutputUB, scale, size / sizeof(T)); + SetWaitEvenet(eventId); + SetWaitEvenet(eventId); + CpUB2GM(output + outputOffsetNum, targetOutputUB, size); + AscendC::SetFlag(eventId); + + dataSizeRemain -= size; + inputOffsetNum += size / sizeof(T); + outputOffsetNum += size / sizeof(T); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + + SetWaitEvent(EVENT_ID3); + UnsetAtomic(op); + return; + } + + template + FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, + const GlobalTensor& outputGT, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) + { + constexpr int32_t mulVal = 2; + cosntexpr int64_t ubSplitSize = (sizeof(T) + sizeof(U) + sizeof(T)) * mulVal; + constexpr int64_t ubAlignNum = UB_SINGLE_DMA_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; + const int64_t batchDataNum = (scaleCount + ubAlignNUm - 1) / ubAlignNum; + + __ubuf__ T* scaleUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET), (__ubuf__ T*)(UB_MID_OFFSET)}; + __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), + (__ubuf__ U*)(UB_MID_OFFSET + ubAlignNum * sizeof(T))}; + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), + (__ubuf__ T*)(UB_MID_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U)))}; + __ubuf__ T* targetOutputUB = nullptr; + int64_t i = 0; + int32_t curDataNum = 0; + int32_t processedNum = 0; + + SetAtomic(op); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + while (dataSizeRemain > 0) { + if (i % batchDataNum == batchDataNum - 1) { + curDataNum = scaleCount - i % batchDataNum * ubAlignNum; + } else { + curDataNum = ubAlignNum; + } + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + targetOutputUB = (i & 1) ? outputUB[0] : outputUB[1]; + + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + procxessedNum, curDataNum * sizeof(U)); + SetWaitEvent(eventId); + CpGM2UB((i & 1) ? scaleUB[0] : scaleUB[1], scale + i % batchDataNUm * ubAlignNum, curDataNum * sizeof(T)); + CastImpl(targetOutputUB, (i & 1) ? inputUB[0] : inputUB[1], RoundMode::CAST_NONE, curDataNum); + SetWaitEvent(eventId); + AddsImpl(targetOutputUB, targetOutputUB, offset, curDataNum); + PipeBarrier(); + MulImpl(targetOutputUB, targetOutputUB, (i & 1) ? 
scaleUB[0] : scaleUB[1], curDataNum); + SetWaitEvenet(eventId); + CpUB2GM(output + processedNum, targetOutputUB, curDataNum * sizeof(T)); + AscendC::SetFlag(eventId); + + dataSizeRemain -= curDataNum * sizeof(T); + processedNum += curDataNum; + ++i; + } + + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + SetWaitEvent(EVENT_ID3); + UnsetAtomic(op); + return; + } + + template + FORCE_INLINE_AICORE void CpGM2GMPingPongForSmallScale(int64_t dataSizeRemain, __gm__ U *input, + __gm__ T *output, int op, __gm__ T *scale, int64_t scaleCount, T offset) + { + constexpr int32_t ubSplitSize = sizeof(T) + sizeof(U) + sizeof(T) + sizeof(U) + sizeof(T); + constexpr int64_t ubAlignNum = UB_SINGLE_DMA_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; + const int64_t batchDataNum = ubAlignNum / scaleCount * scaleCount; + const int64_t ubMidOffset = ubAlignNum * (sizeof(T) + sizeof(U) + sizeof(T)) + UB_HEAD_OFFSET + ALIGN_SIZE; + + __ubuf__ T* scaleUB = (__ubuf__ T*)(UB_HEAD_OFFSET); + __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), (__ubuf__ U*)(ubMidOffset)}; + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), + (__ubuf__ T*)(ubMidOffset + ubAlignNum * sizeof(U))}; + __ubuf__ T* targetOutputUB = nullptr; + SetAtomic(op); + CpGM2UB(scaleUB, scale, scaleCount * sizeof(T)); + SetWaitEvent(EVENT_ID1); + int64_t repeatTimes = batchDataNum / scaleCount; + int64_t mulVal = 2; + for (int64_t i = 1; i < repeatTimes; i *= mulVal) { + PipeBarrier(); + CopyUB2UB(scaleUB + i * scaleCount, scaleUB, (repeatTimes > i * mulVal ? i : repeatTimes - i) * scaleCount); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemian > 0; i++) { + uint32_t size = dataSizeRemain > batchDataNum * sizeof(T) ? batchDataNum * sizeof(T) : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + targetOutputUB = (i & 1) ? outputUB[0] : outputUB[1]; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + i * batchDataNum, size / sizeof(T) * sizeof(U)); + SetWaitEvent(eventId); + CastImpl(targetOutputUB, (i & 1) ? inputUB[0] : inputUB[1], RoundMode::CAST_NONE, size / sizeof(T)); + SetWaitEvent(eventId); + AddsImpl(targetOutputUB, targetOutputUB, offset, size / sizeof(T)); + PipeBarrier(); + MulImpl(targetOutputUB, targetOutputUB, (i & 1) ? 
scaleUB[0] : scaleUB[1], size / sizeof(T)); + SetWaitEvent(eventId); + CpUB2GM(output + i * batchDataNum, targetOutputUB, size); + AscendC::SetFlag(eventId); + dataSizeRemain -= size; + processedNum += (size / sizeof(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + SetWaitEvent(EVENT_ID3); + UnsetAtomic(op); + return; + } +}; + +#endif // LCCL_ALLREDUCE_QUANT_H \ No newline at end of file -- Gitee From f8d8baa1729ead25afd36c1d58e45255786e2d9a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 11:46:33 +0800 Subject: [PATCH 122/414] 4 --- .../src/ascendc_kernels/allreduce_quant.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index ed8a5a22..5dd43049 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -12,15 +12,15 @@ #include "collectives.h" using namespace AscendC; -class AllReduceQuant : public Collectives { +class AllReduceQuant : protected Collectives { constexpr static int32_t UB_HEAD_OFFSET = 96; constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; public: - FORCE_INLINE_AICORE AllReuceQuant(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllReudeQuant(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} template - FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GLobalTensor& inputGT, + FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GlobalTensor& inputGT, const uint32_t calCount, int op, T scale, T offset) { DataCopyGM2GM cpKernel; @@ -29,7 +29,7 @@ public: } template - FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GLobalTensor& inputGT, + FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GlobalTensor& inputGT, const uint32_t calCount, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { DataCopyGM2GM cpKernel; @@ -46,11 +46,11 @@ public: constexpr int32_t inputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(U); constexpr int32_t outputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(T); __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); - __gm__ T *output = const_cast<__gm__ U *>(outputGT.GetPhyAddr()); + __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET), (__ubuf__ U*)(UB_MID_OFFSET)}; - __ubuf__ U* outputUB[2] = {(__ubuf__ U*)(inputUB[0] + inputUbBlockSize / sizeof(U), + __ubuf__ U* outputUB[2] = {(__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)), (__ubuf__ T*)(inputUB[1] + inputUbBlockSize / sizeof(U))}; - __ub__ T* targetOutputUB = nullptr; + __ubuf__ T* targetOutputUB = nullptr; int inputOffsetNum = 0; int outputOffsetNum = 0; @@ -61,8 +61,8 @@ public: for (int64_t i = 0; dataSizeRemain > 0; i++) { uint32_t size = dataSizeRemain > outputUbBlockSize ? outputUbBlockSize : dataSizeRemain; event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; - targetOutpuUB = (i & 1) ? outputUB[0] : outputUB[1]; - AscendC::WaitFlag(eventId); + targetOutputUB = (i & 1) ? outputUB[0] : outputUB[1]; + AscendC::WaitFlag(eventId); CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + inputOffsetNum, size / sizeof(T) * sizeof(U)); SetWaitEvent(eventId); CastImpl(targetOutputUB, (i & 1) ? 
inputUB[0] : inputUB[1], RoundMode::CAST_NONE, size / sizeof(T)); @@ -70,8 +70,8 @@ public: AddsImpl(targetOutputUB, targetOutputUB, offset, size / sizeof(T)); PipeBarrier(); MulsImpl(targetOutputUB, targetOutputUB, scale, size / sizeof(T)); - SetWaitEvenet(eventId); - SetWaitEvenet(eventId); + SetWaitEvent(eventId); + SetWaitEvent(eventId); CpUB2GM(output + outputOffsetNum, targetOutputUB, size); AscendC::SetFlag(eventId); @@ -128,9 +128,9 @@ public: AddsImpl(targetOutputUB, targetOutputUB, offset, curDataNum); PipeBarrier(); MulImpl(targetOutputUB, targetOutputUB, (i & 1) ? scaleUB[0] : scaleUB[1], curDataNum); - SetWaitEvenet(eventId); + SetWaitEvent(eventId); CpUB2GM(output + processedNum, targetOutputUB, curDataNum * sizeof(T)); - AscendC::SetFlag(eventId); + AscendC::SetFlag(eventId); dataSizeRemain -= curDataNum * sizeof(T); processedNum += curDataNum; -- Gitee From dd821f022e9ee456af8a028c8fca74b6c903c266 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 13:25:02 +0800 Subject: [PATCH 123/414] 3 --- .../src/ascendc_kernels/allreduce_quant.h | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index 5dd43049..7c1767bb 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -16,7 +16,7 @@ class AllReduceQuant : protected Collectives { constexpr static int32_t UB_HEAD_OFFSET = 96; constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; public: - FORCE_INLINE_AICORE AllReudeQuant(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllReducQuant(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} template @@ -48,7 +48,7 @@ public: __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET), (__ubuf__ U*)(UB_MID_OFFSET)}; - __ubuf__ U* outputUB[2] = {(__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)), + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)), (__ubuf__ T*)(inputUB[1] + inputUbBlockSize / sizeof(U))}; __ubuf__ T* targetOutputUB = nullptr; int inputOffsetNum = 0; @@ -76,8 +76,8 @@ public: AscendC::SetFlag(eventId); dataSizeRemain -= size; - inputOffsetNum += size / sizeof(T); - outputOffsetNum += size / sizeof(T); + inputOffsetNum += (size / sizeof(T)); + outputOffsetNum += (size / sizeof(T)); } AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); @@ -91,10 +91,27 @@ public: FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, const GlobalTensor& outputGT, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { - constexpr int32_t mulVal = 2; - cosntexpr int64_t ubSplitSize = (sizeof(T) + sizeof(U) + sizeof(T)) * mulVal; + constexpr int32_t ubSplitSize = sizeof(T) + sizeof(U) + sizeof(T) + sizeof(U) + sizeof(T); + constexpr int64_t ubAlignNum = UB_SINGLE_PING_PONG_ADD_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; + __gm__ T *scale = const_cast<__gm__ T *>(scaleGT.GetPhyAddr()); + __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); + __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); + if (scaleCount > ubAlignNum) { + CpGM2GMPingPongForBigScale(dataSizeRemain, inputGT, outputGT, op, scaleGT, scaleCount, offset); + } else { + 
CpGM2GMPingPongForSmallScale(dataSizeRemain, inputGT, outputGT, op, scaleGT, scaleCount, offset); + } + return; + } + + template + FORCE_INLINE_AICORE void CpGM2GMPingPongForBigScale(int64_t dataSizeRemain, const GlobalTensor& inputGT, + const GlobalTensor& outputGT, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) + { + constexpr int64_t mulVal = 2; + constexpr int64_t ubSplitSize = (sizeof(T) + sizeof(U) + sizeof(T)) * mulVal; constexpr int64_t ubAlignNum = UB_SINGLE_DMA_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; - const int64_t batchDataNum = (scaleCount + ubAlignNUm - 1) / ubAlignNum; + const int64_t batchDataNum = (scaleCount + ubAlignNum - 1) / ubAlignNum; __ubuf__ T* scaleUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET), (__ubuf__ T*)(UB_MID_OFFSET)}; __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), @@ -120,9 +137,9 @@ public: targetOutputUB = (i & 1) ? outputUB[0] : outputUB[1]; AscendC::WaitFlag(eventId); - CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + procxessedNum, curDataNum * sizeof(U)); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + processedNum, curDataNum * sizeof(U)); SetWaitEvent(eventId); - CpGM2UB((i & 1) ? scaleUB[0] : scaleUB[1], scale + i % batchDataNUm * ubAlignNum, curDataNum * sizeof(T)); + CpGM2UB((i & 1) ? scaleUB[0] : scaleUB[1], scale + i % batchDataNum * ubAlignNum, curDataNum * sizeof(T)); CastImpl(targetOutputUB, (i & 1) ? inputUB[0] : inputUB[1], RoundMode::CAST_NONE, curDataNum); SetWaitEvent(eventId); AddsImpl(targetOutputUB, targetOutputUB, offset, curDataNum); @@ -140,7 +157,7 @@ public: AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); SetWaitEvent(EVENT_ID3); - UnsetAtomic(op); + UnsetAtomic(op); return; } @@ -169,20 +186,20 @@ public: } AscendC::SetFlag(EVENT_ID0); AscendC::SetFlag(EVENT_ID1); - for (int64_t i = 0; dataSizeRemian > 0; i++) { + for (int64_t i = 0; dataSizeRemain > 0; i++) { uint32_t size = dataSizeRemain > batchDataNum * sizeof(T) ? batchDataNum * sizeof(T) : dataSizeRemain; event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; targetOutputUB = (i & 1) ? outputUB[0] : outputUB[1]; AscendC::WaitFlag(eventId); - CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + i * batchDataNum, size / sizeof(T) * sizeof(U)); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], input + processedNum, size / sizeof(T) * sizeof(U)); SetWaitEvent(eventId); CastImpl(targetOutputUB, (i & 1) ? inputUB[0] : inputUB[1], RoundMode::CAST_NONE, size / sizeof(T)); - SetWaitEvent(eventId); + PipeBarrier(); AddsImpl(targetOutputUB, targetOutputUB, offset, size / sizeof(T)); PipeBarrier(); - MulImpl(targetOutputUB, targetOutputUB, (i & 1) ? 
scaleUB[0] : scaleUB[1], size / sizeof(T)); + MulImpl(targetOutputUB, targetOutputUB, scaleUB, size / sizeof(T)); SetWaitEvent(eventId); - CpUB2GM(output + i * batchDataNum, targetOutputUB, size); + CpUB2GM(output + processedNum, targetOutputUB, size); AscendC::SetFlag(eventId); dataSizeRemain -= size; processedNum += (size / sizeof(T)); @@ -190,7 +207,7 @@ public: AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); SetWaitEvent(EVENT_ID3); - UnsetAtomic(op); + UnsetAtomic(op); return; } }; -- Gitee From b20e9c14986bca3f7949a90c9ac950ac32b0d335 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 13:28:09 +0800 Subject: [PATCH 124/414] 4 --- comm/lcal/src/ascendc_kernels/allreduce_quant.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index 7c1767bb..6c0b1526 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -16,7 +16,7 @@ class AllReduceQuant : protected Collectives { constexpr static int32_t UB_HEAD_OFFSET = 96; constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; public: - FORCE_INLINE_AICORE AllReducQuant(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllReduceQuant(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} template @@ -92,21 +92,22 @@ public: const GlobalTensor& outputGT, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { constexpr int32_t ubSplitSize = sizeof(T) + sizeof(U) + sizeof(T) + sizeof(U) + sizeof(T); - constexpr int64_t ubAlignNum = UB_SINGLE_PING_PONG_ADD_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; + constexpr int64_t ubAlignNum = UB_SINGLE_DMA_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE; __gm__ T *scale = const_cast<__gm__ T *>(scaleGT.GetPhyAddr()); __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); if (scaleCount > ubAlignNum) { - CpGM2GMPingPongForBigScale(dataSizeRemain, inputGT, outputGT, op, scaleGT, scaleCount, offset); + CpGM2GMPingPongForBigScale(dataSizeRemain, input, output, op, scale, scaleCount, offset); } else { - CpGM2GMPingPongForSmallScale(dataSizeRemain, inputGT, outputGT, op, scaleGT, scaleCount, offset); + CpGM2GMPingPongForSmallScale(dataSizeRemain, input, outputG, op, scaleG, scaleCount, offset); } return; } +protected: template - FORCE_INLINE_AICORE void CpGM2GMPingPongForBigScale(int64_t dataSizeRemain, const GlobalTensor& inputGT, - const GlobalTensor& outputGT, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) + FORCE_INLINE_AICORE void CpGM2GMPingPongForBigScale(int64_t dataSizeRemain, __gm__ U *input, + __gm__ T *output, int op, __gm__ T *scale, int64_t scaleCount, T offset) { constexpr int64_t mulVal = 2; constexpr int64_t ubSplitSize = (sizeof(T) + sizeof(U) + sizeof(T)) * mulVal; @@ -175,6 +176,7 @@ public: __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), (__ubuf__ T*)(ubMidOffset + ubAlignNum * sizeof(U))}; __ubuf__ T* targetOutputUB = nullptr; + int64_t processedNum = 0; SetAtomic(op); CpGM2UB(scaleUB, scale, scaleCount * sizeof(T)); SetWaitEvent(EVENT_ID1); -- Gitee From 8d22f14c58da03fb1e3409c2f99b6eb01e488c84 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 23 Aug 2025 13:28:28 +0800 Subject: [PATCH 125/414] 1 --- 
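[Editor's note, placed in the patch-notes area that `git am` ignores: the
one-line hunk below completes the pointer-based refactor from PATCH 124/414 by
passing the raw GM pointers `output` and `scale` to the small-scale path. For
orientation, a minimal sketch of the dispatch CpGM2GMPingPong performs once
this fix lands; the constants and callee names are the header's own, but the
free-standing wrapper itself is hypothetical:

    // Sketch: routing a per-channel dequantizing GM-to-GM copy by scale size.
    // Assumes UB_SINGLE_DMA_SIZE_MAX and ALIGN_SIZE as defined in the kernel headers.
    template <typename T, typename U>
    FORCE_INLINE_AICORE void DispatchDequantCopy(int64_t dataSizeRemain, __gm__ U *input,
        __gm__ T *output, int op, __gm__ T *scale, int64_t scaleCount, T offset)
    {
        // Per element slot the unified buffer hosts five tenants: a resident
        // scale (T) plus double-buffered input (U) and output (T) tiles.
        constexpr int32_t ubSplitSize = sizeof(T) + sizeof(U) + sizeof(T) + sizeof(U) + sizeof(T);
        constexpr int64_t ubAlignNum = UB_SINGLE_DMA_SIZE_MAX / ubSplitSize / ALIGN_SIZE * ALIGN_SIZE;
        if (scaleCount > ubAlignNum) {
            // Scale vector cannot stay resident: stream scale tiles alongside the data.
            CpGM2GMPingPongForBigScale(dataSizeRemain, input, output, op, scale, scaleCount, offset);
        } else {
            // Scale fits: load it once, replicate it across the batch, reuse it every loop.
            CpGM2GMPingPongForSmallScale(dataSizeRemain, input, output, op, scale, scaleCount, offset);
        }
    }

Whichever path runs, each element is dequantized as (cast(input) + offset) *
scale and written back under the atomic reduce op selected by `op`.]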
comm/lcal/src/ascendc_kernels/allreduce_quant.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index 6c0b1526..d4ef1986 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -99,7 +99,7 @@ public: if (scaleCount > ubAlignNum) { CpGM2GMPingPongForBigScale(dataSizeRemain, input, output, op, scale, scaleCount, offset); } else { - CpGM2GMPingPongForSmallScale(dataSizeRemain, input, outputG, op, scaleG, scaleCount, offset); + CpGM2GMPingPongForSmallScale(dataSizeRemain, input, output, op, scale, scaleCount, offset); } return; } -- Gitee From c5e1f1690ec4efdcc4bcfa3883d4d3dedf206389 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 15:56:10 +0800 Subject: [PATCH 126/414] add class DequantPadder --- src/kernels/coc_preprocessor.cce | 529 +++++++++++++++++++++++++++++++ 1 file changed, 529 insertions(+) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index ffc01609..01a39971 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -521,3 +521,532 @@ private: bool has_offset{ false }; }; +template<> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, aligned_a, aligned_b, trans_a, trans_b); + + if (gm_dequant_offset) { + auto scale_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + auto offset_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_offset); + + auto ub_args = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + auto ub_args_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)256); + + int32_t args_gap = Block32B::size; + + CopyGmToUbufAlign(ub_args, scale_dptr, 1, 1, 0); + CopyGmToUbufAlign(ub_args + args_gap, offset_dptr, 1, 1, 0); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + Vconv(ub_args_f32, ub_args, 1, 1, 1, 8, 4); + + SetFlag(EVENT_ID0); + WaitFLag(EVENT_ID0); + + scale = ub_args_f32[0]; + offset = ub_args_f32[args_gap]; + + has_offset = true; + } else { + auto scale_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + + auto ub_args = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + auto ub_args_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)256); + + CopyGmToUbufAlign(ub_args, scale_dptr, 1, 1, 0); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + Vconv(ub_args_f32, ub_args, 1, 1, 1, 8, 4); + + SetFlag(EVENT_ID0); + WaitFLag(EVENT_ID0); + + scale = ub_args_f32[0]; + offset = 0; + } + } + + inline __aicore__ void Run() + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitEvent(EVENT_ID1); + + int n_rows = this->trans_b ? this->n : this->k; + int n_cols = this->trans_b ? 
this->k : this->n; + int n_cols_aligned = this->trans_b ? this->k_align : this->n_align; + + DequantAndPadMatrix(this->gm_b_align, this->gm_b, n_rows, n_cols, n_cols_aligned); + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrix(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 16320; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)32768); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)65536); + auto ub_adds = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)131072); + auto ub_muls = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)131072); + + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + int32_t n_blocks_per_row_b8 = Block32B::Count(it.n_cols_this_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(bfloat32_t) / sizeof(int8_t)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID0); + + PipeBarrier(); + Vadds(ub_adds, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vmuls(ub_muls, ub_adds, scale, repeat_b32, 1, 1, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID1); + Vconv(ub_output, ub_muls, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID1); + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + } + } + } + + float scale; + float offset; + bool has_offset{ false }; +}; + + +template<> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) + { + 
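// [Editor's note, annotation rather than a diff line: unlike the per-tensor
// specialization above, which collapses gm_dequant_scale and gm_dequant_offset
// to scalars inside SetArgs, this per-channel specialization keeps them as GM
// vectors; Run() later selects one of the four DequantAndPadMatrix* paths from
// trans_b and has_offset and loads scale/offset tiles inside those loops.]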
this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, aligned_a, aligned_b, trans_a, trans_b); + + gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale); + if (gm_dequant_offset) { + gm_offset = reinterpret_cast<__gm__ half *>(gm_dequant_offset); + has_offset = true; + } + } + + inline __aicore__ void Run() + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitEvent(EVENT_ID1); + + if (!this->trans_b && !has_offset) { + DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (!this->trans_b && has_offset) { + DequantAndPadMatrixHasOffset(this->gm_a_align, this->gm_b, this->k, this->n, this->n_align); + } else if (this->trans_b && !has_offset) { + DequantAndPadMatrixTransposeNoOffset(this->gm_a_align, this->gm_b, this->n, this->k, this->k_align); + } else { + DequantAndPadMatrixTransposeHasOffset(this->gm_a_align, this->gm_b, this->n, this->k, this->k_align); + } + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 28032; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + + int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * + (sizeof(half) / sizeof(int8_t)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + WaitFLag(EVENT_ID2); + Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + } + 
SetFlag(EVENT_ID2); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); + } + + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 17792; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)18688); + auto ub_quant_offset = reinterpret_cast<__ubuf__ half *>((uintptr_t)54272); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); + auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + auto offset = gm_offset + it.n_cols_complete; + + int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * + (sizeof(half) / sizeof(int8_t)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_quant_offset, offset, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID0); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block256B::size, ub_quant_offset, + 0, 1, n_blocks_per_row, 0, 0); + } + + WaitFlag(EVENT_ID1); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_offset + row * n_blocks_per_row * Block32B::size, ub_quant_offset, + 0, 1, n_blocks_per_row, 0, 0); + } + + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID2); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + Vconv(ub_vconv_f16, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID2); + + PipeBarrier(); + Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID3); + Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID3); + + WaitFLag(EVENT_ID3); + CopyGmToUbufAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + WaitFLag(EVENT_ID3); + } + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter 
it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 28032; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; + + int32_t n_blocks_per_row_b8 = Block32B::Count(it.n_cols_this_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + } + + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + int32_t src_gap = n_cols = it.n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + auto src = gm_src + it.src_offset(); + auto dst = gm_src + it.dst_offset(); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID1); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + WaitFLag(EVENT_ID2); + Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + SetFlag(EVENT_ID0); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 17792; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)18688); + auto ub_quant_offset = reinterpret_cast<__ubuf__ half *>((uintptr_t)54272); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); + auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); + + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; + auto offset = gm_offset + it.row_offset_this_core + it.n_rows_complete; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID0); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + } + + WaitFLag(EVENT_ID1); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + } + + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + int32_t src_gap = n_cols = it.n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + auto src = gm_src + it.src_offset(); + auto dst = gm_src + it.dst_offset(); + + WaitFLag(EVENT_ID2); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID2); + + PipeBarrier(); + Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID3); + Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID3); + + WaitFLag(EVENT_ID3); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID3); + } + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + __gm__ half *gm_scale{ nullptr }; + __gm__ half *gm_offset{ nullptr }; + bool has_offset{ false }; +}; + -- Gitee From 47715adddf246180d06632cd1c8e7262d72a0f19 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 16:38:30 +0800 Subject: [PATCH 127/414] fix some error --- src/kernels/coc_preprocessor.cce | 84 +++++++++++++++++--------------- 1 file changed, 
46 insertions(+), 38 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 01a39971..3313d26e 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -531,7 +531,7 @@ public: __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) { this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, - m_align, k_align, aligned_a, aligned_b, trans_a, trans_b); + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); if (gm_dequant_offset) { auto scale_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); @@ -589,7 +589,7 @@ public: } SetFlag(EVENT_ID1); - WaitEvent(EVENT_ID1); + WaitFlag(EVENT_ID1); int n_rows = this->trans_b ? this->n : this->k; int n_cols = this->trans_b ? this->k : this->n; @@ -601,7 +601,7 @@ public: } private: - inline __aicore__ void DequantAndPadMatrix(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrix(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -622,7 +622,7 @@ private: for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { int32_t n_blocks_per_row_b8 = Block32B::Count(it.n_cols_this_loop); int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); - int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(bfloat32_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(float32_t) / sizeof(int8_t)); int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -651,7 +651,10 @@ private: SetFlag(EVENT_ID0); PipeBarrier(); - Vadds(ub_adds, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vadds(ub_adds, ub_vconv_f32, repeat_b32, 1, 1, 8, 8); PipeBarrier(); Vmuls(ub_muls, ub_adds, scale, repeat_b32, 1, 1, 8, 8); @@ -662,7 +665,7 @@ private: SetFlag(EVENT_ID1); WaitFLag(EVENT_ID1); - CopyGmToUbufAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID1); } WaitFLag(EVENT_ID0); @@ -687,7 +690,7 @@ public: __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) { this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, - m_align, k_align, aligned_a, aligned_b, trans_a, trans_b); + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale); if (gm_dequant_offset) { @@ -707,23 +710,23 @@ public: } SetFlag(EVENT_ID1); - WaitEvent(EVENT_ID1); + WaitFLag(EVENT_ID1); if (!this->trans_b && !has_offset) { DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); } else if (!this->trans_b && has_offset) { - DequantAndPadMatrixHasOffset(this->gm_a_align, this->gm_b, this->k, this->n, this->n_align); + DequantAndPadMatrixHasOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); } else if (this->trans_b && !has_offset) { - DequantAndPadMatrixTransposeNoOffset(this->gm_a_align, this->gm_b, this->n, this->k, this->k_align); + DequantAndPadMatrixTransposeNoOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); } else { - DequantAndPadMatrixTransposeHasOffset(this->gm_a_align, this->gm_b, this->n, this->k, 
this->k_align); + DequantAndPadMatrixTransposeHasOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); } this->Barrier(); } private: - inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -751,13 +754,17 @@ private: int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; - int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); WaitFLag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); WaitFLag(EVENT_ID0); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, + 0, 1, n_blocks_per_row, 0, 0); + } for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); @@ -770,7 +777,7 @@ private: SetFlag(EVENT_ID1); WaitFLag(EVENT_ID1); - Vconv(ub_vconv_f16, ub_input, repeat, 1, 1, 8, 4); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID1); PipeBarrier(); @@ -780,16 +787,17 @@ private: WaitFLag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID2); } - SetFlag(EVENT_ID2); + SetFlag(EVENT_ID0); } } WaitFLag(EVENT_ID0); WaitFLag(EVENT_ID1); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -832,7 +840,7 @@ private: WaitFLag(EVENT_ID0); for (int32_t row = 1; row < max_rows_per_loop; ++row) { - CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block256B::size, ub_quant_offset, + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, 0, 1, n_blocks_per_row, 0, 0); } @@ -854,7 +862,7 @@ private: SetFlag(EVENT_ID2); WaitFLag(EVENT_ID2); - Vconv(ub_vconv_f16, ub_input, repeat, 1, 1, 8, 4); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID2); PipeBarrier(); @@ -866,8 +874,8 @@ private: SetFlag(EVENT_ID3); WaitFLag(EVENT_ID3); - CopyGmToUbufAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); - WaitFLag(EVENT_ID3); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFLag(EVENT_ID3); } SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); @@ -879,7 +887,7 @@ private: WaitFlag(EVENT_ID3); } - inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -898,10 +906,10 @@ private: SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { - for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + 
for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; - int32_t n_blocks_per_row_b8 = Block32B::Count(it.n_cols_this_loop); + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; @@ -917,15 +925,15 @@ private: 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } - for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { - int32_t src_gap = n_cols = it.n_cols - it.n_cols_this_loop; + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + int32_t src_gap = n_cols = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); auto src = gm_src + it.src_offset(); - auto dst = gm_src + it.dst_offset(); + auto dst = gm_dst + it.dst_offset(); WaitFLag(EVENT_ID1); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); @@ -942,7 +950,7 @@ private: WaitFLag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); - SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); } SetFlag(EVENT_ID0); } @@ -952,7 +960,7 @@ private: WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -975,7 +983,7 @@ private: SetFlag(EVENT_ID2); SetFlag(EVENT_ID3); for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { - for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + for (it.InitRowLoop(max_cols_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; auto offset = gm_offset + it.row_offset_this_core + it.n_rows_complete; @@ -986,7 +994,7 @@ private: uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); WaitFLag(EVENT_ID1); @@ -995,25 +1003,25 @@ private: WaitFLag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { - CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, + CopyUB2UB(ub_quant_offset + block_col * Block32B::size, ub_quant_offset, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } WaitFLag(EVENT_ID1); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { - CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, + CopyUB2UB(ub_quant_offset + block_col * Block32B::size, ub_quant_offset, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } - for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { - int32_t src_gap = n_cols = it.n_cols - it.n_cols_this_loop; + for 
(it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); auto src = gm_src + it.src_offset(); - auto dst = gm_src + it.dst_offset(); + auto dst = gm_dst + it.dst_offset(); WaitFLag(EVENT_ID2); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); @@ -1028,12 +1036,12 @@ private: PipeBarrier(); WaitFLag(EVENT_ID3); - Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID3); WaitFLag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); - SetFlag(EVENT_ID3); + SetFlag(EVENT_ID3); } SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); -- Gitee From b3080245064ce108f7968abd58150f27082279bc Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 16:50:04 +0800 Subject: [PATCH 128/414] fix some error again --- src/kernels/coc_preprocessor.cce | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 3313d26e..9c83087e 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -651,10 +651,10 @@ private: SetFlag(EVENT_ID0); PipeBarrier(); - Vconv(ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, 1, 1, 8, 4); PipeBarrier(); - Vadds(ub_adds, ub_vconv_f32, repeat_b32, 1, 1, 8, 8); + Vadds(ub_adds, ub_vconv_f32, offset, repeat_b32, 1, 1, 8, 8); PipeBarrier(); Vmuls(ub_muls, ub_adds, scale, repeat_b32, 1, 1, 8, 8); @@ -926,7 +926,7 @@ private: } for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { - int32_t src_gap = n_cols = n_cols - it.n_cols_this_loop; + int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); @@ -983,7 +983,7 @@ private: SetFlag(EVENT_ID2); SetFlag(EVENT_ID3); for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { - for (it.InitRowLoop(max_cols_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; auto offset = gm_offset + it.row_offset_this_core + it.n_rows_complete; @@ -994,16 +994,16 @@ private: uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); WaitFLag(EVENT_ID1); - CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID1); WaitFLag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { - CopyUB2UB(ub_quant_offset + block_col * Block32B::size, ub_quant_offset, + CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } 
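Note: together with the follow-up in patch 129, these fixes make the per-tensor bf16 path dequantize each element as out = bf16((float(x_int8) + offset) * scale): the two Vconv calls widen int8 -> f16 -> f32, Vadds applies the offset, Vmuls applies the scale, and the final Vconv rounds f32 -> bf16 with RoundMode::CAST_RINT. Below is a minimal host-side C++ sketch of that arithmetic only; it is not kernel code, and FloatToBf16Rne along with the other names here are illustrative assumptions.

#include <cstdint>
#include <cstring>
#include <cstddef>

// Round a float to bfloat16 storage bits, nearest-even (mirrors CAST_RINT
// in spirit; NaN payloads are not handled in this sketch).
static uint16_t FloatToBf16Rne(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    uint32_t lsb = (bits >> 16) & 1u;
    bits += 0x7FFFu + lsb;
    return static_cast<uint16_t>(bits >> 16);
}

// Element-wise model of one Vconv/Vadds/Vmuls/Vconv round trip over a tile.
// Exact for int8 inputs, since every int8 value is representable in f16.
static void DequantPerTensor(const int8_t *src, uint16_t *dst, size_t n,
                             float scale, float offset)
{
    for (size_t i = 0; i < n; ++i) {
        float v = static_cast<float>(src[i]); // Vconv s8->f16, then f16->f32
        v = (v + offset) * scale;             // Vadds, then Vmuls
        dst[i] = FloatToBf16Rne(v);           // Vconv f32->bf16, CAST_RINT
    }
}

The kernel performs the same computation tile by tile in UB, overlapping the GM copies with the vector pipe through the event-flag pairs visible in the hunks above.
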
-- Gitee From 968acf82356faa8e8ea62576702c4c25f115b757 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 16:51:29 +0800 Subject: [PATCH 129/414] fix some error again 2 --- src/kernels/coc_preprocessor.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 9c83087e..af879f4f 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -651,7 +651,7 @@ private: SetFlag(EVENT_ID0); PipeBarrier(); - Vconv(ub_vconv_f32, ub_vconv_f16, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); Vadds(ub_adds, ub_vconv_f32, offset, repeat_b32, 1, 1, 8, 8); -- Gitee From 56d9c6b93652cab89e3bd275e7d156b188794017 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 20:11:26 +0800 Subject: [PATCH 130/414] add class DequantPadder PERCHANNEL --- src/kernels/coc_preprocessor.cce | 463 +++++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index af879f4f..5920007a 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -1058,3 +1058,466 @@ private: bool has_offset{ false }; }; + +template <> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); + gm_scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale); + if (gm_dequant_offset) { + gm_offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + has_offset = true; + } + } + + inline __aicore__ void Run() + { + if (aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + + if (!trans_b && !has_offset) { + DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (!trans_b && has_offset) { + DequantAndPadMatrixHasOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (trans_b && !has_offset) { + DequantAndPadMatrixTransposeNoOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } else { + DequantAndPadMatrixTransposeHasOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 10240; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + + int32_t n_blocks_per_row_b16 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); + int32_t quant_repeat_b32 = static_cast( + DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, + ub_quant_scale, 0, 1, n_blocks_per_row, 0, 0); + } + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID2); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID2); + } + SetFlag(EVENT_ID0); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 9344; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); + auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); + auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); + auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + auto offset = gm_offset + it.n_cols_complete; + + int32_t n_blocks_per_row_b16 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); + uint8_t quant_repeat_b32 = static_cast( + DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_offset_origin, scale, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_quant_offset_origin, offset, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID0); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, + ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); + } + + WaitFlag(EVENT_ID1); + Vconv(ub_quant_scale, ub_quant_scale_origin, quant_repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, + ub_quant_offset, /* sid */ 0, 1, n_blocks_per_row, 0, 0); + } + + WaitFLag(EVENT_ID1); + Vconv(ub_quant_offset, ub_quant_offset_origin, quant_repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + for (int32_t row = 1; row < max_rows_per_loop; ++row) { + CopyUB2UB(ub_quant_offset + row * n_blocks_per_row_b32 * Block32B::size, + ub_quant_offset, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); + } + + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID2); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID2); + + PipeBarrier(); + Vadd(ub_vconv_f16, ub_input, repeat_b16, 
1, 1, 8, 4); + + PipeBarrier(); + Vadd(ub_add, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + Vadd(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID3); + Vmul(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID3); + + WaitFLag(EVENT_ID3); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFLag(EVENT_ID3); + } + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 10240; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); + auto ub_mul= reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(float32_t) / sizeof(int8_t)); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_scale + block_col * Block32B::size, + ub_quant_scale, /* sid */ 0, it.n_rows_this_loop, 1, + n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + } + + PipeBarrier(); + Vconv(ub_quant_scale, ub_quant_scale_origin, repeat_b32, 1, 1, 8, 4); + + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, 
it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + + WaitFLag(EVENT_ID1); + Vconv(ub_vconv, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vmul(ub_mul, ub_vconv, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID2); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + SetFlag(EVENT_ID0); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 9344; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); + auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); + auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); + auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + it.row_offset_this_core + it.n_rows_complete; + auto offset = gm_offset + it.row_offset_this_core + it.n_rows_complete; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(float32_t) / sizeof(int8_t)); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID0); + + WaitFLag(EVENT_ID0); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_scale_origin + block_col * Block32B::size, + ub_quant_scale, /* sid */ 0, it.n_rows_this_loop, 1, + n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 
1); + } + + PipeBarrier(); + Vconv(ub_quant_scale, ub_quant_scale_origin, repeat_b32, 1, 1, 8, 4); + + WaitFLag(EVENT_ID1); + for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { + CopyUB2UB(ub_quant_offset_origin + block_col * Block32B::size, + ub_quant_offset, /* sid */ 0, it.n_rows_this_loop, 1, + n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + } + + PipeBarrier(); + Vconv(ub_quant_offset, ub_quant_offset_origin, repeat_b32, 1, 1, 8, 4); + + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFLag(EVENT_ID2); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID2); + + WaitFLag(EVENT_ID2); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID2); + + PipeBarrier(); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vadd(ub_add, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + Vmul(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFLag(EVENT_ID3); + Vmul(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8,RoundMode::CAST_RINT); + SetFlag(EVENT_ID3); + + WaitFLag(EVENT_ID3); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID3); + } + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + } + WaitFLag(EVENT_ID0); + WaitFLag(EVENT_ID1); + WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + __gm__ bfloat16_t *gm_scale{ nullptr }; + __gm__ bfloat16_t *gm_offset{ nullptr }; + bool has_offset{ false }; +}; -- Gitee From faa47d83f607ae28cd545fcb6e982c1be983d8d7 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 20:25:33 +0800 Subject: [PATCH 131/414] fix some error --- src/kernels/coc_preprocessor.cce | 246 +++++++++++++++---------------- 1 file changed, 122 insertions(+), 124 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 5920007a..a309b8bd 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -8,6 +8,7 @@ #include "kernel_operator.h" using namespace AscendC; + template class BasePadder { public: @@ -88,7 +89,7 @@ public: dst_row_loop_offset = n_rows_complete * n_cols_aligned; } - inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) + inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) { this->max_cols_per_loop = max_cols_per_loop; n_cols_complete = 0; @@ -172,7 +173,7 @@ public: this->m_align = m_align; this->k_align = k_align; this->n_align = n_align; - + this->aligned_a = aligned_a; this->aligned_b = aligned_b; @@ -264,7 +265,7 @@ public: } SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); if (this->aligned_b) { int n_rows = this->trans_b ? 
this->n : this->k; @@ -505,12 +506,12 @@ private: int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyUbufToGmAlign(dst, ub_muls, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); } } } @@ -551,7 +552,7 @@ public: Vconv(ub_args_f32, ub_args, 1, 1, 1, 8, 4); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); scale = ub_args_f32[0]; offset = ub_args_f32[args_gap]; @@ -571,7 +572,7 @@ public: Vconv(ub_args_f32, ub_args, 1, 1, 1, 8, 4); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); scale = ub_args_f32[0]; offset = 0; @@ -642,11 +643,11 @@ private: uint8_t repeat_b32 = static_cast( DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); SetFlag(EVENT_ID0); @@ -660,16 +661,16 @@ private: Vmuls(ub_muls, ub_adds, scale, repeat_b32, 1, 1, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); Vconv(ub_output, ub_muls, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID1); } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); } } } @@ -710,7 +711,7 @@ public: } SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); if (!this->trans_b && !has_offset) { DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); @@ -756,11 +757,11 @@ private: int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, 0, 1, n_blocks_per_row, 0, 0); @@ -772,28 +773,28 @@ private: int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID1); PipeBarrier(); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID2); } SetFlag(EVENT_ID0); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID2); } @@ -830,15 +831,15 @@ private: int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID1); + 
WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_quant_offset, offset, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, 0, 1, n_blocks_per_row, 0, 0); @@ -857,11 +858,11 @@ private: int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID2); @@ -869,11 +870,11 @@ private: Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID3); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFLag(EVENT_ID3); } @@ -881,9 +882,9 @@ private: SetFlag(EVENT_ID1); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID3); } @@ -915,11 +916,11 @@ private: int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); @@ -935,28 +936,28 @@ private: auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID1); PipeBarrier(); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID2); } SetFlag(EVENT_ID0); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID2); } @@ -993,21 +994,21 @@ private: int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, 
n_blocks_per_row_b16 - 1); } - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_offset + block_col * Block32B::size, ub_quant_offset, 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); @@ -1023,11 +1024,11 @@ private: auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID2); @@ -1035,11 +1036,11 @@ private: Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID3); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID3); } @@ -1047,9 +1048,9 @@ private: SetFlag(EVENT_ID1); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID3); } @@ -1068,11 +1069,11 @@ public: int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset) { - this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); - gm_scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale); + gm_scale = *reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); if (gm_dequant_offset) { - gm_offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + gm_offset = *reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_offset); has_offset = true; } } @@ -1104,7 +1105,7 @@ public: } private: - inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1133,21 +1134,24 @@ private: Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); int32_t n_blocks_per_row_b32 = Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); - int32_t quant_repeat_b32 = static_cast( + uint8_t quant_repeat_b32 = static_cast( DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_scale, scale, 1, it.n_cols_this_loop, 0); + WaitFlag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin, scale, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); + Vconv(ub_quant_scale, ub_quant_scale_origin, quant_repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); for (int32_t row = 1; row < max_rows_per_loop; ++row) { - CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, - ub_quant_scale, 0, 1, 
n_blocks_per_row, 0, 0); + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, + ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); } for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); @@ -1160,11 +1164,11 @@ private: uint8_t repeat_b32 = static_cast( DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); SetFlag(EVENT_ID1); @@ -1172,26 +1176,26 @@ private: Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); - Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + Vmul(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID2); } SetFlag(EVENT_ID0); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1232,30 +1236,24 @@ private: int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_offset_origin, scale, 1, it.n_cols_this_loop, 0); + WaitFlag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin, scale, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_quant_offset_origin, offset, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID0); - for (int32_t row = 1; row < max_rows_per_loop; ++row) { - CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, - ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); - } - - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); Vconv(ub_quant_scale, ub_quant_scale_origin, quant_repeat_b32, 1, 1, 8, 4); PipeBarrier(); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, - ub_quant_offset, /* sid */ 0, 1, n_blocks_per_row, 0, 0); + ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); } - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); Vconv(ub_quant_offset, ub_quant_offset_origin, quant_repeat_b32, 1, 1, 8, 4); PipeBarrier(); @@ -1275,29 +1273,29 @@ private: uint8_t repeat_b32 = static_cast( DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); SetFlag(EVENT_ID2); PipeBarrier(); - Vadd(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); Vadd(ub_add, ub_vconv_f32, 
ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); - Vadd(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + Vmul(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID3); - Vmul(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); - SetFlag(EVENT_ID3); + WaitFlag(EVENT_ID3); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID3); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFLag(EVENT_ID3); } @@ -1305,13 +1303,13 @@ private: SetFlag(EVENT_ID1); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID3); } - inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1347,14 +1345,14 @@ private: uint8_t repeat_b32 = static_cast( DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + WaitFlag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { - CopyUB2UB(ub_quant_scale + block_col * Block32B::size, - ub_quant_scale, /* sid */ 0, it.n_rows_this_loop, 1, + CopyUB2UB(ub_quant_scale_origin + block_col * Block32B::size, + ub_quant_scale_origin, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } @@ -1371,38 +1369,38 @@ private: int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID1); - Vconv(ub_vconv, ub_input, repeat_b16, 1, 1, 8, 4); + WaitFlag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); SetFlag(EVENT_ID1); PipeBarrier(); Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); - Vmul(ub_mul, ub_vconv, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + Vmul(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID2); } SetFlag(EVENT_ID0); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ 
-1443,28 +1441,28 @@ private: uint8_t repeat_b32 = static_cast( DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); - WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_scale, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + WaitFlag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin, scale, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); SetFlag(EVENT_ID0); - WaitFLag(EVENT_ID0); - CopyGmToUbufAlign(ub_quant_offset, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); - SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_quant_offset_origin, offset, it.n_rows_this_loop, 1, 0, n_blocks_per_row_b16 - 1); + SetFlag(EVENT_ID1); - WaitFLag(EVENT_ID0); + WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_scale_origin + block_col * Block32B::size, - ub_quant_scale, /* sid */ 0, it.n_rows_this_loop, 1, + ub_quant_scale_origin, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } PipeBarrier(); Vconv(ub_quant_scale, ub_quant_scale_origin, repeat_b32, 1, 1, 8, 4); - WaitFLag(EVENT_ID1); + WaitFlag(EVENT_ID1); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_offset_origin + block_col * Block32B::size, - ub_quant_offset, /* sid */ 0, it.n_rows_this_loop, 1, + ub_quant_offset_origin, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } @@ -1481,11 +1479,11 @@ private: int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); SetFlag(EVENT_ID2); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); SetFlag(EVENT_ID2); @@ -1499,11 +1497,11 @@ private: Vmul(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); - WaitFLag(EVENT_ID3); - Vmul(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8,RoundMode::CAST_RINT); + WaitFlag(EVENT_ID3); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8,RoundMode::CAST_RINT); SetFlag(EVENT_ID3); - WaitFLag(EVENT_ID3); + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID3); } @@ -1511,9 +1509,9 @@ private: SetFlag(EVENT_ID1); } } - WaitFLag(EVENT_ID0); - WaitFLag(EVENT_ID1); - WaitFLag(EVENT_ID2); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID3); } -- Gitee From 74bfb30bc124c09ce783e80bf80d7b16916e4b82 Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 20:30:57 +0800 Subject: [PATCH 132/414] fix some error again --- src/kernels/coc_preprocessor.cce | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index a309b8bd..74871e65 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -693,9 +693,9 @@ public: this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); - gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale); + gm_scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale); if (gm_dequant_offset) { - gm_offset = reinterpret_cast<__gm__ half *>(gm_dequant_offset); + gm_offset = 
*reinterpret_cast<__gm__ half *>(gm_dequant_offset); has_offset = true; } } @@ -751,7 +751,7 @@ private: auto scale = gm_scale + it.n_cols_complete; int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * - (sizeof(half) / sizeof(int8_t)); + (sizeof(half) / sizeof(int8_t)); int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -825,7 +825,7 @@ private: auto offset = gm_offset + it.n_cols_complete; int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * - (sizeof(half) / sizeof(int8_t)); + (sizeof(half) / sizeof(int8_t)); int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -876,7 +876,7 @@ private: WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); - SetFLag(EVENT_ID3); + SetFlag(EVENT_ID3); } SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); @@ -1297,7 +1297,7 @@ private: WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); - SetFLag(EVENT_ID3); + SetFlag(EVENT_ID3); } SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); -- Gitee From 83f9112d4166d9735cb7e2e6d89d88687d3eb5ee Mon Sep 17 00:00:00 2001 From: Denver Date: Sat, 23 Aug 2025 22:19:34 +0800 Subject: [PATCH 133/414] add class DequantPadder PER_GROUP --- src/kernels/coc_preprocessor.cce | 490 +++++++++++++++++++++++++++++++ 1 file changed, 490 insertions(+) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 74871e65..93b799ab 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -1519,3 +1519,493 @@ private: __gm__ bfloat16_t *gm_offset{ nullptr }; bool has_offset{ false }; }; + +template +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset, int32_t dequant_group_size) + {} + + inline __aicore__ void Run() + {} +}; + +template <> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset, int32_t dequant_group_size) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); + gm_scale = *reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + if (gm_dequant_offset) { + gm_offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + has_offset = true; + } + group_size = dequant_group_size; + group_num = (this->k + group_size - 1) / group_size; + } + + inline __aicore__ void Run() + { + if (this->aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? 
this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + + if (!this->trans_b && !has_offset) { + DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (!this->trans_b && has_offset) { + DequantAndPadMatrixHasOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (this->trans_b && !has_offset) { + DequantAndPadMatrixTransposeNoOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } else { + DequantAndPadMatrixTransposeHasOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 28032; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + int32_t n_blocks_per_row_b16 = Block32B::Count(it.n_cols_this_loop) * + (sizeof(half) / sizeof(int8_t)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + bool is_after_mte2 = false; + WaitFlag(EVENT_ID0); + + for (int32_t row = 0; row < max_rows_per_loop; ++row) { + int32_t row_idx = it.row_offset_this_core + it.n_rows_complete + row; + int32_t in_group_idx = row_idx % group_size; + if (in_group_idx == 0 || it.n_rows_complete + row == 0) { + int32_t ub_quant_args_offset = row * n_blocks * Block32B::size; + int32_t group_idx = row_idx / group_size; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx * n_cols, + 1, it.n_cols_this_loop, 0); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + + if (is_after_mte2) { + 
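+ // The group's root quant block may still be in flight from GM; drain
+ // that MTE2 copy before CopyUB2UB replicates it to this row.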
SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, 1, n_blocks_per_row, 0, 0); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } else { + PipeBarrier(); + } + WaitFlag(EVENT_ID2); + Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + is_after_mte2 = false; + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 17792; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)18688); + auto ub_quant_offset = reinterpret_cast<__ubuf__ half *>((uintptr_t)54272); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); + auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + auto offset = gm_offset + it.n_cols_complete; + + int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * + (sizeof(half) / sizeof(int8_t)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row - Block32B::Count(it.n_cols_this_loop); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + bool is_after_mte2 = false; + WaitFlag(EVENT_ID0); + for (int32_t row = 0; row < max_rows_per_loop; ++row) { + int32_t row_idx = row * n_blocks_per_row * Block32B::size; + int32_t group_idx = row_idx % group_size; + if (in_group_idx == 0 || it.n_rows_complete + row == 0) { + int32_t ub_quant_args_offset = row * n_blocks * Block32B::size; + int32_t group_idx = row_idx / group_size; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx * n_cols, + 1, it.n_cols_this_loop, 0); + CopyGmToUbufAlign(ub_quant_offset + ub_quant_args_offset, offset + group_idx * n_cols, + 1, it.n_cols_this_loop, 0); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < max_rows_per_loop || 
it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + 1, n_blocks_per_row, 0, 0); + CopyUB2UB(ub_quant_offset + ub_quant_args_offset, + ub_quant_offset + ub_quant_args_root_offset, /* sid */ 0, + 1, n_blocks_per_row, 0, 0); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } else { + PipeBarrier(); + } + Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); + is_after_mte2 = false; + + PipeBarrier(); + WaitFlag(EVENT_ID2); + Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 28032; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); + + int32_t group_block = Block32B::Count(group_size); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + (it.row_offset_this_core + it.n_rows_complete) * group_num; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(half) / sizeof(int8_t)); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + bool is_after_mte2 = false; + WaitFlag(EVENT_ID0); + for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { + int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; + int32_t in_group_idx = block_col_idx % group_size; + if (in_group_idx == 0 || block_col_idx == 0) 
{ + int32_t ub_quant_args_offset = block_col * Block32B::size; + int32_t group_idx = block_col_idx / group_size; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } else { + PipeBarrier(); + } + WaitFlag(EVENT_ID2); + Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + is_after_mte2 = false; + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 17792; + int32_t max_rows_per_loop = (it.n_rows_this_core * Block32B::size <= MAX_LEN) ? + it.n_rows_this_core : MAX_LEN / Block32B::size; + int32_t max_cols_per_loop = (it.n_rows_this_core * Block32B::size <= MAX_LEN) ? 
+ Block32B::AlignDown(MAX_LEN / it.n_rows_this_core) : Block32B::size; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)18688); + auto ub_quant_offset = reinterpret_cast<__ubuf__ half *>((uintptr_t)54272); + auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); + auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); + + int32_t group_block = Block32B::Count(group_size); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + (it.row_offset_this_core + it.n_rows_complete) * group_num; + auto offset = gm_offset + (it.row_offset_this_core + it.n_rows_complete) * group_num; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(half) / sizeof(int8_t)); + + int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row_b16; + uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + bool is_after_mte2 = false; + WaitFlag(EVENT_ID0); + for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { + int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; + int32_t in_group_idx = block_col_idx % group_size; + if (in_group_idx == 0 || block_col_idx == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + int32_t group_idx = block_col_idx / group_size; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + CopyGmToUbufAlign(ub_quant_offset + ub_quant_args_offset, offset + group_idx, + it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + CopyUB2UB(ub_quant_offset + ub_quant_args_offset, + ub_quant_offset + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + 
SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } else { + PipeBarrier(); + } + Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); + is_after_mte2 = false; + + PipeBarrier(); + WaitFlag(EVENT_ID2); + Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + __gm__ half *gm_scale{ nullptr }; + __gm__ half *gm_offset{ nullptr }; + int32_t group_size; + int32_t group_num; + bool has_offset{ false }; +}; \ No newline at end of file -- Gitee From d8a6e293bb056a82260548ed8fb54d89897ec68d Mon Sep 17 00:00:00 2001 From: Denver Date: Sun, 24 Aug 2025 08:36:10 +0800 Subject: [PATCH 134/414] fix some error --- src/kernels/coc_preprocessor.cce | 56 +++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 93b799ab..07a7795c 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -1543,11 +1543,11 @@ public: int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset, int32_t dequant_group_size) { - this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); - gm_scale = *reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale); if (gm_dequant_offset) { - gm_offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + gm_offset = reinterpret_cast<__gm__ half *>(gm_dequant_offset); has_offset = true; } group_size = dequant_group_size; @@ -1567,11 +1567,11 @@ public: SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); - if (!this->trans_b && !has_offset) { + if (!trans_b && !has_offset) { DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); - } else if (!this->trans_b && has_offset) { + } else if (!trans_b && has_offset) { DequantAndPadMatrixHasOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); - } else if (this->trans_b && !has_offset) { + } else if (trans_b && !has_offset) { DequantAndPadMatrixTransposeNoOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); } else { DequantAndPadMatrixTransposeHasOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); @@ -1581,7 +1581,7 @@ public: } private: - inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1602,7 +1602,8 @@ private: for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto scale = gm_scale + it.n_cols_complete; - int32_t n_blocks_per_row_b16 = Block32B::Count(it.n_cols_this_loop) * + + int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * (sizeof(half) / sizeof(int8_t)); 
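 // Each 32B block of int8 input widens to two 32B blocks of half output,
 // hence the (sizeof(half) / sizeof(int8_t)) factor on the per-row block count
 // (e.g. 64 int8 columns = 2 input blocks but 4 half blocks).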
int32_t src_gap = n_cols - it.n_cols_this_loop; @@ -1627,12 +1628,11 @@ private: bool is_after_mte2 = false; WaitFlag(EVENT_ID0); - for (int32_t row = 0; row < max_rows_per_loop; ++row) { int32_t row_idx = it.row_offset_this_core + it.n_rows_complete + row; int32_t in_group_idx = row_idx % group_size; if (in_group_idx == 0 || it.n_rows_complete + row == 0) { - int32_t ub_quant_args_offset = row * n_blocks * Block32B::size; + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; int32_t group_idx = row_idx / group_size; if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { @@ -1720,16 +1720,20 @@ private: uint8_t repeat = static_cast(DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT)); WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID1); bool is_after_mte2 = false; WaitFlag(EVENT_ID0); for (int32_t row = 0; row < max_rows_per_loop; ++row) { - int32_t row_idx = row * n_blocks_per_row * Block32B::size; - int32_t group_idx = row_idx % group_size; + int32_t row_idx = it.row_offset_this_core + it.n_rows_complete + row; + int32_t in_group_idx = row_idx % group_size; if (in_group_idx == 0 || it.n_rows_complete + row == 0) { - int32_t ub_quant_args_offset = row * n_blocks * Block32B::size; + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; int32_t group_idx = row_idx / group_size; if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { @@ -1785,7 +1789,7 @@ private: WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1831,27 +1835,27 @@ private: SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); - Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + Vconv(ub_vconv, ub_input, repeat, 1, 1, 8, 4); SetFlag(EVENT_ID1); bool is_after_mte2 = false; WaitFlag(EVENT_ID0); for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; - int32_t in_group_idx = block_col_idx % group_size; + int32_t in_group_idx = block_col_idx % group_block; if (in_group_idx == 0 || block_col_idx == 0) { int32_t ub_quant_args_offset = block_col * Block32B::size; - int32_t group_idx = block_col_idx / group_size; + int32_t group_idx = block_col_idx / group_block; if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, - it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = true; ub_quant_args_root_offset = ub_quant_args_offset; - } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + } else if (in_group_idx < n_blocks_per_row_b16 || it.n_cols_complete == 0) { int32_t ub_quant_args_offset = block_col * Block32B::size; if (is_after_mte2) { @@ -1888,7 +1892,7 @@ private: WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void 
DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -1945,22 +1949,22 @@ private: WaitFlag(EVENT_ID0); for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; - int32_t in_group_idx = block_col_idx % group_size; + int32_t in_group_idx = block_col_idx % group_block; if (in_group_idx == 0 || block_col_idx == 0) { int32_t ub_quant_args_offset = block_col * Block32B::size; - int32_t group_idx = block_col_idx / group_size; + int32_t group_idx = block_col_idx / group_block; if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, - it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); CopyGmToUbufAlign(ub_quant_offset + ub_quant_args_offset, offset + group_idx, - it.n_cols_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = true; ub_quant_args_root_offset = ub_quant_args_offset; - } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + } else if (in_group_idx < n_blocks_per_row_b16 || it.n_cols_complete == 0) { int32_t ub_quant_args_offset = block_col * Block32B::size; if (is_after_mte2) { @@ -1987,7 +1991,7 @@ private: is_after_mte2 = false; PipeBarrier(); - WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID0); SetFlag(EVENT_ID2); -- Gitee From 598c099758a41767383bb4a0977b584c862cc4a1 Mon Sep 17 00:00:00 2001 From: Denver Date: Sun, 24 Aug 2025 08:50:35 +0800 Subject: [PATCH 135/414] add some comment --- src/kernels/coc_preprocessor.cce | 42 ++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 07a7795c..13c0bb0e 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -456,6 +456,7 @@ private: auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); + // 1. MTE2: ub_vconv <- gm_src CopyGmToUbufAlign(ub_vconv, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); int32_t n_blocks_per_row = Block32B::Count(it.n_cols_this_loop) * @@ -463,9 +464,11 @@ private: int32_t n_blocks = it.n_rows_this_loop * n_blocks_per_row; int32_t repeat_times = DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT); + // 1 -> 2 SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); + // 2. V: ub_muls <- vconv(ub_vconv) uint8_t repeat = REPEAT_PER_LOOP; for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete += repeat) { if (n_repeat_complete + repeat > repeat_times) { @@ -475,12 +478,15 @@ private: ub_vconv + n_repeat_complete * Block256B::size, repeat, 1, 1, 8, 4); } + // 2 -> 1 SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); if (has_offset) { + // 2 -> 3 PipeBarrier(); + // 3. V: ub_muls <- ub_muls + offset repeat = REPEAT_PER_LOOP; for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete += repeat) { @@ -492,8 +498,10 @@ private: } } + // 2/3 -> 4 PipeBarrier(); + // 4. 
V: ub_muls <- ub_muls + offset repeat = REPEAT_PER_LOOP; for (int32_t n_repeat_complete = 0; n_repeat_complete < repeat_times; n_repeat_complete += repeat) { if (n_repeat_complete + repeat > repeat_times) { @@ -508,8 +516,10 @@ private: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); + // 5. MTE3: ub_muls -> dst CopyUbufToGmAlign(dst, ub_muls, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + // 5 -> 2 SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } @@ -693,9 +703,9 @@ public: this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); - gm_scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale); + gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale); if (gm_dequant_offset) { - gm_offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset); + gm_offset = reinterpret_cast<__gm__ half *>(gm_dequant_offset); has_offset = true; } } @@ -764,7 +774,7 @@ private: WaitFlag(EVENT_ID0); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, - 0, 1, n_blocks_per_row, 0, 0); + 0, 1, n_blocks_per_row, 0, 0); /* sid */ } for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); @@ -842,13 +852,13 @@ private: WaitFlag(EVENT_ID0); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, - 0, 1, n_blocks_per_row, 0, 0); + 0, 1, n_blocks_per_row, 0, 0); /* sid */ } WaitFlag(EVENT_ID1); for (int32_t row = 1; row < max_rows_per_loop; ++row) { CopyUB2UB(ub_quant_offset + row * n_blocks_per_row * Block32B::size, ub_quant_offset, - 0, 1, n_blocks_per_row, 0, 0); + 0, 1, n_blocks_per_row, 0, 0); /* sid */ } for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { @@ -923,7 +933,7 @@ private: WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, - 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); /* sid */ } for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { @@ -1005,13 +1015,13 @@ private: WaitFlag(EVENT_ID0); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_scale + block_col * Block32B::size, ub_quant_scale, - 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); /* sid */ } WaitFlag(EVENT_ID1); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { CopyUB2UB(ub_quant_offset + block_col * Block32B::size, ub_quant_offset, - 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); /* sid */ } for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { @@ -1071,9 +1081,9 @@ public: { this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); - gm_scale = *reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + gm_scale = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); if (gm_dequant_offset) { - gm_offset = *reinterpret_cast<__gm__ bfloat16_t 
*>(gm_dequant_offset); + gm_offset = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_offset); has_offset = true; } } @@ -1206,15 +1216,15 @@ private: int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); - auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); @@ -1411,15 +1421,15 @@ private: int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); - auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); -- Gitee From 1f543f99e57c3876623c5241fc130de76bfd5e03 Mon Sep 17 00:00:00 2001 From: Denver Date: Sun, 24 Aug 2025 10:44:19 +0800 Subject: [PATCH 136/414] add class DequantPadder PER_GROUP --- src/kernels/coc_preprocessor.cce | 643 ++++++++++++++++++++++++++++++- 1 file changed, 642 insertions(+), 1 deletion(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 13c0bb0e..ea824834 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -2022,4 +2022,645 @@ private: int32_t group_size; int32_t group_num; bool has_offset{ false }; -}; \ No newline at end of file +}; + +template <> +class DequantPadder : public BasePadder { +public: + __aicore__ explicit DequantPadder() = default; + + inline __aicore__ void SetArgs(__gm__ uint8_t *gm_a, __gm__ uint8_t *gm_b, const LcalWorkspaceInfo &workspace_info, + int32_t batch_size, int32_t m, int32_t k, 
int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, + __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset, int32_t dequant_group_size) + { + this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b); + gm_scale = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale); + if (gm_dequant_offset) { + gm_offset = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_offset); + has_offset = true; + } + group_size = dequant_group_size; + group_num = (this->k + group_size - 1) / group_size; + } + + inline __aicore__ void Run() + { + if (aligned_a) { + int n_rows = this->trans_a ? this->k : this->m; + int n_cols = this->trans_a ? this->m : this->k; + int n_cols_aligned = this->trans_a ? this->m_align : this->k_align; + + this->PadMatrix(this->gm_a_align, this->gm_a, n_rows, n_cols, n_cols_aligned); + } + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + + if (!trans_b && !has_offset) { + DequantAndPadMatrixNoOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (!trans_b && has_offset) { + DequantAndPadMatrixHasOffset(this->gm_b_align, this->gm_b, this->k, this->n, this->n_align); + } else if (trans_b && !has_offset) { + DequantAndPadMatrixTransposeNoOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } else { + DequantAndPadMatrixTransposeHasOffset(this->gm_b_align, this->gm_b, this->n, this->k, this->k_align); + } + + this->Barrier(); + } + +private: + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 10240; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + + int32_t n_blocks_per_row_b16 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); + uint8_t quant_repeat_b32 = static_cast( + DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + for (int32_t row = 0; row < max_rows_per_loop; ++row) { + int32_t row_idx = it.row_offset_this_core + it.n_rows_complete + row; + int32_t in_group_idx = row_idx % group_size; + if (in_group_idx == 0 || it.n_rows_complete + row == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row_b16 * Block32B::size; + int32_t group_idx = row_idx / group_size; + WaitEvent(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx * n_cols, + 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + ub_quant_args_root_offset = ub_quant_args_offset; + PipeBarrier(); + } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + 1, n_blocks_per_row_b32, 0, 0); + } + } + + WaitFlag(EVENT_ID1); + CopyUbufToGmAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFlag(EVENT_ID2); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + 
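+ // Reference semantics for the four PER_GROUP dequant paths in this class:
+ // rows of the int8 weight are grouped along k in runs of group_size, and
+ // each group carries one scale (and, optionally, one offset) per output
+ // column. A minimal host-side sketch of the same computation follows; the
+ // function and container names are illustrative only, assuming a row-major
+ // [k, n] layout, with the kernel finally rounding the product back to bf16.
+ //
+ //     #include <cstdint>
+ //     #include <vector>
+ //
+ //     // dst[row][col] = (float(src[row][col]) + offset[g][col]) * scale[g][col]
+ //     // with g = row / group_size; drop the offset term for the NoOffset paths.
+ //     std::vector<float> DequantPerGroupRef(const std::vector<int8_t> &src,
+ //                                           const std::vector<float> &scale,
+ //                                           const std::vector<float> &offset,
+ //                                           int32_t k, int32_t n, int32_t group_size)
+ //     {
+ //         std::vector<float> dst(static_cast<size_t>(k) * n);
+ //         for (int32_t row = 0; row < k; ++row) {
+ //             int32_t g = row / group_size;   // quant group of this row
+ //             for (int32_t col = 0; col < n; ++col) {
+ //                 float v = static_cast<float>(src[static_cast<size_t>(row) * n + col]);
+ //                 if (!offset.empty()) {
+ //                     v += offset[static_cast<size_t>(g) * n + col];
+ //                 }
+ //                 dst[static_cast<size_t>(row) * n + col] =
+ //                     v * scale[static_cast<size_t>(g) * n + col];
+ //             }
+ //         }
+ //         return dst;
+ //     }
+ //
+ // The Transpose variants compute the same values with B stored as [n, k],
+ // so the group index advances along a row and scale/offset are laid out
+ // [n, group_num].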
inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 8512; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; + + auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)34048); + auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)68096); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)102144); + auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)119168); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto scale = gm_scale + it.n_cols_complete; + auto offset = gm_offset + it.n_cols_complete; + + int32_t n_blocks_per_row_b16 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = + Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); + uint8_t quant_repeat_b32 = static_cast( + DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + int32_t ubuf_gap = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + for (int32_t row = 0; row < max_rows_per_loop; ++row) { + int32_t row_idx = it.row_offset_this_core + it.n_rows_complete + row; + int32_t in_group_idx = row_idx % group_size; + if (in_group_idx == 0 || it.n_rows_complete + row == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + int32_t group_idx = row_idx / group_size; + WaitFlag(EVENT_ID0); + CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx * n_cols, + 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID1); + Vconv(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale_origin + ub_quant_offset, quant_repeat_b32, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_quant_offset + ub_quant_args_offset, + ub_quant_offset_origin + ub_quant_args_offset, quant_repeat_b32, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + 
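+ // Remember which UB block holds this group's freshly loaded parameters so
+ // that later rows of the same group can replicate them with CopyUB2UB
+ // instead of re-reading GM.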
ub_quant_args_root_offset = ub_quant_args_offset; + PipeBarrier(); + } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { + int32_t ub_quant_args_offset = row * n_blocks_per_row_b32 * Block32B::size; + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + 1, n_blocks_per_row_b32, 0, 0); + CopyUB2UB(ub_quant_offset + ub_quant_args_offset, + ub_quant_offset + ub_quant_args_root_offset, /* sid */ 0, + 1, n_blocks_per_row_b32, 0, 0); + } + } + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID2); + + PipeBarrier(); + Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vconv(ub_mul, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFlag(EVENT_ID3); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID3); + + WaitFlag(EVENT_ID3); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); + SetFlag(EVENT_ID3); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 10240; + int32_t n_cols_round = Block32B::AlignUp(n_cols); + int32_t max_rows_per_loop = (n_cols_round <= MAX_LEN) ? (MAX_LEN / n_cols_round) : 1; + int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? 
n_cols : MAX_LEN; + + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); + auto ub_quant_scale = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); + + int32_t group_block = Block32B::Count(group_size); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + (it.row_offset_this_core + it.n_rows_complete) * group_num; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(float32_t) / sizeof(int8_t)); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + bool is_after_mte2 = false; + WaitFlag(EVENT_ID0); + for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { + int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; + int32_t in_group_idx = block_col_idx % group_block; + if (in_group_idx == 0 || block_col_idx == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + int32_t group_idx = block_col_idx / group_block; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < n_blocks_per_row_b16 || it.n_cols_complete == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } else { + PipeBarrier(); + } + Vconv(ub_quant_scale, ub_quant_scale_origin, repeat_b32, 1, 1, 8, 4); + is_after_mte2 = false; + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + 
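+ // EVENT_ID1 pairs the GM->UB load of ub_input with the Vconv that widens
+ // it: the copy must land before the vector pipe reads ub_input, and the
+ // Vconv must retire before the next iteration overwrites the buffer.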
+ WaitFlag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFlag(EVENT_ID2); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) + { + LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); + + const int32_t MAX_LEN = 8512; + int32_t max_rows_per_loop = (it.n_rows_this_core * Block32B::size <= MAX_LEN) ? + it.n_rows_this_core : MAX_LEN / Block32B::size; + int32_t max_cols_per_loop = (it.n_rows_this_core * Block32B::size <= MAX_LEN) ? + Block32B::AlignDown(MAX_LEN / it.n_rows_this_core) : Block32B::size; + + auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)0); + auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)34048); + auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)68096); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)102144); + auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)119168); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); + auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); + auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + + int32_t group_block = Block32B::Count(group_size); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto scale = gm_scale + (it.row_offset_this_core + it.n_rows_complete) * group_num; + auto offset = gm_offset + (it.row_offset_this_core + it.n_rows_complete) * group_num; + + int32_t n_blocks_per_row_b8 = Block32B::Count(max_cols_per_loop); + int32_t n_blocks_per_row_b16 = n_blocks_per_row_b8 * (sizeof(bfloat16_t) / sizeof(int8_t)); + int32_t n_blocks_per_row_b32 = n_blocks_per_row_b8 * (sizeof(float32_t) / sizeof(int8_t)); + + int32_t n_blocks_b16 = it.n_rows_this_loop * n_blocks_per_row_b16; + int32_t n_blocks_b32 = it.n_rows_this_loop * n_blocks_per_row_b32; + uint8_t repeat_b16 = static_cast( + DivCeil(n_blocks_b16, VEC_BLOCK_PER_REPEAT)); + uint8_t repeat_b32 = static_cast( + DivCeil(n_blocks_b32, VEC_BLOCK_PER_REPEAT)); + + int32_t ub_quant_args_root_offset = 0; + for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { + auto src = gm_src + it.src_offset(); + auto dst = gm_dst + it.dst_offset(); + + int32_t src_gap = n_cols - it.n_cols_this_loop; + int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; + + int32_t ubuf_gap_b8 = n_blocks_per_row_b8 - Block32B::Count(it.n_cols_this_loop); + int32_t ubuf_gap_b16 = n_blocks_per_row_b16 - Block32B::Count(it.n_cols_this_loop); + + bool is_after_mte2 = false; + 
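+ // is_after_mte2 records whether the quant-param blocks were last written by
+ // a GM->UB copy (then the EVENT_ID0 set/wait pair fences MTE2 against the
+ // vector pipe) or by an on-chip CopyUB2UB (then a PipeBarrier suffices).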
WaitFlag(EVENT_ID0); + for (int32_t block_col = 0; block_col < n_blocks_per_row_b16; ++block_col) { + int32_t block_col_idx = Block32B::Count(it.n_cols_complete) + block_col; + int32_t in_group_idx = block_col_idx % group_block; + if (in_group_idx == 0 || block_col_idx == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + int32_t group_idx = block_col_idx / group_block; + + if (ub_quant_args_offset == ub_quant_args_root_offset && !is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + CopyGmToUbufAlign(ub_quant_offset + ub_quant_args_offset, offset + group_idx, + it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = true; + ub_quant_args_root_offset = ub_quant_args_offset; + } else if (in_group_idx < n_blocks_per_row_b16 || it.n_cols_complete == 0) { + int32_t ub_quant_args_offset = block_col * Block32B::size; + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + } + CopyUB2UB(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + CopyUB2UB(ub_quant_offset + ub_quant_args_offset, + ub_quant_offset + ub_quant_args_root_offset, /* sid */ 0, + it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); + is_after_mte2 = false; + } + } + + if (is_after_mte2) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + is_after_mte2 = false; + } else { + PipeBarrier(); + } + Vconv(ub_quant_scale, ub_quant_scale_origin, repeat_b32, 1, 1, 8, 4); + Vconv(ub_quant_offset, ub_quant_offset_origin, repeat_b32, 1, 1, 8, 4); + is_after_mte2 = false; + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID1); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap, ubuf_gap_b8); + SetFlag(EVENT_ID1); + + WaitFlag(EVENT_ID1); + Vconv(ub_vconv_f16, ub_input, repeat_b16, 1, 1, 8, 4); + SetFlag(EVENT_ID1); + + PipeBarrier(); + Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + + PipeBarrier(); + Vadd(ub_add, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + Vmul(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + WaitFlag(EVENT_ID2); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); + SetFlag(EVENT_ID2); + } + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + } + + __gm__ half *gm_scale{ nullptr }; + __gm__ half *gm_offset{ nullptr }; + int32_t group_size; + int32_t group_num; + bool has_offset{ false }; +}; + +template +class Preprocessor { +public: + __aicore__ explicit Preprocessor() = default; + + FORCE_INLINE_AICORE void SetArgs(PP_MATMUL_AIV_PADDING_ARGS_FUN()) + { + this->is_int8 = is_int8; + this->dequant_granularity = dequant_granularity; + + int32_t m_align = is_int8 ? Block512B::AlignUp(m) : Block512B::AlignUp(m); + int32_t k_align = is_int8 ? Block512B::AlignUp(k) : Block512B::AlignUp(k); + int32_t n_align = is_int8 ? 
Block512B::AlignUp(n) : Block512B::AlignUp(n);
+
+ int32_t aligned_a, aligned_b;
+ AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b);
+
+ bool has_a_align = IsQuant(quant_granularity) || aligned_a;
+ bool has_b_align = IsQuant(dequant_granularity) || !is_int8 || aligned_b;
+ bool has_accum = IsQuant(dequant_granularity) && is_int8 && std::is_same::value;
+ bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR);
+ bool hasFormatDequantScale = (has_dequant_param || dequant_granularity == QuantGranularity::PER_CHANNEL);
+
+ if (weight_nz) {
+ aligned_b = 0;
+ has_b_align = false;
+ }
+ LcalWorkspaceInfo workspace_info = GetLcalWorkSpaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align,
+ trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, has_dequant_param,
+ hasFormatDequantScale, is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, m * EP * TP);
+
+ if (this->is_int8) {
+ switch (this->dequant_granularity) {
+ case QuantGranularity::PER_TENSOR:
+ padder_int8.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+ gm_dequant_offset, dequant_granularity);
+ return;
+ case QuantGranularity::PER_CHANNEL:
+ padder_int8.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
+ return;
+ case QuantGranularity::PER_TOKEN:
+ padder_int8.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
+ return;
+ case QuantGranularity::FLOAT32_SCALE_PER_CHANNEL:
+ padder_int8.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
+ return;
+ default:
+ return;
+ }
+ }
+ switch (this->dequant_granularity) {
+ case QuantGranularity::PER_TENSOR:
+ dequant_per_tensor_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+ gm_dequant_scale, gm_dequant_offset);
+ return;
+ case QuantGranularity::PER_CHANNEL:
+ dequant_per_channel_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+ gm_dequant_scale, gm_dequant_offset);
+ return;
+ case QuantGranularity::PER_GROUP:
+ dequant_per_group_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+ gm_dequant_scale, gm_dequant_offset, dequant_group_size);
+ return;
+ default:
+ padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
+ m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
+ return;
+ }
+ }
+
+ FORCE_INLINE_AICORE void Run(int32_t expert_per_rank = 1)
+ {
+ if (this->is_int8) {
+ padder_int8.Run(expert_per_rank);
+ return;
+ }
+ switch (this->dequant_granularity) {
+ case QuantGranularity::PER_TENSOR:
+ dequant_per_tensor_padder.Run();
+ return;
+ case QuantGranularity::PER_CHANNEL:
+ dequant_per_channel_padder.Run();
+ return;
+ case QuantGranularity::PER_GROUP:
+ dequant_per_group_padder.Run();
+ return;
+ default:
+ padder.Run(expert_per_rank);
+ return;
+ }
+ }
+
+private:
+ Padder padder;
+ Padder padder_int8;
+
+ DequantPadder dequant_per_tensor_padder;
+ DequantPadder dequant_per_channel_padder;
+ DequantPadder 
dequant_per_group_padder; + bool is_int8; + QuantGranularity dequant_granularity; +}; + +#endif + +#endif \ No newline at end of file -- Gitee From 433afb4ab46d2ec4e45bf709565bcd42968a0d5e Mon Sep 17 00:00:00 2001 From: Denver Date: Sun, 24 Aug 2025 11:18:49 +0800 Subject: [PATCH 137/414] fix some error --- src/kernels/coc_preprocessor.cce | 96 ++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index ea824834..4bae618f 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -2071,7 +2071,7 @@ public: } private: - inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -2125,15 +2125,20 @@ private: if (in_group_idx == 0 || it.n_rows_complete + row == 0) { int32_t ub_quant_args_offset = row * n_blocks_per_row_b16 * Block32B::size; int32_t group_idx = row_idx / group_size; - WaitEvent(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx * n_cols, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + Vconv(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale_origin + ub_quant_args_offset, quant_repeat_b32, 1, 1, 8, 4); + SetFlag(EVENT_ID0); + ub_quant_args_root_offset = ub_quant_args_offset; PipeBarrier(); } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { - int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + int32_t ub_quant_args_offset = row * n_blocks_per_row_b32 * Block32B::size; CopyUB2UB(ub_quant_scale + ub_quant_args_offset, ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); @@ -2141,7 +2146,7 @@ private: } WaitFlag(EVENT_ID1); - CopyUbufToGmAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); @@ -2149,10 +2154,10 @@ private: SetFlag(EVENT_ID1); PipeBarrier(); - Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); - Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + Vmul(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); WaitFlag(EVENT_ID2); @@ -2170,7 +2175,7 @@ private: WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -2189,7 +2194,7 @@ private: auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); @@ -2227,16 +2232,21 @@ private: int32_t 
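/* The hunks in this fix patch insert a Vconv after each scale load so the
 * bf16 quant arguments are widened to f32 in UB before the f32 Vadd/Vmul
 * dequant math runs; doing the arithmetic in f32 and rounding once at the
 * end avoids double rounding. Scalar equivalent, as a sketch with float
 * standing in for the widened values:
 *
 *   // widen once per group, then reuse for every element of the group
 *   // float s = static_cast<float>(scaleBf16[g]);
 *   // float o = static_cast<float>(offsetBf16[g]);
 *   // out = (static_cast<float>(q) + o) * s;
 */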
row_idx = it.row_offset_this_core + it.n_rows_complete + row; int32_t in_group_idx = row_idx % group_size; if (in_group_idx == 0 || it.n_rows_complete + row == 0) { - int32_t ub_quant_args_offset = row * n_blocks_per_row * Block32B::size; + int32_t ub_quant_args_offset = row * n_blocks_per_row_b16 * Block32B::size; int32_t group_idx = row_idx / group_size; WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx * n_cols, 1, it.n_cols_this_loop, 0); + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID0); + Vconv(ub_quant_scale + ub_quant_args_offset, + ub_quant_scale_origin + ub_quant_args_offset, quant_repeat_b32, 1, 1, 8, 4); SetFlag(EVENT_ID0); WaitFlag(EVENT_ID1); - Vconv(ub_quant_scale + ub_quant_args_offset, - ub_quant_scale_origin + ub_quant_offset, quant_repeat_b32, 1, 1, 8, 4); + CopyGmToUbufAlign(ub_quant_offset_origin + ub_quant_args_offset, + offset + group_idx * n_cols, it.n_cols_this_loop, 0); SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); @@ -2247,7 +2257,7 @@ private: ub_quant_args_root_offset = ub_quant_args_offset; PipeBarrier(); } else if (in_group_idx < max_rows_per_loop || it.n_rows_complete == 0) { - int32_t ub_quant_args_offset = row * n_blocks_per_row_b32 * Block32B::size; + int32_t ub_quant_args_offset = row * n_blocks_per_row_b32 * Block32B::size; CopyUB2UB(ub_quant_scale + ub_quant_args_offset, ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); @@ -2258,7 +2268,7 @@ private: } WaitFlag(EVENT_ID2); - CopyUbufToGmAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); + CopyGmToUbufAlign(ub_input, src, it.n_rows_this_loop, it.n_cols_this_loop, src_gap); SetFlag(EVENT_ID2); WaitFlag(EVENT_ID2); @@ -2266,10 +2276,13 @@ private: SetFlag(EVENT_ID2); PipeBarrier(); - Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); - Vconv(ub_mul, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + Vadd(ub_add, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); + + PipeBarrier(); + Vmul(ub_mul, ub_add, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); WaitFlag(EVENT_ID3); @@ -2288,7 +2301,7 @@ private: WaitFlag(EVENT_ID3); } - inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeNoOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -2300,7 +2313,7 @@ private: auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); - auto ub_quant_scale = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); + auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); @@ -2350,7 +2363,7 @@ private: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } - CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx, it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = true; ub_quant_args_root_offset = 
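/* ub_quant_args_root_offset caches where the current group's arguments were
 * last materialized in UB: the first tile of a group pays for a GM -> UB
 * load (MTE2), and later tiles of the same group replicate it with a cheaper
 * UB -> UB copy from the cached root offset. A minimal sketch of the reuse,
 * assuming memcpy stands in for CopyUB2UB:
 *
 *   #include <cstring>
 *   void BroadcastGroupArgs(float *ub, int rootOff, int dstOff, int count) {
 *       if (dstOff != rootOff) {
 *           std::memcpy(ub + dstOff, ub + rootOff, count * sizeof(float));
 *       }
 *   }
 */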
ub_quant_args_offset; @@ -2361,8 +2374,8 @@ private: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } - CopyUB2UB(ub_quant_scale + ub_quant_args_offset, - ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + CopyUB2UB(ub_quant_scale_origin + ub_quant_args_offset, + ub_quant_scale_origin + ub_quant_args_root_offset, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = false; } @@ -2387,10 +2400,10 @@ private: SetFlag(EVENT_ID1); PipeBarrier(); - Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); - Vconv(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); + Vmul(ub_mul, ub_vconv_f32, ub_quant_scale, repeat_b32, 1, 1, 1, 8, 8, 8); PipeBarrier(); WaitFlag(EVENT_ID2); @@ -2408,7 +2421,7 @@ private: WaitFlag(EVENT_ID2); } - inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ half *gm_dst, __gm__ int8_t *gm_src, + inline __aicore__ void DequantAndPadMatrixTransposeHasOffset(__gm__ bfloat16_t *gm_dst, __gm__ int8_t *gm_src, int32_t n_rows, int32_t n_cols, int32_t n_cols_aligned) { LoopIter it(this->batch_size, n_rows, n_cols, n_cols_aligned); @@ -2422,13 +2435,13 @@ private: auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)0); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)34048); auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)68096); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)102144); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)119168); auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 int32_t group_block = Block32B::Count(group_size); @@ -2475,24 +2488,24 @@ private: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } - CopyGmToUbufAlign(ub_quant_scale + ub_quant_args_offset, scale + group_idx, + CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx, it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); - CopyGmToUbufAlign(ub_quant_offset + ub_quant_args_offset, offset + group_idx, + CopyGmToUbufAlign(ub_quant_offset_origin + ub_quant_args_offset, offset + group_idx, it.n_rows_this_loop, 1, group_num - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = true; ub_quant_args_root_offset = ub_quant_args_offset; } else if (in_group_idx < n_blocks_per_row_b16 || it.n_cols_complete == 0) { - int32_t ub_quant_args_offset = block_col * Block32B::size; + int32_t ub_quant_args_offset = block_col * Block32B::size; if (is_after_mte2) { SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); } - CopyUB2UB(ub_quant_scale + ub_quant_args_offset, - ub_quant_scale + ub_quant_args_root_offset, /* sid */ 0, + CopyUB2UB(ub_quant_scale_origin + ub_quant_args_offset, + ub_quant_scale_origin + ub_quant_args_root_offset, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); - CopyUB2UB(ub_quant_offset + ub_quant_args_offset, - ub_quant_offset + 
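/* The is_after_mte2 flag in these loops records which pipe last wrote the
 * quant-arg buffers, so only the hazard that actually exists gets fenced:
 * after a GM load the code issues an MTE2 -> V SetFlag/WaitFlag pair before
 * vector use, and after a UB -> UB copy or Vconv it issues V -> MTE2 before
 * the next load. Control-flow sketch:
 *
 *   bool afterMte2 = false;
 *   // before a vector read:  if (afterMte2) { fence MTE2 -> V; afterMte2 = false; }
 *   // before an MTE2 load:   if (!afterMte2) { fence V -> MTE2; }
 *   // after an MTE2 load:    afterMte2 = true;
 */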
ub_quant_args_root_offset, /* sid */ 0, + CopyUB2UB(ub_quant_offset_origin + ub_quant_args_offset, + ub_quant_offset_origin + ub_quant_args_root_offset, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); is_after_mte2 = false; } @@ -2519,7 +2532,7 @@ private: SetFlag(EVENT_ID1); PipeBarrier(); - Vconv(ub_conv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); + Vconv(ub_vconv_f32, ub_vconv_f16, repeat_b32, 1, 1, 8, 4); PipeBarrier(); Vadd(ub_add, ub_vconv_f32, ub_quant_offset, repeat_b32, 1, 1, 1, 8, 8, 8); @@ -2534,7 +2547,7 @@ private: WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); - SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); } } } @@ -2543,8 +2556,8 @@ private: WaitFlag(EVENT_ID2); } - __gm__ half *gm_scale{ nullptr }; - __gm__ half *gm_offset{ nullptr }; + __gm__ bfloat16_t *gm_scale{ nullptr }; + __gm__ bfloat16_t *gm_offset{ nullptr }; int32_t group_size; int32_t group_num; bool has_offset{ false }; @@ -2568,7 +2581,7 @@ public: AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); bool has_a_align = IsQuant(quant_granularity) || aligned_a; - bool has_b_align = IsQuant(dequant_granularity) || !is_int8 || aligned_b; + bool has_b_align = IsQuant(dequant_granularity) && !is_int8 || aligned_b; bool has_accum = IsQuant(dequant_granularity) && is_int8 && std::is_same::value; bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR); bool hasFormatDequantScale = (has_dequant_param || dequant_granularity == QuantGranularity::PER_CHANNEL); @@ -2578,7 +2591,7 @@ public: has_b_align = false; } LcalWorkspaceInfo workspace_info = GetLcalWorkSpaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, - trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, has_dequant_param, + trans_a, trans_b, is_int8 ? 
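/* The booleans feeding GetLcalWorkSpaceInfo describe which scratch regions
 * the workspace must carry: has_a_align / has_b_align request padded copies
 * of A / B, has_accum requests an f32 accumulator for the int8 path, and the
 * element size is passed as is_int8 ? 1 : 2 bytes. A hypothetical sketch of
 * how such flags could fold into a byte count (the real layout is whatever
 * GetLcalWorkSpaceInfo computes):
 *
 *   #include <cstdint>
 *   int64_t WorkspaceBytes(int64_t m, int64_t k, int64_t n, int elemSize,
 *                          bool aAlign, bool bAlign, bool accum) {
 *       int64_t bytes = 0;
 *       if (aAlign) bytes += m * k * elemSize;  // padded copy of A
 *       if (bAlign) bytes += k * n * elemSize;  // padded copy of B
 *       if (accum)  bytes += m * n * 4;         // f32 accumulator
 *       return bytes;
 *   }
 */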
1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param, hasFormatDequantScale,is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, m * EP * TP); if (this->is_int8) { @@ -2611,13 +2624,12 @@ public: gm_dequant_scale, gm_dequant_offset); return; case QuantGranularity::PER_CHANNEL: - dequant_per_tensor_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + dequant_per_channel_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b, gm_dequant_scale, gm_dequant_offset); return; case QuantGranularity::PER_GROUP: - case QuantGranularity::PER_TENSOR: - dequant_per_tensor_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, + dequant_per_group_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b, gm_dequant_scale, gm_dequant_offset, dequant_group_size); return; @@ -2635,7 +2647,7 @@ public: return; } switch (this->dequant_granularity) { - case QuantGranularity:PER_TENSOR: + case QuantGranularity::PER_TENSOR: dequant_per_tensor_padder.Run(); return; case QuantGranularity::PER_CHANNEL: -- Gitee From aa38dc4b138af306bf838302eaf334ba4ae2cf70 Mon Sep 17 00:00:00 2001 From: Denver Date: Sun, 24 Aug 2025 11:21:19 +0800 Subject: [PATCH 138/414] fix some error again --- src/kernels/coc_preprocessor.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernels/coc_preprocessor.cce b/src/kernels/coc_preprocessor.cce index 4bae618f..98a8333f 100644 --- a/src/kernels/coc_preprocessor.cce +++ b/src/kernels/coc_preprocessor.cce @@ -2234,19 +2234,19 @@ private: if (in_group_idx == 0 || it.n_rows_complete + row == 0) { int32_t ub_quant_args_offset = row * n_blocks_per_row_b16 * Block32B::size; int32_t group_idx = row_idx / group_size; - WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyGmToUbufAlign(ub_quant_scale_origin + ub_quant_args_offset, scale + group_idx * n_cols, 1, it.n_cols_this_loop, 0); - SetFlag(EVENT_ID0); + SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); Vconv(ub_quant_scale + ub_quant_args_offset, ub_quant_scale_origin + ub_quant_args_offset, quant_repeat_b32, 1, 1, 8, 4); - SetFlag(EVENT_ID0); + SetFlag(EVENT_ID0); WaitFlag(EVENT_ID1); CopyGmToUbufAlign(ub_quant_offset_origin + ub_quant_args_offset, - offset + group_idx * n_cols, it.n_cols_this_loop, 0); + offset + group_idx * n_cols, 1, it.n_cols_this_loop, 0); SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); -- Gitee From 4540bc03a646c9264ead8938e45672d6349abd10 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 16:01:29 +0800 Subject: [PATCH 139/414] Implement AllReduceHierarchyDoubleRing class with core functionality and IPC queue management --- .../91093/allreduce_hierarchy_double_ring.h | 413 +++++++++++++++++- 1 file changed, 412 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index 9a893c3a..f6ede6d0 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -6,4 +6,415 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ +#ifndef LCCL_ALLREDUCE_HIERARCHY_DOUBLE_RING_H +#define LCCL_ALLREDUCE_HIERARCHY_DOUBLE_RING_H + +#include "sync_collectives.h" +#include "collectives.h" +#include "ipc_queue.h" +using namespace AscendC; + +template +class AllReduceHierarchyDoubleRing : protected Collectives { + constexpr static int32_t RING_LAYER_NUM = 2; + constexpr static int32_t SIO_CORE_NUM = 12; + constexpr static int32_t RING_CORE_NUM = 12; + constexpr static int32_t OUTPUT_CORE_NUM = 6; + constexpr static int32_t IPC_QUE_DEPTH = 32; + constexpr static int32_t RING_GATHER_QUE_DEPTH = 2; + constexpr static int32_t SIO_GATHER_QUE_GATHER = 2; + constexpr static int32_t INPUT_FLAG = 0 * RING_CORE_NUM; + constexpr static int32_t SIO_REDUCE_FLAG = 1 * RING_CORE_NUM; + constexpr static int32_t RING_REDUCE_FLAG = 2 * RING_CORE_NUM; + constexpr static int32_t RING_REDUCE_PEER_FLAG = 3 * RING_CORE_NUM; + constexpr static int32_t RING_GATHER_FLAG = 4 * RING_CORE_NUM; + constexpr static int32_t RING_GATHER_PEER_FLAG = 5 * RING_CORE_NUM; + constexpr static int32_t SIO_GATHER_PEER_FLAG = 6 * RING_CORE_NUM; + constexpr static int32_t SIO_GATHER_FLAG = 7 * RING_CORE_NUM; + constexpr static int32_t SIO_GATHER_OUTPUT_FLAG = 8 * RING_CORE_NUM; + constexpr static int32_t OUTPUT_FLAG = 9 * RING_CORE_NUM; + + + constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; + constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / SIO_CORE_NUM; + constexpr static int32_t OUTPUT_CORE_SCALE = RING_CORE_NUM / OUTPUT_CORE_NUM; + constexpr static int32_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); + +public: + FORCE_INLINE_AICORE AllReduceHierarchyDoubleRIng(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + atomOp = op; + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + blockNum = INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM + OUTPUT_CORE_NUM; + if (blockIdx >= blockNum) { + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + return; + } + sioLayerId = rank / RING_LAYER_NUM; + ringLayerId = rank % RING_LAYER_NUM; + ringRankSize = rankSize / RING_LAYER_NUM; + ringNextRankId = (sioLayerId + 1) % ringRankSize * RING_LAYER_NUM + ringLayerId; + ringPrevRankId = (sioLayerId + (ringRankSize - 1)) % ringRankSize * RING_LAYER_NUM + ringLayerId; + sioPeerRankId = sioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + ipcBlockNum = IPC_BUFF_MAX_SIZE / (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH + SIO_GATHER_QUE_DEPTH) / sizeof(T); + dmaPerLoop = ipcBlockNum - rankSize; + loopCount = CeilDiv(len, rankSize * damPerLoop); + const int64_t sumDataLastLoop = len - (loopCount - 1) * rankSize * dmaPerLoop; + dmaLastLoop = sumDataLastLoop / rankSize; + dmaLastRankLoop = sumDataLastLoop - (rankSize - 1) * dmaLastLoop; + totalBlockDataNum = (loopCount - 1) * dmaPerLoop + dmaLastLoop; + + InitQue(); + inputTensor.SetGlobalBuffer((__gm__ T*) input); + outputTensor.SetGlobalBuffer((__gm__ T*) output); + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + if (blockIdx >= blockNum) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + for (curLoopCnt = 0; curLoopCnt < loopCount; ++curLoopCnt) { + for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { + if (blockIdx < INPUT_CORE_NUM) { + Input2Ipc() + } else if (blockIdx < 
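/* AllReduceHierarchyDoubleRing assigns the AIV blocks four fixed roles by
 * index: the first INPUT_CORE_NUM blocks stage user input into local IPC
 * memory, the next SIO_CORE_NUM reduce across the SIO pair, the next
 * RING_CORE_NUM reduce around the ring, and the remaining OUTPUT_CORE_NUM
 * drain results to the output buffer. The same classification, standalone:
 *
 *   enum class CoreRole { Input, SioReduce, RingReduce, Output };
 *   CoreRole RoleOf(int blockIdx, int inputN, int sioN, int ringN) {
 *       if (blockIdx < inputN)                return CoreRole::Input;
 *       if (blockIdx < inputN + sioN)         return CoreRole::SioReduce;
 *       if (blockIdx < inputN + sioN + ringN) return CoreRole::RingReduce;
 *       return CoreRole::Output;
 *   }
 */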
INPUT_CORE_NUM + SIO_CORE_NUM) { + SioReduce(); + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { + RingReduce(); + } else { + PrepareOutput(); + } + ++ipcQueIdx; + } + } + for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { + if (blockIdx < INPUT_CORE_NUM) { + ; + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { + SioGather(); + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { + RingGather(); + } else { + Ipc2Output(); + } + ++gatherQueIdx; + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } +private: + IpcQueue inputQueList[INPUT_CORE_SCALE]; + IpcQueue sioQueList[SIO_CORE_SCALE]; + IpcQueue sioGatherSrc1QueList[SIO_CORE_SCALE]; + IpcQueue sioGatherSrc2QueList[SIO_CORE_SCALE]; + IpcQueue sioGatherDstQueList[SIO_CORE_SCALE]; + IpcQueue ringSrcQue; + IpcQueue ringDstQue; + IpcQueue outputSrc1QueList[OUTPUT_CORE_SCALE]; + IpcQueue outputSrc2QueList[OUTPUT_CORE_SCALE]; + IpcQueue outputSrc3QueList[OUTPUT_CORE_SCALE]; + + IpcQueue *intputQue = nullptr; + IpcQueue *sioQue = nullptr; + IpcQueue *sioGatherSrc1Que = nullptr; + IpcQueue *sioGatherSrc2Que = nullptr; + IpcQueue *sioGatherDstQue = nullptr; + IpcQueue *outputSrc1Que = nullptr; + IpcQueue *outputSrc2Que = nullptr; + IpcQueue *outputSrc3Que = nullptr; + GlobalTensor srcIpcTensor; + GlobalTensor dstIpcTensor; + GlobalTensor inputTensor; + GlobalTensor outputTensor; + int atomOp = COPYONLY; + int32_t sioLayerId = 0; + int32_t ringLayerId = 0; + int32_t ringRankSize = 0; + int32_t ringNextRankId = 0; + int32_t ringPrevRankId = 0; + int32_t sioPeerRankId = 0; + int32_t localBlockIdx = 0; + int64_t ipcBlockNum = 0; + int64_t totalBlockDataNum = 0; + int64_t dmaPerLoop = 0; + int64_t dmaLastLoop = 0; + + int32_t ipcQueIdx = 0; + int32_t gatherQueIdx = 0; + int64_t loopCount = 0; + int64_t curLoopCnt = 0; + int64_t sioLayerLoop = 0; + int64_t coreDataNum = 0; + int64_t lastCoreDataNum = 0; + int64_t curCoreDataNum = 0; + + FORCE_INLINE_AICORE void InitQue() + { + const int64_t dmaSizePerCore = ipcBlockNum / RING_CORE_NUM * sizeof(T); + const int64_t ipcBlockSize = ipcBlockNum * sizeof(T); + if (blockIdx < INPUT_CORE_NUM) { + for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; + inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; + sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + sioGatherSrc1QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + sioGatherSrc2QueList[blockLoop].Init(&sync, magic, shareAddrs[rankId] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + sioGatherDstQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + + (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { + 
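/* InitQue carves each rank's shared IPC buffer into three queue regions,
 * every one sliced per ring core by dmaSizePerCore: a reduce region of
 * IPC_QUE_DEPTH blocks at offset 0, a ring-gather region of
 * RING_GATHER_QUE_DEPTH blocks after it, and a SIO-gather region of
 * SIO_GATHER_QUE_DEPTH blocks at the end. The region offsets, restated:
 *
 *   #include <cstdint>
 *   struct IpcLayout { int64_t reduce; int64_t ringGather; int64_t sioGather; };
 *   IpcLayout LayoutOf(int64_t ipcBlockSize, int reduceDepth, int ringDepth) {
 *       return { 0,                          // depth IPC_QUE_DEPTH
 *                reduceDepth * ipcBlockSize, // depth RING_GATHER_QUE_DEPTH
 *                (reduceDepth + ringDepth) * ipcBlockSize };
 *   }
 */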
localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); + ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + ringGatherSrcQue.Init(&sync, magic, shareAddrs[rankPrevRankId] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + } else { + for (int32_t blockLoop = 0; blockLoop < OUTPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + + blockLoop; + outputSrc1QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * SIO_GATHER_QUE_DEPTH, ipcBlockNum); + outputSrc2QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + outputSrc3QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + } + } + } + + FORCE_INLINE_AICORE void Input2Ipc() + { + for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; + inputQue = &(inputQueList[blockLoop]); + Input2IpcByCore(); + } + } + + FORCE_INLINE_AICORE void Input2IpcByCore() + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - sioLayerLoop)) % ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; + + (*inputQue).DeQue(rank, RING_REDUCE_PEER_FLAG + localBlockIdx); + const int32_t consumedQueIdx = ipcQueIdx - (IPC_QUE_DEPTH + ringRankSize - 1); + if (consumedQueIdx >- 0 && consumerQueIdx % ringRankSize == 0) { + sync.WaitSyncFlag(magic, consumedQueIdx, OUTPUT_FLAG + localBlockIdx, rank); + sync.WaitSyncFlag(magic, consumedQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); + } + + BuildCoreDataNum(curLoopCnt, targetRankOffset); + srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + + localBlockIdx * coreDataNum]; + dstIpcTensor = (*inputQue).EnQue(); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); + sync.SetSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId); + } + + FORCE_INLINE_AICORE void SioReduce() + { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + if (sioLayerLoop < ringRankSize - 1) { + sioGatherSrc1QueList[blockLoop].ReadFront(); + } + } + if (curLoopCnt > 0 && sioLayerLoop == 0) { + return; + } + const int32_t endIdx = (curLoopCnt < loopCount - 1) && (sioLayerLoop == ringRankSize - 1) ? 
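/* endIdx gives SioReduce a one-step lookahead: on the last ring-layer step
 * of every loop except the final one, the core also reduces the first-layer
 * chunk of the next loop in the same pass (which is why loops after the
 * first skip sioLayerLoop == 0). Iteration sketch:
 *
 *   for (int i = 0; i <= endIdx; ++i) {                // endIdx is 0 or 1
 *       int loop  = curLoopCnt + i;
 *       int layer = (sioLayerLoop + i) % ringRankSize;  // wraps to layer 0
 *       int queue = ipcQueIdx + i;
 *       // reduce the chunk addressed by (loop, layer, queue)
 *   }
 */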
1 : 0; + for (int32_t i = 0; i <= endIdx; ++i) { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; + sioQue = &(sioQueList[blockLoop]); + SioReduceByCore(curLoopCnt + i, (sioLayerLoop + i) % ringRankSize, ipcQueIdx + i); + } + } + } + + FORCE_INLINE_AICORE void SioReduceByCore(int32_t newLoopCnt, int32_t newLayerLoop, int32_t newIpcQueIdx) + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - newLayerLoop)) % ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + + sync.WaitSyncFlag(magic, newQueIdx, INPUT_FLAG + localBlockIdx, rank); + BuildCoreDataNum(newLoopCnt, targetRankOffset); + srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + newLoopCnt * dmaPerLoop + + localBlockIdx * coreDataNum]; + dstIpcTensor = (*sioQue).EnQue(); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + sync.SetSyncFlag(magic, newQueIdx, SIO_REDUCE_FLAG + localBlockIdx, sioPeerRankId); + } + + FORCE_INLINE_AICORE void BuildCoreDataNum(int32_t processLoopIdx, int32_t targetRankOffset) + { + const int64_t damCurLoop = (processLoopIdx == loopCount - 1) ? + (targetRankOffset == rankSize - 1 ? dmaLastRankLoop : dmaLastLoop) : dmaPerLoop; + coreDataNum = ipcBlockNum / RING_CORE_NUM; + const int32_t maxIdx = damCurLoop / coreDataNum; + const int32_t lastIdx = maxIdx >= RING_CORE_NUM ? (RING_CORE_NUM - 1) : maxIdx; + + lastCoreDataNum = damCurLoop - lastIdx * coreDataNum; + curCoreDataNum = localBlockIdx < lastIdx ? coreDataNum : (localBlockIdx == lastIdx ? lastCoreDataNum : 0); + } + + FORCE_INLINE_AICORE coid SioGather() + { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; + sioGatherSrc1Que = &(sioGatherQueList[blockLoop]); + sioGatherSrc2Que = &(sioGatherQueList[blockLoop]); + sioGatherDstQue = &(sioGatherQueList[blockLoop]); + SioGatherByCore(); + } + } + + FORCE_INLINE_AICORE void SioGatherByCore() + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerLoop)) % ringRankSize; + const int32_t targetRankOffset = targetSioALayerId * RING_LAYER_NUM + ringLayerId; + + sync.SaitSyncFlag(maigc, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + if (gatherQueIdx >= SIO_GATHER_QUE_DEPTH) { + sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_QUE_DEPTH, SIO_GATHER_FLAG + localBlockIdx, rank); + } + BuildCoreDataNum(loopCount - 1, targetRankOffset); + srcIpcTensor = (*sioGatherSrc1Que).DeQue(); + dstIpcTensor = (*sioGatherDstQue).EnQue(); + CpGMPingPong2GM(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_QUE_DEPTH + localBlockIdx, sioPeerRankId); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, sioPeerRankId); + } + + FORCE_INLINE_AICORE void RingReduce() + { + if (sioLayerLoop == 0) { + ringDstQue.ReadFront(); + return; + } + + const int32_t consumeQueIdx = ipcQueIdx - 1; + sync.WaitSyncFlag(magic, consumeQueIdx + 1, SIO_REDUCE_FLAG + localBlockIdx, rank); + if (sioLayerLoop == 1) { + sync.WaitSyncFlag(maigc, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); + } else { + sync.WaitSyncFlag(maigc, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, + ringPrevRankId); + } + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 -sioLayerLoop)) % 
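/* Rank numbering in the hierarchy: each SIO pair holds RING_LAYER_NUM (2)
 * ring layers, so rank = sioLayer * RING_LAYER_NUM + ringLayer, and the
 * reduce target walks backwards around the ring as the step index grows.
 * The neighbor formulas from Init, restated standalone:
 *
 *   int RankOf(int sioLayer, int ringLayer, int layerNum) {
 *       return sioLayer * layerNum + ringLayer;
 *   }
 *   int RingNext(int sioLayer, int ringLayer, int ringRankSize, int layerNum) {
 *       return (sioLayer + 1) % ringRankSize * layerNum + ringLayer;
 *   }
 *   int RingPrev(int sioLayer, int ringLayer, int ringRankSize, int layerNum) {
 *       return (sioLayer + ringRankSize - 1) % ringRankSize * layerNum + ringLayer;
 *   }
 */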
ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; + BuildCoreDataNum(curLoopCnt, targetRankOffset); + srcIpcTensor = ringSrcQue.ReadFront(); + dstIpcTensor = ringDstQue.ReadFront(); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + sync.SetSyncFlag(magic, consumedQueIdx, RING_REDUCE_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, consumedQueIdx, RING_REDUCE_PEER_FLAG + localBlockIdx, ringPrevRankId); + } + + FORCE_INLINE_AICORE void RingGather() + { + if (sioLayerLoop == 0) { + sync.SetSyncFlag(maigc, gatehrQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); + return; + } + + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerloop)) % ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; + sync.WaitSyncFlag(magic, gatherQueIdx - 1, SIO_GATHER_FLAG + localBlockIdx, ringPrevRankId); + if (gatherQueIdx > RING_GATHER_QUE_DEPTH) { + sync.WaitSyncFlag(magic, gatherQueIdx - RING_GATHER_QUE_DEPTH, OUTPUT_FLAG + localBlockIdx, rank); + if (targetRankOffset != ringPrevRankId) { + sync.WaitSyncFlag(magic, gatherQueIdx - RING_GATHER_QUE_DEPTH, RING_GATHER_PEER_FLAG + localBlockIdx, + rank); + } + } + + BuildCoreDataNum(curLoopCnt, targetRankOffset); + if (sioLayerLoop == 1) { + srcIpcTensor = ringSrcQue.ReadFront(); + } else { + srcIpcTensor = ringGatherSrcQue.ReadFront(); + } + dstipcTensor = ringGatherDstQue.ReadFront(); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); + sync.SetSyncFlag(magic, gatehrQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + if (gatherQueIdx > 0) { + sync.SetSyncFlag(magic, gatherQueIdx - 1, RING_GATHER_PEER_FLAG + localBlockIdx, rank); + } + if (sioLayerLoop == ringRankSize - 1) { + ringGatherSrcQue.ReadFront(); + } + } + + FORCE_INLINE_AICORE coid PrepareOutput() + { + for (int32_t blockLoop = 0; blockLoop < OUTPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + + blockLoop; + if (sioLayerLoop < ringRankSize - 1) { + outputSrc3QueList[blockLoop].ReadFront(); + } + } + } + + FORCE_INLINE_AICORE void Ipc2Output() + { + for (int32_t blockLoop = 0; blockLoop < OUTPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + + blockLoop; + outputSrc1Que = &(outputSrc1QueList[blockLoop]); + outputSrc2Que = &(outputSrc1QueList[blockLoop]); + outputSrc3Que = &(outputSrc1QueList[blockLoop]); + Ipc2OutputByCore(); + } + } + + FORCE_INLINE_AICORE void Ipc2OutputByCore() + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerLoop)) % ringRankSize; + const int32_t targetSioRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + const int32_t targetRingRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; + BuildCoreDataNum(curLoopCnt, targetSioRankOffset); + sync.WaitSyncFlag(magic, gatherQueIdx, SIO_GATHER_PEER_FLAG + localBlockIdx, rank); + srcIpcTensor = (*outputSrc1Que).ReadFront(); + dstIpcTensor = outputTensor[targetSioRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + + localBlockIdx * coreDataNum]; + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_OUTPUT_FLAG + localBlockIdx, sioPeerRankId); + 
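/* Ipc2OutputByCore drains two sources per gather step: the chunk the SIO
 * peer deposited in the SIO-gather region, then the locally gathered ring
 * chunk; each copy raises its own flag so upstream producers can recycle
 * their queue slots. The output addressing used for both copies, restated:
 *
 *   #include <cstdint>
 *   int64_t OutChunkOffset(int rankOffset, int64_t blockDataNum, int loopCnt,
 *                          int64_t dmaPerLoop, int coreIdx, int64_t coreDataNum) {
 *       return rankOffset * blockDataNum + loopCnt * dmaPerLoop
 *            + coreIdx * coreDataNum;
 *   }
 */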
BuildCoreDataNum(curLoopCnt, targetRingRankOffset); + sync.WaitSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, rank); + srcIpcTensor = sioLayerLoop == 0 ? (*outputSrc3Que).ReadFront() : (*outputSrc2Que).ReadFront(); + dstIpcTensor = outputTensor[targetRingRankOffset * totalBlockDataNum + + curLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_PEER_FLAG + localBlockIdx, rank); + } +}; +#endif // LCCL_ALLREDUCE_HIERARCHY_DOUBLE_RING_H \ No newline at end of file -- Gitee From 6f35e16e15c46c7daf7a2481ebb8752720272840 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 16:08:15 +0800 Subject: [PATCH 140/414] 1 --- .../91093/allreduce_hierarchy_double_ring.h | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index f6ede6d0..0c5cc3f5 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -18,12 +18,13 @@ using namespace AscendC; template class AllReduceHierarchyDoubleRing : protected Collectives { constexpr static int32_t RING_LAYER_NUM = 2; + constexpr static int32_t INPUT_CORE_NUM = 4; constexpr static int32_t SIO_CORE_NUM = 12; constexpr static int32_t RING_CORE_NUM = 12; constexpr static int32_t OUTPUT_CORE_NUM = 6; constexpr static int32_t IPC_QUE_DEPTH = 32; - constexpr static int32_t RING_GATHER_QUE_DEPTH = 2; - constexpr static int32_t SIO_GATHER_QUE_GATHER = 2; + constexpr static int32_t RING_GATHER_QUE_DEPTH = 3; + constexpr static int32_t SIO_GATHER_QUE_DEPTH = 2; constexpr static int32_t INPUT_FLAG = 0 * RING_CORE_NUM; constexpr static int32_t SIO_REDUCE_FLAG = 1 * RING_CORE_NUM; constexpr static int32_t RING_REDUCE_FLAG = 2 * RING_CORE_NUM; @@ -39,10 +40,10 @@ class AllReduceHierarchyDoubleRing : protected Collectives { constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / SIO_CORE_NUM; constexpr static int32_t OUTPUT_CORE_SCALE = RING_CORE_NUM / OUTPUT_CORE_NUM; - constexpr static int32_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); + constexpr static int64_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); public: - FORCE_INLINE_AICORE AllReduceHierarchyDoubleRIng(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE AllReduceHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { @@ -62,7 +63,7 @@ public: sioPeerRankId = sioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; ipcBlockNum = IPC_BUFF_MAX_SIZE / (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH + SIO_GATHER_QUE_DEPTH) / sizeof(T); dmaPerLoop = ipcBlockNum - rankSize; - loopCount = CeilDiv(len, rankSize * damPerLoop); + loopCount = CeilDiv(len, rankSize * dmaPerLoop); const int64_t sumDataLastLoop = len - (loopCount - 1) * rankSize * dmaPerLoop; dmaLastLoop = sumDataLastLoop / rankSize; dmaLastRankLoop = sumDataLastLoop - (rankSize - 1) * dmaLastLoop; @@ -80,6 +81,7 @@ public: if (blockIdx >= blockNum) { DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); return; + } for (curLoopCnt = 0; curLoopCnt < loopCount; ++curLoopCnt) { for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) 
{ if (blockIdx < INPUT_CORE_NUM) { @@ -116,11 +118,13 @@ private: IpcQueue sioGatherDstQueList[SIO_CORE_SCALE]; IpcQueue ringSrcQue; IpcQueue ringDstQue; + IpcQueue ringGatherSrcQue; + IpcQueue ringGatherDstQue; IpcQueue outputSrc1QueList[OUTPUT_CORE_SCALE]; IpcQueue outputSrc2QueList[OUTPUT_CORE_SCALE]; IpcQueue outputSrc3QueList[OUTPUT_CORE_SCALE]; - IpcQueue *intputQue = nullptr; + IpcQueue *inputQue = nullptr; IpcQueue *sioQue = nullptr; IpcQueue *sioGatherSrc1Que = nullptr; IpcQueue *sioGatherSrc2Que = nullptr; @@ -144,12 +148,12 @@ private: int64_t totalBlockDataNum = 0; int64_t dmaPerLoop = 0; int64_t dmaLastLoop = 0; - + int64_t dmaLastRankLoop = 0; int32_t ipcQueIdx = 0; int32_t gatherQueIdx = 0; - int64_t loopCount = 0; - int64_t curLoopCnt = 0; - int64_t sioLayerLoop = 0; + int32_t loopCount = 0; + int32_t curLoopCnt = 0; + int32_t sioLayerLoop = 0; int64_t coreDataNum = 0; int64_t lastCoreDataNum = 0; int64_t curCoreDataNum = 0; @@ -171,12 +175,12 @@ private: dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); sioGatherSrc1QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); - sioGatherSrc2QueList[blockLoop].Init(&sync, magic, shareAddrs[rankId] + IPC_DATA_OFFSET + + sioGatherSrc2QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); sioGatherDstQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize + dmaSizePerCore * localBlockIdx, - ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + ipcBlockNum * SIO_GATHER_QUE_DEPTH, ipcBlockNum); } } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); @@ -184,9 +188,6 @@ private: dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); - ringGatherSrcQue.Init(&sync, magic, shareAddrs[rankPrevRankId] + IPC_DATA_OFFSET + - IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, - ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); -- Gitee From 99220fc529a5202a19dbea36e1692cac76127e1d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 16:10:07 +0800 Subject: [PATCH 141/414] 3 --- .../91093/allreduce_hierarchy_double_ring.h | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index 0c5cc3f5..95b5f663 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -85,7 +85,7 @@ public: for (curLoopCnt = 0; curLoopCnt < loopCount; ++curLoopCnt) { for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { if (blockIdx < INPUT_CORE_NUM) { - Input2Ipc() + Input2Ipc(); } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { SioReduce(); } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { 
@@ -95,18 +95,19 @@ public: } ++ipcQueIdx; } - } - for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { - if (blockIdx < INPUT_CORE_NUM) { - ; - } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { - SioGather(); - } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { - RingGather(); - } else { - Ipc2Output(); + + for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { + if (blockIdx < INPUT_CORE_NUM) { + ; + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { + SioGather(); + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { + RingGather(); + } else { + Ipc2Output(); + } + ++gatherQueIdx; } - ++gatherQueIdx; } DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); } -- Gitee From bc138b9495618b611666a84871970869a2b6276e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 16:15:57 +0800 Subject: [PATCH 142/414] 5 --- .../91093/allreduce_hierarchy_double_ring.h | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index 95b5f663..2b875fa4 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -189,7 +189,7 @@ private: dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); - ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + ringGatherSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + @@ -206,7 +206,7 @@ private: IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); outputSrc3QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + - dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } } } @@ -227,7 +227,7 @@ private: (*inputQue).DeQue(rank, RING_REDUCE_PEER_FLAG + localBlockIdx); const int32_t consumedQueIdx = ipcQueIdx - (IPC_QUE_DEPTH + ringRankSize - 1); - if (consumedQueIdx >- 0 && consumerQueIdx % ringRankSize == 0) { + if (consumedQueIdx >- 0 && consumedQueIdx % ringRankSize == 0) { sync.WaitSyncFlag(magic, consumedQueIdx, OUTPUT_FLAG + localBlockIdx, rank); sync.WaitSyncFlag(magic, consumedQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); } @@ -260,7 +260,7 @@ private: } } - FORCE_INLINE_AICORE void SioReduceByCore(int32_t newLoopCnt, int32_t newLayerLoop, int32_t newIpcQueIdx) + FORCE_INLINE_AICORE void SioReduceByCore(int32_t newLoopCnt, int32_t newLayerLoop, int32_t newQueIdx) { const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - newLayerLoop)) % ringRankSize; const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; @@ -286,13 +286,13 @@ private: curCoreDataNum = localBlockIdx < lastIdx ? coreDataNum : (localBlockIdx == lastIdx ? 
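/* BuildCoreDataNum splits one loop's DMA payload across the ring cores:
 * cores below lastIdx take a full coreDataNum share, the core at lastIdx
 * takes the remainder, and cores past the data get zero. Equivalent
 * standalone form:
 *
 *   #include <cstdint>
 *   int64_t CoreShare(int64_t total, int64_t perCore, int coreIdx, int coreNum) {
 *       int64_t maxIdx  = total / perCore;
 *       int64_t lastIdx = maxIdx >= coreNum ? coreNum - 1 : maxIdx;
 *       if (coreIdx < lastIdx)  return perCore;
 *       if (coreIdx == lastIdx) return total - lastIdx * perCore;
 *       return 0;
 *   }
 */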
lastCoreDataNum : 0); } - FORCE_INLINE_AICORE coid SioGather() + FORCE_INLINE_AICORE void SioGather() { for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; - sioGatherSrc1Que = &(sioGatherQueList[blockLoop]); - sioGatherSrc2Que = &(sioGatherQueList[blockLoop]); - sioGatherDstQue = &(sioGatherQueList[blockLoop]); + sioGatherSrc1Que = &(sioGatherSrc1QueList[blockLoop]); + sioGatherSrc2Que = &(sioGatherSrc2QueList[blockLoop]); + sioGatherDstQue = &(sioGatherDstQueList[blockLoop]); SioGatherByCore(); } } @@ -300,16 +300,16 @@ private: FORCE_INLINE_AICORE void SioGatherByCore() { const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerLoop)) % ringRankSize; - const int32_t targetRankOffset = targetSioALayerId * RING_LAYER_NUM + ringLayerId; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; - sync.SaitSyncFlag(maigc, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + sync.WaitSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); if (gatherQueIdx >= SIO_GATHER_QUE_DEPTH) { - sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_QUE_DEPTH, SIO_GATHER_FLAG + localBlockIdx, rank); + sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_OUTPUT_QUE_DEPTH, SIO_GATHER_FLAG + localBlockIdx, rank); } - BuildCoreDataNum(loopCount - 1, targetRankOffset); + BuildCoreDataNum(curLoopCnt, targetRankOffset); srcIpcTensor = (*sioGatherSrc1Que).DeQue(); - dstIpcTensor = (*sioGatherDstQue).EnQue(); - CpGMPingPong2GM(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + dstIpcTensor = (*sioGatherDstQue).ReadFront(); + CpGMPingPong2GM(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_QUE_DEPTH + localBlockIdx, sioPeerRankId); sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, sioPeerRankId); } @@ -324,9 +324,9 @@ private: const int32_t consumeQueIdx = ipcQueIdx - 1; sync.WaitSyncFlag(magic, consumeQueIdx + 1, SIO_REDUCE_FLAG + localBlockIdx, rank); if (sioLayerLoop == 1) { - sync.WaitSyncFlag(maigc, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); + sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); } else { - sync.WaitSyncFlag(maigc, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, + sync.WaitSyncFlag(magic, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, ringPrevRankId); } const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 -sioLayerLoop)) % ringRankSize; @@ -342,7 +342,7 @@ private: FORCE_INLINE_AICORE void RingGather() { if (sioLayerLoop == 0) { - sync.SetSyncFlag(maigc, gatehrQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); return; } @@ -408,15 +408,15 @@ private: srcIpcTensor = (*outputSrc1Que).ReadFront(); dstIpcTensor = outputTensor[targetSioRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; - CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_OUTPUT_FLAG + localBlockIdx, sioPeerRankId); BuildCoreDataNum(curLoopCnt, targetRingRankOffset); sync.WaitSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, rank); 
srcIpcTensor = sioLayerLoop == 0 ? (*outputSrc3Que).ReadFront() : (*outputSrc2Que).ReadFront(); dstIpcTensor = outputTensor[targetRingRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; - CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); - sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_PEER_FLAG + localBlockIdx, rank); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); + sync.SetSyncFlag(magic, gatherQueIdx, OUTPUT_FLAG + localBlockIdx, rank); } }; #endif // LCCL_ALLREDUCE_HIERARCHY_DOUBLE_RING_H \ No newline at end of file -- Gitee From 2e81618520a10eb996a08fd9e1e9ac38f2d4c609 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 16:20:33 +0800 Subject: [PATCH 143/414] 9 --- .../91093/allreduce_hierarchy_double_ring.h | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index 2b875fa4..883e8f20 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -227,7 +227,7 @@ private: (*inputQue).DeQue(rank, RING_REDUCE_PEER_FLAG + localBlockIdx); const int32_t consumedQueIdx = ipcQueIdx - (IPC_QUE_DEPTH + ringRankSize - 1); - if (consumedQueIdx >- 0 && consumedQueIdx % ringRankSize == 0) { + if (consumedQueIdx >= 0 && consumedQueIdx % ringRankSize == 0) { sync.WaitSyncFlag(magic, consumedQueIdx, OUTPUT_FLAG + localBlockIdx, rank); sync.WaitSyncFlag(magic, consumedQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); } @@ -304,14 +304,14 @@ private: sync.WaitSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); if (gatherQueIdx >= SIO_GATHER_QUE_DEPTH) { - sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_OUTPUT_QUE_DEPTH, SIO_GATHER_FLAG + localBlockIdx, rank); + sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_QUE_DEPTH, SIO_GATHER_OUTPUT_FLAG + localBlockIdx, rank); } BuildCoreDataNum(curLoopCnt, targetRankOffset); - srcIpcTensor = (*sioGatherSrc1Que).DeQue(); + srcIpcTensor = (sioLayerLoop == 0 ? 
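/* The SetSyncFlag/WaitSyncFlag pairs in these hunks are the cross-rank
 * handshake: a producer publishes (magic, queueIdx) into a flag slot and the
 * consumer spins until it appears, with magic distinguishing kernel launches
 * so stale flags from an earlier call never match. A host-side analogue, as
 * a sketch with std::atomic standing in for the GM flag word:
 *
 *   #include <atomic>
 *   #include <cstdint>
 *   void SetSync(std::atomic<int64_t> &flag, int64_t magic, int64_t idx) {
 *       flag.store(magic + idx, std::memory_order_release);
 *   }
 *   void WaitSync(const std::atomic<int64_t> &flag, int64_t magic, int64_t idx) {
 *       while (flag.load(std::memory_order_acquire) != magic + idx) {
 *           // spin until the producer publishes this (magic, idx) pair
 *       }
 *   }
 */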
(*sioGatherSrc1Que).ReadFront() : (*sioGatherSrc2Que).ReadFront()); dstIpcTensor = (*sioGatherDstQue).ReadFront(); - CpGMPingPong2GM(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); - sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_QUE_DEPTH + localBlockIdx, sioPeerRankId); - sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, sioPeerRankId); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_PEER_FLAG + localBlockIdx, sioPeerRankId); + sync.SetSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, rank); } FORCE_INLINE_AICORE void RingReduce() @@ -321,8 +321,8 @@ private: return; } - const int32_t consumeQueIdx = ipcQueIdx - 1; - sync.WaitSyncFlag(magic, consumeQueIdx + 1, SIO_REDUCE_FLAG + localBlockIdx, rank); + const int32_t consumedQueIdx = ipcQueIdx - 1; + sync.WaitSyncFlag(magic, consumedQueIdx + 1, SIO_REDUCE_FLAG + localBlockIdx, rank); if (sioLayerLoop == 1) { sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); } else { @@ -343,13 +343,13 @@ private: { if (sioLayerLoop == 0) { sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); - sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_PEER_FLAG + localBlockIdx, ringPrevRankId); return; } - const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerloop)) % ringRankSize; + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerLoop)) % ringRankSize; const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; - sync.WaitSyncFlag(magic, gatherQueIdx - 1, SIO_GATHER_FLAG + localBlockIdx, ringPrevRankId); + sync.WaitSyncFlag(magic, gatherQueIdx - 1, RING_GATHER_FLAG + localBlockIdx, ringPrevRankId); if (gatherQueIdx > RING_GATHER_QUE_DEPTH) { sync.WaitSyncFlag(magic, gatherQueIdx - RING_GATHER_QUE_DEPTH, OUTPUT_FLAG + localBlockIdx, rank); if (targetRankOffset != ringPrevRankId) { @@ -364,18 +364,18 @@ private: } else { srcIpcTensor = ringGatherSrcQue.ReadFront(); } - dstipcTensor = ringGatherDstQue.ReadFront(); + dstIpcTensor = ringGatherDstQue.ReadFront(); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); - sync.SetSyncFlag(magic, gatehrQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); if (gatherQueIdx > 0) { - sync.SetSyncFlag(magic, gatherQueIdx - 1, RING_GATHER_PEER_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, gatherQueIdx - 1, RING_GATHER_PEER_FLAG + localBlockIdx, ringPrevRankId); } if (sioLayerLoop == ringRankSize - 1) { ringGatherSrcQue.ReadFront(); } } - FORCE_INLINE_AICORE coid PrepareOutput() + FORCE_INLINE_AICORE void PrepareOutput() { for (int32_t blockLoop = 0; blockLoop < OUTPUT_CORE_SCALE; ++blockLoop) { localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + @@ -392,8 +392,8 @@ private: localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + blockLoop; outputSrc1Que = &(outputSrc1QueList[blockLoop]); - outputSrc2Que = &(outputSrc1QueList[blockLoop]); - outputSrc3Que = &(outputSrc1QueList[blockLoop]); + outputSrc2Que = &(outputSrc2QueList[blockLoop]); + outputSrc3Que = &(outputSrc3QueList[blockLoop]); Ipc2OutputByCore(); } } -- Gitee From 4d1d80d7328939cd96494a0e6f492e21229b8409 Mon Sep 17 
00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 19:48:51 +0800 Subject: [PATCH 144/414] 7 --- .../reduce_scatter_big_data_91093_4step.h | 331 +++++++++++++++++- 1 file changed, 330 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 9a893c3a..c2e645c4 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -6,4 +6,333 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. - */ \ No newline at end of file + */ + +#ifndef LCCL_REDUCE_SCATTER_BIG_DATA_91093_4STEP_H +#define LCCL_REDUCE_SCATTER_BIG_DATA_91093_4STEP_H + +#include "sync_collectives.h" +#include "collectives.h" +#include "ipc_queue.h" + +constexpr int PER_STEP_BLOCKNUM = 8; +constexpr int ARRAY_MAX_SIZE = 10; +constexpr int NUM_OF_TWO = 2; +constexpr int NUM_OF_THREE = 3; +constexpr int NUM_OF_FOUR = 4; + +template +class ReduceScatterBigData91093_4step : protected Collectives { +public: + __aicore__ inline ReduceScatterBigData91093_4step(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + __aicore__ inline void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + constexpr int IPC_QUEUE_DEPTH_91093 = NUM_OF_FOUR; + atomOp = op; + relaBlockIdx = blockIdx % PER_STEP_BLOCKNUM; + ipcSizeOfBlock = IPC_BUFF_MAX_SIZE / rankSize; + ipcNumOfBlock = ipcSizeOfBlock / sizeof(T); + ipcBlockNum = ipcNumOfBlock / IPC_QUEUE_DEPTH_91093; + totalBlockDataNum = len; + loopCount = CeilDiv(totalBlockDataNum, ipcBlockNum); + dstOutputGlobal.SetGlobalBuffer((__gm__ T*)output, totalBlockDataNum); + if ((rank % NUM_OF_TWO) == 0) { + adjPeerRank = rank + 1; + } else { + adjPeerRank = rank - 1; + } + StepRankPerCoreInit(); + IpcQueueInit(); + if ((blockIdx / PER_STEP_BLOCKNUM) == 0) { + for (int i = 0; i < stepOneRankPerCore; i++) { + srcInputGlobal[i].SetGlobalBuffer((__gm__ T*)input + (blockIdx * stepOneOriginRankPerCore + i) * + totalBlockDataNum, totalBlockDataNum); + } + } + DumpLcclLogInfo(LogId::INIT, static_cast(op)); + } + + __aicore__ inline void StepRankPerCoreInit() + { + int halfRankSize = rankSize / NUM_OF_TWO; + stepOneOriginRankPerCore = CeilDiv(rankSize, PER_STEP_BLOCKNUM); + stepTwoOriginRankPerCore = CeilDiv(halfRankSize, PER_STEP_BLOCKNUM); + stepThreeOriginRankPerCore = CeilDiv(halfRankSize, PER_STEP_BLOCKNUM); + stepOneInUseBlockNum = CeilDiv(rankSize, stepOneOriginRankPerCore); + stepTwoInUseBlockNum = CeilDiv(halfRankSize, stepTwoOriginRankPerCore); + stepThreeInUseBlockNum = CeilDiv(halfRankSize, stepThreeOriginRankPerCore); + if ((blockIdx / PER_STEP_BLOCKNUM) == 0) { + if ((blockIdx % PER_STEP_BLOCKNUM) == (stepOneInUseBlockNum - 1)) { + stepOneRankPerCore = rankSize - (blockIdx % PER_STEP_BLOCKNUM) * stepOneOriginRankPerCore; + } else { + stepOneRankPerCore = stepOneOriginRankPerCore; + } + } else if ((blockIdx / PER_STEP_BLOCKNUM) == 1) { + if ((blockIdx % PER_STEP_BLOCKNUM) == (stepTwoInUseBlockNum - 1)) { + stepTwoRankPerCore = halfRankSize - (blockIdx % PER_STEP_BLOCKNUM) * stepTwoOriginRankPerCore; + } else { 
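/* StepRankPerCoreInit spreads each step's rank list over at most
 * PER_STEP_BLOCKNUM cores with a ceil-div share; the last in-use core takes
 * whatever remains. Standalone restatement:
 *
 *   int CeilDivInt(int a, int b) { return (a + b - 1) / b; }
 *   int RanksForCore(int rankCount, int coreIdx, int coreNum) {
 *       int perCore = CeilDivInt(rankCount, coreNum);
 *       int inUse   = CeilDivInt(rankCount, perCore);
 *       if (coreIdx >= inUse)     return 0;
 *       if (coreIdx == inUse - 1) return rankCount - coreIdx * perCore;
 *       return perCore;
 *   }
 */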
stepTwoRankPerCore = stepTwoOriginRankPerCore; + } + } else if ((blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_TWO || (blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_THREE) { + if (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) == (stepThreeInUseBlockNum - 1)) { + stepThreeRankPerCore = halfRankSize - ((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / + NUM_OF_TWO) * stepThreeOriginRankPerCore; + } else { + stepThreeRankPerCore = stepThreeOriginRankPerCore; + } + } + } + + __aicore__ inline void IpcQueueInit() + { + int ipcRank; + if ((blockIdx / PER_STEP_BLOCKNUM) == 0) { + for (int i = 0; i < stepOneRankPerCore; i++) { + ipcRank = blockIdx * stepOneOriginRankPerCore + i; + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + } + } else if ((blockIdx / PER_STEP_BLOCKNUM) == 1) { + for (int i = 0; i < stepTwoRankPerCore; i++) { + ipcRank = ((blockIdx % PER_STEP_BLOCKNUM) * stepTwoOriginRankPerCore + i) * + NUM_OF_TWO + (rank % NUM_OF_TWO); + readIpcQue[i].Init(&sync, magic, shareAddrs[adjPeerRank] + IPC_DATA_OFFSET + ipcRank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + } + } else if ((blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_TWO || (blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_THREE) { + for (int i = 0; i < stepThreeRankPerCore; i++) { + stepThreeRank = (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) * + stepThreeOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + readIpcQue[i].Init(&sync, magic, shareAddrs[stepThreeRank] + IPC_DATA_OFFSET + rank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + } + } else if (blockIdx == (NUM_OF_FOUR * PER_STEP_BLOCKNUM)) { + readIpcQue[0].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * + ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); + } + } + + __aicore__ inline void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + int stepIndex = blockIdx / PER_STEP_BLOCKNUM; + if (stepIndex == 0 && ((relaBlockIdx * stepOneOriginRankPerCore) >= rankSize)) { + DumpLcclLogInfo(Log::PROCESS, static_cast(atomOp)); + return; + } + if (stepIndex == 1 && ((relaBlockIdx * stepTwoOriginRankPerCore) >= (rankSize / NUM_OF_TWO))) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + } + if ((stepIndex == NUM_OF_TWO || stepIndex == NUM_OF_THREE) && ((blockIdx - PER_STEP_BLOCKNUM * + NUM_OF_TWO) / NUM_OF_TWO * stepThreeOriginRankPerCore) >= (rankSize / NUM_OF_TWO)) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + } + if (stepIndex == 0) { + StepOneProcess(); + } else if (stepIndex == 1) { + StepTwoProcess(); + } else if ((stepIndex == NUM_OF_TWO || stepIndex == NUM_OF_THREE) && ((blockIdx % NUM_OF_TWO) == 0)) { + StepFourProcess(); + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } + + __aicore__ inline void StepOneProcess() + { + for (int i = 0; i < stepOneRankPerCore; i++) { + if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { + waitWriteRankArr[i] = rank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + } else { + waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) 
/ + stepThreeeOriginRankPerCore) * NUM_OF_TWO; + } + } else { + waitWriteRankArr[i] = adjPeerRank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM + ((blockIdx * stepOneOriginRankPerCore + i) / + NUM_OF_TWO) / stepTwoOriginRankPerCore; + } + InputToIpcProcess(waitWriteRankArr, waitWriteBlockArr, stepOneRankPerCore); + } + + __aicore__ inline void StepTwoProcess() + { + int waitReadRank; + int processRank; + waitReadRank = adjPeerRank; + for (int i = 0; i < stepTwoRankPerCore; i++) { + processRank = (relaBlockIdx * stepTwoOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); + waitReadRankArr[i] = waitReadRank; + waitReadBlockArr[i] = processRank / stepOneOriginRankPerCore; + if (processRank == rank) { + waitWriteRankArr[i] = rank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO; + } else { + waitWriteRankArr[i] = processRank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / + stepThreeeOriginRankPerCore) * NUM_OF_TWO; + } + } + SioAtomicToIpcProcess(waitReadRankArr, waitReadBlockArr, waitWriteRankArr, + waitWriteBlockArr, stepTwoRankPerCore); + } + + __aicore__ inline void StepThreeProcess() + { + for (int i = 0; i < stepThreeRankPerCore; i++) { + waitReadRankArr[i] = (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) * + stepThreeOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); + waitReadBlockArr[i] = PER_STEP_BLOCKNUM + (rank / NUM_OF_TWO) / stepTwoOriginRankPerCore; + waitWriteBlockArr[i] = rank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_FOUR; + } + HccsAtomicToIpcProcess(waitReadRankArr, waitReadBlockArr, waitWriteRankArr, + waitWriteBlockArr, stepThreeRankPerCore); + } + + __aicore__ inline void StepFourProcess() + { + for (int i = 0; i < stepThreeInUseBlockNum; i++) { + waitReadRankArr[i] = rank; + waitReadBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + i * NUM_OF_TWO; + } + IpcToOutputProcess(waitReadRankArr, waitReadBlockArr, stepThreeInUseBlockNum); + } + + __aicore__ inline void InputToIpcProcess(int *waitWriteRank, int *waitWriteBlock, int waitCount) + { + int processBlockNum = ipcBlockNum; + for (int count = 0; count < loopCount; count++) { + if (count == (loopCount - 1)) { + processBlockNum = totalBlockDataNum - ipcBlockNum * count; + } + for (int i = 0; i < waitCount; i++) { + writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); + dstIpcGlobal = writeIpcQue[i].EnQue(); + CpInputToIpc(count, processBlockNum, srcInputGlobal[i]); + } + sync.SetInnerFlag(magic, count); + } + } + + __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadRank,int *waitWriteBlock, + int *waitWriteBlock, int waitCount) + { + int processBlockNum = ipcBlockNum; + for (int count = 0; count < loopCount; count++) { + if (count == (loopCount - 1)) { + processBlockNum = totalBlockDataNum - ipcBlockNum * count; + } + for (int i = 0; i < waitCount; i++) { + srcIpcGlobal = readIpcQue[i].ReadFront(); + sync.WaitInnerFlag(magic, count, waitReadRank[i], waitReadBlock[i]); + sync.WaitInnerFlag(magic, count, rank, waitReadBlock[i]); + writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); + dstIpcGlobal = writeIpcQue[i].EnQue(); + SioAtomicAddToIpc(count, processBlockNum, waitReadRank[i], i); + } + sync.SetInnerFlag(magic, count); + } + } + + __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadRank,int *waitWriteBlock, + int *waitWriteBlock, int waitCount) + { + int processBlockNum = ipcBlockNum; + for (int count = 0; count < loopCount; count++) { + if (count == (loopCount - 1)) { + 
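+ /* Tail iteration, as a hedged numeric sketch (example values only): with totalBlockDataNum = 1000 and ipcBlockNum = 300, loopCount = CeilDiv(1000, 300) = 4, so the first three passes move 300 elements each and this last pass moves 1000 - 300 * 3 = 100; the same remainder rule is applied by every per-step loop in this file. */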
processBlockNum = totalBlockDataNum - ipcBlockNum * count; + } + for (int i = 0; i < waitCount; i++) { + sync.WaitInnerFlag(magic, count, waitReadRank[i], waitReadBlock[i]); + sync.WaitInnerFlag(magic, count, rank, waitReadBlock[i]); + srcIpcGlobal = readIpcQue[i].ReadFront(); + writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); + dstIpcGlobal = writeIpcQue[i].EnQue(); + HccsAtomicAddToIpc(count, processBlockNum, waitReadRank[i], i); + } + sync.SetInnerFlag(magic, count); + } + } + + __aicore__ inline void IpcToOutputProcess(int *waitReadRank, int *waitReadRank, int waitCount) + { + int processBlockNum = ipcBlockNum; + for (int count = 0; count < waitCount; count++) { + if (count == (loopCount - 1)) { + processBlockNum = totalBlockDataNum - ipcBlockNum * count; + } + for (int i = 0; i < waitCount; i++) { + sync.WaitInnerFlag(magic, count, waitReadRank[i], waitReadBlock[i]); + } + srcIpcGlobal = readIpcQue[0].ReadFront(); + CpIpcToOutput(count, processBlockNum); + sync.SetInnerFlag(magic, count); + } + } + +protected: + GlobalTensor srcInputGlobal[ARRAY_MAX_SIZE]; + GlobalTensor srcIpcGlobal; + GlobalTensor dstIPCGlobal; + GlobalTensor dstOutputGlobal; + + int totalBlockDataNum; + int atomOp; + int relaBlockIdx; + int ipcBlockNum; + int loopCount; + int ipcNumOfBlock; + int ipcSizeofBlock; + IpcQueue writeIpcQue[ARRAY_MAX_SIZE]; + IpcQueue readIpcQue[ARRAY_MAX_SIZE]; + int adjPeerRank; + int stepThreeRank; + int stepOneRankPerCore; + int stepTwoRankPerCore; + int stepThreeRankPerCore; + int stepOneOriginRankPerCore; + int stepTwoOriginRankPerCore; + int stepThreeOriginRankPerCore; + int stepOneInUseBlockNum; + int stepTwoInUseBlockNum; + int stepThreeInUseBlockNum; + int waitWriteRankArr[ARRAY_MAX_SIZE]; + int waitWriteBlockArr[ARRAY_MAX_SIZE]; + int waitReadRankArr[ARRAY_MAX_SIZE]; + int waitReadBlockArr[ARRAY_MAX_SIZE]; + +private: + __aicore__ inline void HccsAtomicAddToIpc(int num, int processBlockNum, int waitRank, int i) + { + if (waitRank != rank) { + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); + } + } + + __aicore__ inline void CpInputToIpc(int num, int processBlockNum, GLobalTensor inputTensor) + { + CpGM2GMPingPong(processBlockNum * sizeof(T), inputTensor[num * ipcBlockNum], dstIpcGlobal, -1); + } + + __aicore__ inline void SioAtomicAddToIpc(int num, int processBlockNum, int procedssRank, int i) + { + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); + } + + __aicore__ inline void CpInputToIpc(int num, int processBlockNum) + { + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstOutputGlobal[num * ipcBlockNum], -1); + } +}; +#endif // LCCL_REDUCE_SCATTER_BIG_DATA_91093_H -- Gitee From e95820a46e2d77edd13d7525713f6a4c50e52e5a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 19:52:56 +0800 Subject: [PATCH 145/414] 5 --- .../reduce_scatter_big_data_91093_4step.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index c2e645c4..8c55b2f8 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#ifndef LCCL_REDUCE_SCATTER_BIG_DATA_91093_4STEP_H -#define LCCL_REDUCE_SCATTER_BIG_DATA_91093_4STEP_H +#ifndef LCCL_REDUCE_SCATTER_BIG_DATA_91093_H +#define LCCL_REDUCE_SCATTER_BIG_DATA_91093_H #include "sync_collectives.h" #include "collectives.h" @@ -22,14 +22,14 @@ constexpr int NUM_OF_THREE = 3; constexpr int NUM_OF_FOUR = 4; template -class ReduceScatterBigData91093_4step : protected Collectives { +class ReduceScatterBigData91093 : protected Collectives { public: __aicore__ inline ReduceScatterBigData91093(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} __aicore__ inline void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); - DumpLcclInfo(LogId::INIT, static_cast(op)); + DumpLcclLogInfo(LogId::INIT, static_cast(op)); constexpr int IPC_QUEUE_DEPTH_91093 = NUM_OF_FOUR; atomOp = op; relaBlockIdx = blockIdx % PER_STEP_BLOCKNUM; @@ -124,7 +124,7 @@ public: DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); int stepIndex = blockIdx / PER_STEP_BLOCKNUM; if (stepIndex == 0 && ((relaBlockIdx * stepOneOriginRankPerCore) >= rankSize)) { - DumpLcclLogInfo(Log::PROCESS, static_cast(atomOp)); + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); return; } if (stepIndex == 1 && ((relaBlockIdx * stepTwoOriginRankPerCore) >= (rankSize / NUM_OF_TWO))) { @@ -316,23 +316,23 @@ private: __aicore__ inline void HccsAtomicAddToIpc(int num, int processBlockNum, int waitRank, int i) { if (waitRank != rank) { - CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); } } - __aicore__ inline void CpInputToIpc(int num, int processBlockNum, GLobalTensor inputTensor) + __aicore__ inline void CpInputToIpc(int num, int processBlockNum, GlobalTensor inputTensor) { CpGM2GMPingPong(processBlockNum * sizeof(T), inputTensor[num * ipcBlockNum], dstIpcGlobal, -1); } __aicore__ inline void SioAtomicAddToIpc(int num, int processBlockNum, int procedssRank, int i) { - CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); } - __aicore__ inline void CpInputToIpc(int num, int processBlockNum) + __aicore__ inline void CpIpcToOutput(int num, int processBlockNum) { - CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstOutputGlobal[num * ipcBlockNum], -1); + CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstOutputGlobal[num * ipcBlockNum], -1); } }; #endif // LCCL_REDUCE_SCATTER_BIG_DATA_91093_H -- Gitee From 415d0704a8c471da1d7c12af3c8cbf886fad0929 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 19:55:39 +0800 Subject: [PATCH 146/414] 3 --- .../91093/reduce_scatter_big_data_91093_4step.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 8c55b2f8..199b9830 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -141,6 +141,8 @@ public: } else if (stepIndex == 1) { StepTwoProcess(); } else if ((stepIndex == NUM_OF_TWO || stepIndex == NUM_OF_THREE) && ((blockIdx % NUM_OF_TWO) == 0)) { + StepThreeProcess(); + } else if (blockIdx == (NUM_OF_FOUR * PER_STEP_BLOCKNUM)) { StepFourProcess(); } DumpLcclLogInfo(LogId::PROCESS, 
static_cast(atomOp)); @@ -150,8 +152,10 @@ public: { for (int i = 0; i < stepOneRankPerCore; i++) { if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { - waitWriteRankArr[i] = rank; + if ((blockIdx * stepOneOriginRankPerCore + i) == rank) { + waitWriteRankArr[i] = rank; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + } } else { waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / -- Gitee From d342b53466b8cf0a604c5043365d3bcfcfa1ba85 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 19:59:53 +0800 Subject: [PATCH 147/414] 5 --- .../reduce_scatter_big_data_91093_4step.h | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 199b9830..33b1f2ee 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -154,21 +154,20 @@ public: if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { if ((blockIdx * stepOneOriginRankPerCore + i) == rank) { waitWriteRankArr[i] = rank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + } else { + waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / + stepThreeeOriginRankPerCore) * NUM_OF_TWO; } } else { - waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / - stepThreeeOriginRankPerCore) * NUM_OF_TWO; + waitWriteRankArr[i] = adjPeerRank; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM + ((blockIdx * stepOneOriginRankPerCore + i) / + NUM_OF_TWO) / stepTwoOriginRankPerCore; } - } else { - waitWriteRankArr[i] = adjPeerRank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM + ((blockIdx * stepOneOriginRankPerCore + i) / - NUM_OF_TWO) / stepTwoOriginRankPerCore; } InputToIpcProcess(waitWriteRankArr, waitWriteBlockArr, stepOneRankPerCore); } - __aicore__ inline void StepTwoProcess() { int waitReadRank; @@ -184,7 +183,7 @@ public: } else { waitWriteRankArr[i] = processRank; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / - stepThreeeOriginRankPerCore) * NUM_OF_TWO; + stepThreeOriginRankPerCore) * NUM_OF_TWO; } } SioAtomicToIpcProcess(waitReadRankArr, waitReadBlockArr, waitWriteRankArr, @@ -229,7 +228,7 @@ public: } } - __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadRank,int *waitWriteBlock, + __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; @@ -249,7 +248,7 @@ public: } } - __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadRank,int *waitWriteBlock, + __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; @@ -269,7 +268,7 @@ public: } } - __aicore__ inline void IpcToOutputProcess(int *waitReadRank, int *waitReadRank, int waitCount) + __aicore__ inline void IpcToOutputProcess(int *waitReadRank, int *waitReadBlock, int waitCount) { int processBlockNum = ipcBlockNum; for (int 
count = 0; count < waitCount; count++) { -- Gitee From 412a55fbb5e6d9dcdaa478cf3fcb12f6972e5e18 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 20:01:58 +0800 Subject: [PATCH 148/414] 1 --- .../91093/reduce_scatter_big_data_91093_4step.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 33b1f2ee..51ea7774 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -158,7 +158,7 @@ public: } else { waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / - stepThreeeOriginRankPerCore) * NUM_OF_TWO; + stepThreeOriginRankPerCore) * NUM_OF_TWO; } } else { waitWriteRankArr[i] = adjPeerRank; @@ -179,7 +179,7 @@ public: waitReadBlockArr[i] = processRank / stepOneOriginRankPerCore; if (processRank == rank) { waitWriteRankArr[i] = rank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_FOUR; } else { waitWriteRankArr[i] = processRank; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / @@ -196,7 +196,7 @@ public: waitReadRankArr[i] = (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) * stepThreeOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); waitReadBlockArr[i] = PER_STEP_BLOCKNUM + (rank / NUM_OF_TWO) / stepTwoOriginRankPerCore; - waitWriteBlockArr[i] = rank; + waitWriteRankArr[i] = rank; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_FOUR; } HccsAtomicToIpcProcess(waitReadRankArr, waitReadBlockArr, waitWriteRankArr, @@ -242,7 +242,7 @@ public: sync.WaitInnerFlag(magic, count, rank, waitReadBlock[i]); writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); dstIpcGlobal = writeIpcQue[i].EnQue(); - SioAtomicAddToIpc(count, processBlockNum, waitReadRank[i], i); + SioAtomicAddToIpc(count, processBlockNum, waitWriteRank[i], i); } sync.SetInnerFlag(magic, count); } @@ -271,7 +271,7 @@ public: __aicore__ inline void IpcToOutputProcess(int *waitReadRank, int *waitReadBlock, int waitCount) { int processBlockNum = ipcBlockNum; - for (int count = 0; count < waitCount; count++) { + for (int count = 0; count < loopCount; count++) { if (count == (loopCount - 1)) { processBlockNum = totalBlockDataNum - ipcBlockNum * count; } @@ -287,7 +287,7 @@ public: protected: GlobalTensor srcInputGlobal[ARRAY_MAX_SIZE]; GlobalTensor srcIpcGlobal; - GlobalTensor dstIPCGlobal; + GlobalTensor dstIpcGlobal; GlobalTensor dstOutputGlobal; int totalBlockDataNum; @@ -296,7 +296,7 @@ protected: int ipcBlockNum; int loopCount; int ipcNumOfBlock; - int ipcSizeofBlock; + int ipcSizeOfBlock; IpcQueue writeIpcQue[ARRAY_MAX_SIZE]; IpcQueue readIpcQue[ARRAY_MAX_SIZE]; int adjPeerRank; @@ -328,7 +328,7 @@ private: CpGM2GMPingPong(processBlockNum * sizeof(T), inputTensor[num * ipcBlockNum], dstIpcGlobal, -1); } - __aicore__ inline void SioAtomicAddToIpc(int num, int processBlockNum, int procedssRank, int i) + __aicore__ inline void SioAtomicAddToIpc(int num, int processBlockNum, int processRank, int i) { CpGM2GMPingPong(processBlockNum * sizeof(T), srcIpcGlobal, dstIpcGlobal, atomOp); } -- Gitee From 3db208981da86a47ed87afe852a65e1080d750a5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 20:02:26 +0800 Subject: [PATCH 149/414] 
7 --- .../ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 51ea7774..9799f3b2 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -242,7 +242,7 @@ public: sync.WaitInnerFlag(magic, count, rank, waitReadBlock[i]); writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); dstIpcGlobal = writeIpcQue[i].EnQue(); - SioAtomicAddToIpc(count, processBlockNum, waitWriteRank[i], i); + SioAtomicAddToIpc(count, processBlockNum, waitWriteArr[i], i); } sync.SetInnerFlag(magic, count); } -- Gitee From b14d42be8e9292df9641e750a39d49994fa848e5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Mon, 25 Aug 2025 20:02:47 +0800 Subject: [PATCH 150/414] 4 --- .../ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 9799f3b2..a82ed1ac 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -242,7 +242,7 @@ public: sync.WaitInnerFlag(magic, count, rank, waitReadBlock[i]); writeIpcQue[i].DeQue(waitWriteRank[i], waitWriteBlock[i]); dstIpcGlobal = writeIpcQue[i].EnQue(); - SioAtomicAddToIpc(count, processBlockNum, waitWriteArr[i], i); + SioAtomicAddToIpc(count, processBlockNum, waitWriteRankArr[i], i); } sync.SetInnerFlag(magic, count); } -- Gitee From 6a5a46f1f7a3003c9de2ce0cd5652ac9fc70d087 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 08:49:42 +0800 Subject: [PATCH 151/414] 4 --- .../lcal/src/kernels/coc_matmul_allreduce.cce | 9 +++++ .../src/kernels/coc_matmul_reduce_scatter.cce | 39 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 comm/lcal/src/kernels/coc_matmul_allreduce.cce create mode 100644 comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/coc_matmul_allreduce.cce b/comm/lcal/src/kernels/coc_matmul_allreduce.cce new file mode 100644 index 00000000..9a893c3a --- /dev/null +++ b/comm/lcal/src/kernels/coc_matmul_allreduce.cce @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce new file mode 100644 index 00000000..c74e9e0e --- /dev/null +++ b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __CCE_KT_TEST_ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + +#include "coc_ppmatmul_switch.cce" +#include "coc_reduce_scatter.cce" +#include "coc_internal.cce" + +#ifdef __DAV_C220_CUBE__ +// Matmul in LcalMatmulReduceScatter +#define COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aic(COC_ARGS_FUN(type)) { \ + CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +#elif __DAV_C220_VEC__ +// ReduceScatter in LcalMatmulReduceScatter +#define COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ + CocMatmulReduceScatterAiv(COC_ARGS_CALL()); +} +#endif + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B support bf16 +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF); +#endif -- Gitee From 270dcdae8cf6eae24f7122d487719b0c34ae5dd2 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 08:55:13 +0800 Subject: [PATCH 152/414] 3 --- .../lcal/src/kernels/coc_matmul_allreduce.cce | 32 ++++++++++++++++++- .../src/kernels/coc_matmul_reduce_scatter.cce | 4 +-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/coc_matmul_allreduce.cce b/comm/lcal/src/kernels/coc_matmul_allreduce.cce index 9a893c3a..d53c0b50 100644 --- a/comm/lcal/src/kernels/coc_matmul_allreduce.cce +++ b/comm/lcal/src/kernels/coc_matmul_allreduce.cce @@ -6,4 +6,34 @@ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
- */ \ No newline at end of file + */ +#ifdef __CCE_KT_TEST__ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + +#include "coc_ppmatmul_switch.cce" +#include "coc_allreduce.cce" +#include "coc_internal.cce" + +#ifdef __DAV_C220_CUBE__ +// Matmul in LcalMatmulAllReduce +#define COC_MATMUL_ALLREDUCE_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulAllReduce_##type##_mix_aic(COC_ARGS_FUN(type)) { \ + CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +#elif __DAV_C220_VEC__ +// ReduceScatter in LcalMatmulReduceScatter +#define COC_MATMUL_ALLREDUCE_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulAllReduce_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ + CocMatmulAllReduceAiv(COC_ARGS_CALL()); \ +} +#endif + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B support bf16 +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_MATMUL_ALLREDUCE_FUNC_AUTO_DEF); +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce index c74e9e0e..798ef54b 100644 --- a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce +++ b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce @@ -7,7 +7,7 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ -#ifdef __CCE_KT_TEST_ +#ifdef __CCE_KT_TEST__ #define __aicore__ #else #define __aicore__ [aicore] @@ -28,7 +28,7 @@ extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aic(C // ReduceScatter in LcalMatmulReduceScatter #define COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF(type) \ extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ - CocMatmulReduceScatterAiv(COC_ARGS_CALL()); + CocMatmulReduceScatterAiv(COC_ARGS_CALL()); \ } #endif -- Gitee From 84d2ea3a4cf00f50cf5bd8fbdec8d294dd9b404f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 08:55:52 +0800 Subject: [PATCH 153/414] 5 --- comm/lcal/src/kernels/coc_matmul_allreduce.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_matmul_allreduce.cce b/comm/lcal/src/kernels/coc_matmul_allreduce.cce index d53c0b50..75755361 100644 --- a/comm/lcal/src/kernels/coc_matmul_allreduce.cce +++ b/comm/lcal/src/kernels/coc_matmul_allreduce.cce @@ -25,7 +25,7 @@ extern "C" __global__ __aicore__ void LcalMatmulAllReduce_##type##_mix_aic(COC_A } #elif __DAV_C220_VEC__ -// ReduceScatter in LcalMatmulReduceScatter +// AllReduce in LcalMatmulAllReduce #define COC_MATMUL_ALLREDUCE_FUNC_AUTO_DEF(type) \ extern "C" __global__ __aicore__ void LcalMatmulAllReduce_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ CocMatmulAllReduceAiv(COC_ARGS_CALL()); \ -- Gitee From 9688d19f3c80cde20a7308e197696200bbd96e1e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 11:52:05 +0800 Subject: [PATCH 154/414] 5 --- .../reduce_scatter_hierarchy_double_ring.h | 226 +++++++++++++++++- 1 file changed, 225 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h index 9a893c3a..b39cf2e7 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h @@ -6,4 +6,228 @@ * THIS 
SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. - */ \ No newline at end of file + */ +#ifndef LCCL_REDUCE_SCATTER_HIERARCHY_DOUBLE_RING_H +#define LCCL_REDUCE_SCATTER_HIERARCHY_DOUBLE_RING_H + +#include "sync_collectives.h" +#include "collectives.h" +#include "ipc_queue.h" +using namespace AscendC; + +template +class ReduceScatterHierarchyDoubleRing : protected Collectives { + constexpr static int32_t RING_LAYER_NUM = 2; + constexpr static int32_t INPUT_CORE_NUM = 12; + constexpr static int32_t SIO_CORE_NUM = 12; + constexpr static int32_t RING_CORE_NUM = 12; + constexpr static int32_t IPC_QUE_DEPTH = 12; + constexpr static int32_t INPUT_SIO_PEER_FLAG = 0 * RING_CORE_NUM; + constexpr static int32_t SIO_REDUCE_FLAG = 1 * RING_CORE_NUM; + constexpr static int32_t RING_REDUCE_FLAG = 2 * RING_CORE_NUM; + constexpr static int32_t RING_REDUCE_PEER_FLAG = 3 * RING_CORE_NUM; + constexpr static int32_t OUTPUT_FLAG = 4 * RING_CORE_NUM; + constexpr static int32_t INPUT_FLAG = 5 * RING_CORE_NUM; + + constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; + constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; + constexpr static int32_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); + constexpr static int32_t BREAK_CYCLE = 10; + +public: + FORCE_INLINE_AICORE ReduceScatterHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) + : Collectives(rank, rankSize, extraFlag) {} + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + { + Collectives::Init(KERNELS_ARGS_CALL()); + atomOp = op; + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + blockNum = INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM; + if (blockIdx >= blockNum) { + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + return; + } + sioLayerId = rank / RING_LAYER_NUM; + ringLayerId = rank % RING_LAYER_NUM; + ringRankSize = rankSize / RING_LAYER_NUM; + ringNextRankId = (sioLayerId + 1) % ringRankSize * RING_LAYER_NUM + ringLayerId; + ringPrevRankId = (sioLayerId + (ringRankSize - 1)) % ringRankSize * RING_LAYER_NUM + ringLayerId; + sioPeerRankId = sioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + ipcBlockNum = IPC_BUFF_MAX_SIZE / IPC_QUE_DEPTH / sizeof(T); + totalBlockDataNum = len; + loopCount = CeilDiv(totalBlockDataNum, ipcBlockNum); + dmaPerLoop = ipcBlockNum; + dmaLastLoop = totalBlockDataNum - (loopCount - 1) * ipcBlockNum; + const int64_t dmaSizePerCore = ipcBlockNum / RING_CORE_NUM * sizeof(T); + if (blockIdx < INPUT_CORE_NUM) { + for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; + inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; + sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + 
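+ /* Layout sketch for these queue bases, derived from the code above: each of the RING_CORE_NUM slices owns dmaSizePerCore = ipcBlockNum / RING_CORE_NUM * sizeof(T) bytes, so slice k of a peer's shared buffer starts at shareAddrs[peer] + IPC_DATA_OFFSET + dmaSizePerCore * k and cycles through IPC_QUE_DEPTH blocks of ipcBlockNum elements; note this branch duplicates the SIO branch just above and is deleted again by PATCH 155/414. */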
localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); + sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + } else { + localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); + ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + } + inputTensor.SetGlobalBuffer((__gm__ T*) input); + outputTensor.SetGlobalBuffer((__gm__ T*) output); + DumpLcclLogInfo(LogId::INIT, static_cast(atomOp)); + } + + FORCE_INLINE_AICORE void Process() + { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + if (blockIdx >= blockNum) { + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + return; + } + + for (curLoopCnt = 0; curLoopCnt < loopCount; ++curLoopCnt) { + const int64_t damCurLoop = (curLoopCnt == loopCount - 1) ? dmaLastLoop : dmaPerLoop; + coreDataNum = damCurLoop / RING_CORE_NUM; + lastCoreDataNum = damCurLoop - (RING_CORE_NUM - 1) * coreDataNum; + for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { + if (blockIdx < INPUT_CORE_NUM) { + Input2Ipc(); + } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { + SioReduce(); + } else { + RingReduce(); + } + ++ipcQueIdx; + } + } + DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); + } + +private: + IpcQueue inputQueList[INPUT_CORE_SCALE]; + IpcQueue sioQueList[SIO_CORE_SCALE]; + IpcQueue ringSrcQue; + IpcQueue ringDstQue; + IpcQueue *inputQue = nullptr; + IpcQueue *sioQue = nullptr; + GlobalTensor inputTensor; + GlobalTensor OutputTensor; + GlobalTensor SrcTensor; + GlobalTensor DstTensor; + int atomOp = COPYONLY; + int32_t sioLayerId = 0; + int32_t ringLayerId = 0; + int32_t ringRankSize = 0; + int32_t ringNextRankId = 0; + int32_t ringPrevRankId = 0; + int32_t sioPeerRankId = 0; + int32_t localBlockIdx = 0; + int64_t ipcBlockNum = 0; + int64_t totalBlockDataNum = 0; + int64_t dmaPerLoop = 0; + int64_t dmaLastLoop = 0; + int32_t ipcQueIdx = 0; + int32_t loopCount = 0; + int32_t curLoopCnt = 0; + int32_t sioLayerLoop = 0; + int64_t coreDataNum = 0; + int64_t lastCoreDataNum = 0; + int64_t curcoreDataNum = 0; + + FORCE_INLINE_AICORE void Input2Ipc() + { + for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { + localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; + inputQue = &(inputQueList[blockLoop]); + Input2IpcByCore(); + } + } + + FORCE_INLINE_AICORE void Input2IpcByCore() + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - sioLayerLoop)) % ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; + curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? 
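+ /* the last slice takes the remainder: e.g. (illustrative numbers) if damCurLoop = 1000 and RING_CORE_NUM = 12, then coreDataNum = 1000 / 12 = 83 and the final slice copies lastCoreDataNum = 1000 - 11 * 83 = 87 elements */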
lastCoreDataNum : coreDataNum; + srcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * ipcBlockNum + + localBlockIdx * coreDataNum]; + dstTensor = (*inputQue).EnQue(); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, COPYONLY); + sync.SetSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId); + } + + FORCE_INLINE_AICORE void SioReduce() + { + for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { + localBlockIdx = blockIdx * SIO_CORE_SCALE + blockLoop; + sioQue = &(sioQueList[blockLoop]); + SioReduceByCore(); + } + } + + FORCE_INLINE_AICORE void SioReduceByCore() + { + const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - sioLayerLoop)) % ringRankSize; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + + curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? lastCoreDataNum : coreDataNum; + srcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * ipcBlockNum + + localBlockIdx * coreDataNum]; + dstTensor = (*inputQue).EnQue(); + if (ipcQueIdx == 0) { + sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId, BREAK_CYCLE); + } else { + sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId); + } + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, COPYONLY); + sync.SetSyncFlag(magic, ipcQueIdx, SIO_REDUCE_FLAG + localBlockIdx, sioPeerRankId); + } + + FORCE_INLINE_AICORE void RingReduceOutput() + { + if (sioLayerLoop == 0) { + ringDstQue.ReadFront(); + return; + } + curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? lastCoreDataNum : coreDataNum; + srcTensor = ringSrcQue.ReadFront(); + dstTensor = ringDstQue.ReadFront(); + GlobalTensor srcOutTenser; + GlobalTensor dstOutTenser; + if (sioLayerLoop == ringRankSize - 1) [ + ringSrcQue.ReadFront(); + srcOutTensor = dstTensor; + dstOutTensor = outputTensor[curLoopCnt * ipcBlockNum + localBlockIdx * coreDataNum]; + ] + const int32_t consumedQueIdx = ipcQueIdx - 1; + if (consumedQueIdx == 0) { + sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId, BREAK_CYCLE); + } else { + sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); + } + if (sioLayerLoop > 1) { + sync.WaitSyncFlag(magic, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, ringPrevRankId); + } + sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, rank); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, atomOp); + if (sioLayerLoop != ringRankSize - 1) { + sync.SetSyncFlag(magic, consumeQueIdx, RING_REDUCE_FLAG + localBlockIdx, rank); + } else { + sync.WaitSyncFlag(magic, ipcQueIdx, SIO_REDUCE_FLAG + localBlockIdx, rank); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcOutTensor, dstOutTensor, COPYONLY); + } + } +}; + +#endif // LCCL_REDUCE_SCATTER_HIERARCHY_DOUBLE_RING_H \ No newline at end of file -- Gitee From a75b77d6f75432ee1a01f607f89124eddcc987ef Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 11:54:34 +0800 Subject: [PATCH 155/414] 3 --- .../reduce_scatter_hierarchy_double_ring.h | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h index b39cf2e7..2c88b1ea 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h +++ 
b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h @@ -21,7 +21,7 @@ class ReduceScatterHierarchyDoubleRing : protected Collectives { constexpr static int32_t INPUT_CORE_NUM = 12; constexpr static int32_t SIO_CORE_NUM = 12; constexpr static int32_t RING_CORE_NUM = 12; - constexpr static int32_t IPC_QUE_DEPTH = 12; + constexpr static int32_t IPC_QUE_DEPTH = 32; constexpr static int32_t INPUT_SIO_PEER_FLAG = 0 * RING_CORE_NUM; constexpr static int32_t SIO_REDUCE_FLAG = 1 * RING_CORE_NUM; constexpr static int32_t RING_REDUCE_FLAG = 2 * RING_CORE_NUM; @@ -30,8 +30,8 @@ class ReduceScatterHierarchyDoubleRing : protected Collectives { constexpr static int32_t INPUT_FLAG = 5 * RING_CORE_NUM; constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; - constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; - constexpr static int32_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); + constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / SIO_CORE_NUM; + constexpr static int64_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); constexpr static int32_t BREAK_CYCLE = 10; public: @@ -68,12 +68,6 @@ public: } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; - sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + - dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); - } - } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { - for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { - localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } @@ -107,7 +101,7 @@ public: } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { SioReduce(); } else { - RingReduce(); + RingReduceOutput(); } ++ipcQueIdx; } @@ -123,9 +117,9 @@ private: IpcQueue *inputQue = nullptr; IpcQueue *sioQue = nullptr; GlobalTensor inputTensor; - GlobalTensor OutputTensor; - GlobalTensor SrcTensor; - GlobalTensor DstTensor; + GlobalTensor outputTensor; + GlobalTensor srcTensor; + GlobalTensor dstTensor; int atomOp = COPYONLY; int32_t sioLayerId = 0; int32_t ringLayerId = 0; @@ -144,7 +138,7 @@ private: int32_t sioLayerLoop = 0; int64_t coreDataNum = 0; int64_t lastCoreDataNum = 0; - int64_t curcoreDataNum = 0; + int64_t curCoreDataNum = 0; FORCE_INLINE_AICORE void Input2Ipc() { @@ -203,8 +197,8 @@ private: curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? 
lastCoreDataNum : coreDataNum; srcTensor = ringSrcQue.ReadFront(); dstTensor = ringDstQue.ReadFront(); - GlobalTensor srcOutTenser; - GlobalTensor dstOutTenser; + GlobalTensor srcOutTensor; + GlobalTensor dstOutTensor; if (sioLayerLoop == ringRankSize - 1) [ ringSrcQue.ReadFront(); srcOutTensor = dstTensor; -- Gitee From 6cf1e5bc690e5eab41b61a3e25f8ef5a641ac2cd Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 11:56:43 +0800 Subject: [PATCH 156/414] 4 --- .../91093/reduce_scatter_hierarchy_double_ring.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h index 2c88b1ea..f5b19285 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h @@ -158,13 +158,13 @@ private: localBlockIdx * coreDataNum]; dstTensor = (*inputQue).EnQue(); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, COPYONLY); - sync.SetSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId); + sync.SetSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, rank); } FORCE_INLINE_AICORE void SioReduce() { for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { - localBlockIdx = blockIdx * SIO_CORE_SCALE + blockLoop; + localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; sioQue = &(sioQueList[blockLoop]); SioReduceByCore(); } @@ -178,13 +178,13 @@ private: curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? lastCoreDataNum : coreDataNum; srcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * ipcBlockNum + localBlockIdx * coreDataNum]; - dstTensor = (*inputQue).EnQue(); + dstTensor = (*sioQue).EnQue(); if (ipcQueIdx == 0) { sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId, BREAK_CYCLE); } else { sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, sioPeerRankId); } - CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, COPYONLY); + CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, atomOp); sync.SetSyncFlag(magic, ipcQueIdx, SIO_REDUCE_FLAG + localBlockIdx, sioPeerRankId); } @@ -199,11 +199,11 @@ private: dstTensor = ringDstQue.ReadFront(); GlobalTensor srcOutTensor; GlobalTensor dstOutTensor; - if (sioLayerLoop == ringRankSize - 1) [ + if (sioLayerLoop == ringRankSize - 1) { ringSrcQue.ReadFront(); srcOutTensor = dstTensor; dstOutTensor = outputTensor[curLoopCnt * ipcBlockNum + localBlockIdx * coreDataNum]; - ] + } const int32_t consumedQueIdx = ipcQueIdx - 1; if (consumedQueIdx == 0) { sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId, BREAK_CYCLE); @@ -216,7 +216,7 @@ private: sync.WaitSyncFlag(magic, ipcQueIdx, INPUT_FLAG + localBlockIdx, rank); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcTensor, dstTensor, atomOp); if (sioLayerLoop != ringRankSize - 1) { - sync.SetSyncFlag(magic, consumeQueIdx, RING_REDUCE_FLAG + localBlockIdx, rank); + sync.SetSyncFlag(magic, consumedQueIdx, RING_REDUCE_FLAG + localBlockIdx, rank); } else { sync.WaitSyncFlag(magic, ipcQueIdx, SIO_REDUCE_FLAG + localBlockIdx, rank); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcOutTensor, dstOutTensor, COPYONLY); -- Gitee From f7cf027d936709fb9fb9da6fc1111699af3d2928 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 11:52:15 +0800 Subject: [PATCH 
157/414] draft --- comm/lcal/src/lcal_comm.cpp | 214 ++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 comm/lcal/src/lcal_comm.cpp diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp new file mode 100644 index 00000000..936c1c2d --- /dev/null +++ b/comm/lcal/src/lcal_comm.cpp @@ -0,0 +1,214 @@ +#include +#include +#include "lcal_internal.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "mki/utils/log/log.h" +#include "mki/utils/env/env.h" +#include "tools/socket/lcal_sock_excahnge.h" + +#include "runtime/kernel.h" +#include "runtime/mem.h" +#include "runtime/dev.h" +#include "runtime/rt_ffts.h" +#include "profiling/report_timing.h" + +constexpr int AI_CORE_NUM_24 = 24; +constexpr int AI_CORE_NUM_20 = 20; +constexpr int AI_CORE_NUM_2 = 2; + +enum TopologyType : int { + TOPOLOGY_HCCS = 0, + TOPOLOGY_PIX, + TOPOLOGY_PIB, + TOPOLOGY_PHB, + TOPOLOGY_SYS, + TOPOLOGY_SIO, + TOPOLOGY_HCCS_SW +}; + +using namespace std; +using namespace chrono; +using namespace Mki; + +namespace Lcal { +constexpr int HCCL_IPC_PID_ARRAY_SIZE = 1; +constexpr int LCAL_INIT_TIMEOUT = 600; + +static map g_localPeerMemMap; +static map g_devList; +static std::mutex g_mtx; + +static const std::unordered_map CHIP_MAP = { + {"Ascend310P", ChipName::CHIP_3010P3}, + {"Ascend910B1", ChipName::CHIP_910B1}, + {"Ascend910B2", ChipName::CHIP_910B2}, + {"Ascend910B2C", ChipName::CHIP_910B2C}, + {"Ascend910B3", ChipName::CHIP_910B3}, + {"Ascend910B4", ChipName::CHIP_910B4}, + {"Ascend910B4-1", ChipName::CHIP_910B41}, + {"Ascend910_9391", ChipName::CHIP_910_9391}, + {"Ascend910_9381", ChipName::CHIP_910_9381}, + {"Ascend910_9392", ChipName::CHIP_910_9392}, + {"Ascend910_9382", ChipName::CHIP_910_9382}, + {"Ascend910_9372", ChipName::CHIP_910_9372}, + {"Ascend910_9361", ChipName::CHIP_910_9361}, + {"Ascend910_9362", ChipName::CHIP_910_9362} +}; + +ChipName GetChipName() +{ + static ChipName curChipName = ChipName::RESERVED; + if (curChipName != ChipName::RESERVED) { + return curChipName; + } + constexpr int socVerLength = 100; + char ver[socVerLength]; + auto ret = rtGetSocVersion(ver, socVerLength); + if (ret != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "rtGetSocVersion failed, not sure whether the function is normal, please use it with caution"; + return ChipName::RESERVED; + } + string chipName(ver); + MKI_LOG(DEBUG) << "rtGetSocVersion: -- The result after converting ver to string is :" << chipName; + + auto it = CHIP_MAP.find(chipName); + if (it != CHIP_MAP.end()) { + curChipName = it->second; + } else { + MKI_LOG(WARN) << "There is no commitment to the supported chip types yet," << + " and it is not certain whether the functions will work properly.";
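+ /* Usage sketch under the mapping above: rtGetSocVersion() fills ver with a name such as "Ascend910B2", CHIP_MAP.find() resolves it to ChipName::CHIP_910B2 and the result is cached in the static curChipName; an unlisted string lands in this branch, curChipName stays ChipName::RESERVED, and the lookup is retried on the next call. */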
+ } + return curChipName; +} + +uint32_t GetCoreNum(ChipName chipName) +{ + switch (chipName) { + case ChipName::CHIP_910B1: + case ChipName::CHIP_910B2: + case ChipName::CHIP_910_9391: + case ChipName::CHIP_910_9381: + case ChipName::CHIP_910_9392: + case ChipName::CHIP_910_9382: + case ChipName::CHIP_910B2C: + return AI_CORE_NUM_24; + case ChipName::CHIP_910B3: + case ChipName::CHIP_910B4: + case ChipName::CHIP_910B41: + case ChipName::CHIP_910_9372: + case ChipName::CHIP_910_9361: + case ChipName::CHIP_910_9362: + case ChipName::CHIP_910A5: + return AI_CORE_NUM_20; + case ChipName::CHIP_3010P3: + return AI_CORE_NUM_2; + default: + MKI_LOG(ERROR) << "Unknown chip name"; + return 0; + } +} + +bool SkipUnusedChannel910B2C(int curRank, int peerRank, ChipName chipName) +{ + if (chipName == ChipName::CHIP_910B2C) { + constexpr int rankSizePerNode = 8; + if (((curRank / rankSizePerNode) != (peerRank / rankSizePerNode)) + && (std::abs(curRank - peerRank) != rankSizePerNode)) { + return true; + } + } + return false; +} + +int LcalComm::InitDumpAddr() +{ + constexpr uint32_t dumpCoreCnt = 75; + constexpr uint32_t dumpSizePerCore = 1 * 1024 * 1024; + constexpr uint32_t dumpWorkspaceSize = dumpCoreCnt * dumpSizePerCore; + GM_ADDR dumpAddr = nullptr; + int ret = 0; + ret = aclrtMalloc(reinterpret_cast(&dumpAddr), dumpWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtMalloc err " << __LINE__; + return LCAL_ERROR_INTERNAL; + } + aclrtMemset(dumpAddr, dumpWorkspaceSize, 0, dumpWorkspaceSize); + + GM_ADDR memory = static_cast(std::malloc(dumpWorkspaceSize)); + if (!memory) { + MKI_LOG(ERROR) << "std::malloc err " << __LINE__; + return LCAL_ERROR_INTERNAL; + } + errno_t result = memset_s(memory, dumpWorkspaceSize, 0, dumpWorkspaceSize); + if (result != 0) { + MKI_LOG(ERROR) << "memset_s err " << __LINE__; + } + for (uint32_t i = 0; i < dumpCoreCnt; i++) { + GM_ADDR block_start = memory + i * dumpSizePerCore; + GM_ADDR deviceBlockStart = dumpAddr + i * dumpSizePerCore; + + LcclDumpBlockInfo* block_info = reinterpret_cast(block_start); + block_info->len = dumpSizePerCore; + block_info->core = i; + block_info->blockNum = 0; + block_info->dumpOffset = dumpSizePerCore - sizeof(LcclDumpBlockInfo); + block_info->magic = 0; + block_info->dumpAddr = reinterpret_cast(deviceBlockStart + sizeof(LcclDumpBlockInfo)); + } + + ret = aclrtMemcpy(dumpAddr, dumpWorkspaceSize, memory, dumpWorkspaceSize, ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret; + return LCAL_ERROR_INTERNAL; + } + std::free(memory); + + commArgs_.dumpAddr = dumpAddr; + return LCAL_SUCCESS; +} + +int LcalComm::SyncCommArgs() +{ + commArgs_.rank = rank_; + commArgs_.localRank = localRank_; + commArgs_.rankSize = rankSize_; + commArgs_.localRankSize = localRankSize_; + for (int i = 0; i < rankSize_; i++) { + commArgs_.peerMems[i] = peerMems_[i]; + } + + if (isEnableMsprofOp_ && InitDumpAddr() != LCAL_SUCCESS) { + return LCAL_ERROR_INTERNAL; + } + + if (isEnableMix_) { + uint64_t fftsVal = 0; + uint32_t fftsLen = 0; + int error = rtGetC2cCtrlAddr(&fftsVal, &fftsLen); + if (error != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "rtGetC2cCtrlAddr err:" << error; + return LCAL_ERROR_MKIRT; + } + commArgs_.fftsVal = fftsVal; + } + + int ret = 0; + ret = aclrtMalloc(reinterpret_cast(&commArgsPtr_), sizeof(commArgs_), ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtMalloc err " << __LINE__ << " " << ret; + return 
LCAL_ERROR_INTERNAL; + } + +} +} + -- Gitee From 823f6631d83686c70342b1022b6366be214897cd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 16:28:51 +0800 Subject: [PATCH 158/414] draft 2 --- comm/lcal/src/lcal_comm.cpp | 429 +++++++++++++++++++++++++++++++++++- 1 file changed, 427 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 936c1c2d..0252e7ee 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -49,7 +49,7 @@ static map g_devList; static std::mutex g_mtx; static const std::unordered_map CHIP_MAP = { - {"Ascend310P", ChipName::CHIP_3010P3}, + {"Ascend310P", ChipName::CHIP_310P3}, {"Ascend910B1", ChipName::CHIP_910B1}, {"Ascend910B2", ChipName::CHIP_910B2}, {"Ascend910B2C", ChipName::CHIP_910B2C}, @@ -110,7 +110,7 @@ uint32_t GetCoreNum(ChipName chipName) case ChipName::CHIP_910_9362: case ChipName::CHIP_910A5: return AI_CORE_NUM_20; - case ChipName::CHIP_3010P3: + case ChipName::CHIP_310P3: return AI_CORE_NUM_2; default: MKI_LOG(ERROR) << "Unknown chip name"; @@ -208,7 +208,432 @@ int LcalComm::SyncCommArgs() MKI_LOG(ERROR) << "aclrtMalloc err " << __LINE__ << " " << ret; return LCAL_ERROR_INTERNAL; } + ret = aclrtMemcpy(commArgsPtr_, sizeof(commArgs_), &commArgs_, sizeof(commArgs_), ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret; + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalComm::InitCommon() +{ + if (EnablePeerAccess() != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "EnablePeerAccess failed!"; + return LCAL_ERROR_INTERNAL; + } + const char *lcclDeterministic = Mki::GetEnv("LCCL_DETERMINISTIC"); + if (lcclDeterministic && (string(lcclDeterministic) == "1" || string(lcclDeterministic) == "true")) { + deterministic_ = true; + commArgs.extraFlag |= ExtraFlag::DETERMINiSTIC; + } + if (GetChipName() == ChipName::CHIP_910B2C) { + commArgs.extraFlag |= ExtraFlag::TOPO_910B2C; + } + if (GetChipName() >= ChipName::CHIP_910_9391) { + commArgs.extraFlag |= ExtraFlag::TOPO_910_93; + } + if (GetChipName() >= ChipName::CHIP_910_9362) { + commArgs.extraFlag |= ExtraFlag::TOPO_910A5; + } + if (GetCoreNum(GetChipName()) > AI_CORE_NUM_20) { + commArgs.extraFlag |= ExtraFlag::IS_GREATER_THAN_40_AIV; + } + + ReportTiming report("LcclReporting", rank_, false, nullptr, nullptr); + MKI_LOG(INFO) << "LcalComm::InitCommon ReportTiming " << std::hex << ReportTiming::ProfilingStatus() << std::dec; + if (ReportTiming::ProfilingStatus() == ReportTiming::PROF_TASK_TIME_DUMP) { + isEnableMsprofOp_ = true; + isEnableMix_ = true; + } + + int32_t opGroup = 0; + if (isEnableMsprofOp_) { + opGroup = 0; + } else if (isEnableMix_) { + opGroup = 1; + } else { + constexpr int32_t normalOpGroup = 2; + opGroup = normalOpGroup; + } + MKI_LOG(INFO) << "LcalComm::InitCommon RegistKernel opGroup " << opGroup; + RegistKernel(opGroup); + + lcalRank_ = rank % localRankSize_; + return LCAL_SUCCESS; +} + +void LcalComm::CloseIpcMem() +{ + for (int i = 0; i < rankSize_; i++) { + if (i == rank_ || peerMem_[i] == nullptr) { + continue; + } + + int ret = rtIpcCloseMemory(static_cast(peerMem_[i])); + if (ret != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "Close ipc[" << i << "] memory failed! ret = " << ret; + } + peerMem_[i] = nullptr; + + } +} + +void LcalComm::FreePeerMem(GM_ADDR &mem) const +{ + if (mem != nullptr) { + aclError aclRet = aclrtFree(mem); + if (aclRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "Free share memory failed! 
ret " << aclRet; + } + } + mem = nullptr; +} + +int LcalComm::Init() +{ + if (inited_) { + return LCAL_SUCCESS; + } + if (rank_ < 0 || rank >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + MKI_LOG(ERROR) << "The rank is invalid! rank: " << rank_ << ", rankSize: " << rankSize_; + return LCAL_ERROR_PARA_CHECK_FAIL; + } + if (LcalSockExchange::CheckValid(commId_)) { + socketExchange_ = new (nothrow) LcalSockExchange(rank_, rankSize_, commId_); + } else { + socketExchange_ = new (nothrow) LcalSockExchange(rank_, rankSize_, rankList_, commDomain_); + } + if (socketExchange_ == nullptr) { + MKI_LOG(ERROR) << "LcalSockExchange create failed. rank : " << rank_ << " rankSize:" << rankSize_; + return LCAL_ERROR_INTERNAL; + } + int ret = GetDev(); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "init context failed! ret: " << ret; + return ret; + } + + MKI_LOG(INFO) << "rank " << rank_ << "/" << rankSize_ << " running devId:" << devId_; + + if (InitCommon() != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "Init common failed!"; + return LCAL_ERROR_INTERNAL; + } + + MKI_LOG(DEBUG) << "Prepare to InitCommMem localRankSize_ -> " << localRankSize_ << ", localRank_ -> " << localRank_; + if (InitCommMem() != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "InitCommMem failed!"; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(DEBUG) << "InitCommMem " << rank_ << "/" << rankSize_ << ", localRank_ : " << localRank_ << + ", localRankSize_ : " << localRankSize_ << " success"; + SyncCommArgs(); + MKI_LOG(INFO) << "LcalCommInit " << rank_ << "/" << rankSize_ << " success and extraFlag: " << commArgs.extraFlag << + " commArgs_.localRank : " << commArgs_.localRank << " commArgs_.localRankSize : " << commArgs_.localRankSize; + inited_ = true; + delete socketExchange_; + socketExchange_ = nullptr; + return LCAL_SUCCESS; +} + +int LcalComm::InitThread(const std::string &uid) +{ + if (inited_) { + return LCAL_SUCCESS; + } + if (rank_ < 0 || rank >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + MKI_LOG(ERROR) << "The rank is invalid! 
rank: " << rank_ << ", rankSize: " << rankSize_; + return LCAL_ERROR_PARA_CHECK_FAIL; + } + if (GetDevThread(uid) != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "get devs failed."; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(INFO) << "rank " << rank_ << "/" << rankSize_ << " running devId:" << devId_ << "uid: " << uid; + + if (InitCommon() != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "Init common failed!"; + return LCAL_ERROR_INTERNAL; + } + { + lock_guard lock(g_mtx); + if (g_localPeerMemMap.find(uid) == g_localPeerMemMap.end()) { + for (int i = 0; i < rankSize_; i++) { + g_localPeerMemMap[uid][i] = nullptr; + } + } + uid_ = uid; + } + InitMem(); + g_localPeerMemMap[uid][rank_] = peerMem_[rank_]; + + auto start = high_resolution_clock::now(); + for (int i = 0; i < rankSize_; ++i) { + while (g_localPeerMemMap[uid][i] == nullptr) { + this_thread::sleep_for(1ms); + auto elapsed = duration_cast(high_resolution_clock::now() - start); + if (elapsed.count() > LCAL_INIT_TIMEOUT) { + MKI_LOG(ERROR) << "Lccl Init timeout!"; + FreePeerMem(g_localPeerMemMap[uid][rank_]); + return LCAL_ERROR_INTERNAL; + } + } + peerMem_[i] = g_localPeerMemMap[uid][i]; + } + localRank_ = rank_; + localRankSize_ = rankSize_; + SyncCommArgs(); + MKI_LOG(INFO) << "Lccl init multi thread " << rank_ << "/" << rankSize_ << " success, uid:" << uid; + inited_ = true; + return LCAL_SUCCESS; +} + +int LcalComm::EnablePeerAccess() +{ + physicalInfo_.chipName = GetChipName(); + for (auto &dev : devList_) { + if (devId_ == dev) { + continue; + } + if (SkipUnusedChannel910B2C(dev, devId_, GetChipName())) { + continue; + } + + int64_t value = 0; + if (rtGetPairDevicesInfo(devId_, dev, 0, &value) != RT_ERROR_NONE) { + MKI_LOG(WARN) << devId_ << " & " << dev << " pair devices info failed to get"; + } else { + MKI_LOG(DEBUG) << devId_ << " <-----> " << dev << ", halGetPairDevicesInfo: *value = " << value; + } + + if (value == TOPOLOGY_HCCS || value == TOPOLOGY_SIO || value == TOPOLOGY_HCCS_SW || + GetChipName() == ChipName::CHIP_910B2C) { + physicalInfo_.physicalLink = PhysicalLink::HCCS; + commArgs.extraFlag &= ~(ExtraFlag::TOPO_PCIE); + } else if (physicalInfo_.physicalLink = PhysicalLink::RESERVED) { + physicalInfo_.physicalLink = PhysicalLink::PCIE; + commArgs.extraFlag |= ExtraFlag::TOPO_PCIE; + if (rankSize_ > PING_PONG_SIZE) { + MKI_LOG(ERROR) << "do not support pcie > 2 rank! rankSize_ = " << rankSize_; + return LCAL_ERROR_INTERNAL; + } + } + + physicalInfo_.coreNum = GetCoreNum(physicalInfo_.chipName); + + if (physicalInfo_.chipName == ChipName::CHIP_310P3 && value == 0) { + MKI_LOG(WARN) << "warn aclrtDeviceEnablePeerAccess is skipped! peerDeviceId = " << dev; + continue; + } + + aclError ret = aclrtDeviceEnablePeerAccess(dev, 0); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "err aclrtDeviceEnablePeerAccess failed peerDeviceId = " << dev << ",rank = " << rank_ + << ", value = " << value << ", flags = " << 0 << "," << __LINE__ << ": " << ret; + return LCAL_ERROR_INTERNAL; + } + } + MKI_LOG(DEBUG) << "EnablePeerAccess succeed" << rank_; + return LCAL_SUCCESS; +} + +int LcalComm::GetDevThread(const std::string &uid) +{ + devList_.resize(rankSize_); + aclError aclRet = aclrtGetDevice(&devId_); + if (aclRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtGetDevice error! 
ret: " << aclRet; + return LCAL_ERROR_INTERNAL; + } + { + std::lock_guard lock(g_mtx); + if (g_devList.find(uid) == g_devList.end()) { + for (int i = 0; i < rankSize_; i++) { + g_devList[uid][i] = 0; + } + } + } + g_devList[uid][rank_] = devid_ + 1; + auto start = high_resolution_clock::now(); + for (int i = 0; i < rankSize_; ++i) { + while (g_devList[uid][i] == 0) { + this_thread::sleep_for(1ms); + auto elapsed = duration_cast(high_resolution_clock::now() - start); + if (elapsed.count() > LCAL_INIT_TIMEOUT) { + MKI_LOG(ERROR) << "Lccl Init timeout!"; + return LCAL_ERROR_TIMEOUT; + } + } + devList_.at(i) = g_devList[uid][i] - 1; + } + return LCAL_SUCCESS; +} + +int LcalComm::InitMem() +{ + constexpr int32_t bufferSizeUnit = 1024 * 1024; + int lcalBuffSize = bufferSize_ * bufferSizeUnit + LCAL_FLAG_BUFF_BYTES; + + MKI_LOG(DEBUG) << "lcal buffer size " << lcalBuffSize; + aclError ret = aclrtMalloc( + reinterpret_cast(&peerMem_[rank_]), lcalBuffSize, + (GetChipName() == ChipName::CHIP_310P3) ? ACL_MEM_MALLOC_HUGE_FIRST_P2P : ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "allocate device mem error " << __FILE__ << ":" << __LINE__ << " " << ret; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(DEBUG) << "peerMem[rank" << rank_ << "], allocate finished."; + aclrtMemset(peerMem_[rank_], lcalBuffSize, 0, lcalBuffSize); + return LCAL_SUCCESS; +} + +int LcalComm::GetPid(uint32_t *pids) +{ + if (rtDeviceGetBaraTgid(&pids[rank_]) != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "DeviceGetBaraTgid err " << __LINE__; + return LCAL_ERROR_INTERNAL; + } + int ret = socketExchange_->AllGather(&pids[rank_], 1, pids); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "LcalSockExchange AllGather error! ret: " << ret; + return ret; + } + for (int i = 0; i < rankSize_; ++i) { + MKI_LOG(DEBUG) << "rank: " << rank_ << ", otherRank : " << i << " pid[" << i << "]: " << pids[i]; + } + MKI_LOG(DEBUG) << "AllGather: Get other rank pid"; + return LCAL_SUCCESS; +} + +int LcalComm::GetSidId(int64_t sdids[LCAL_MAX_RANK_SIZE], int rankSize) +{ + if (rank_ > rankSize) { + MKI_LOG(ERROR) << "LcalComm::GetSidId err rank_ >= rankSize " << rank_ << ">=" << rankSize; + return LCAL_ERROR_INTERNAL; + } + if ((physicalInfo_.chipName >= ChipName::CHIP_910_9391) && (physicalInfo_.chipName < ChipName::RESERVED)) { + const int rtModuleTypeSystem = 0; + const int infoTypeSdid = 26; + if (rtGetDeviceInfo(devList_[rank_], rtModuleTypeSystem, infoTypeSdid, &sdids[rank_]) != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "DeviceGetDeviceInfo err " << __LINE__; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(DEBUG) << "rank" << rank_ << " dev id: " << devList_[rank_] + << " rtGetDeviceInfo sdid: " << sdids[rank_]; + + int ret = socketExchange_->AllGather(&sdids[rank_], 1, sdids); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "LcalSockExchange AllGather error! ret: " << ret; + return ret; + } + for (int i = 0; i < rankSize_; ++i) { + MKI_LOG(DEBUG) << "rank: " << i << " sdid: " << sdids[i]; + } + MKI_LOG(DEBUG) << "AllGather: Get other rank sdid"; + } + return LCAL_SUCCESS; } + +int LcalComm::GetName(string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const +{ + int ret = socketExchange_->AllGather(name.c_str(), IPC_NAME_SIZE, names[0]); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "LcalSockExchange AllGather error! 
ret: " << ret; + return ret; + } + for (int i = 0; i < rankSize_; ++i) { + names[i][IPC_NAME_SIZE - 1] = '\0'; + MKI_LOG(DEBUG) << "rank: " << i << " mem name: " << names[i]; + } + MKI_LOG(DEBUG) << "AllGather: Get other rank mem name"; + return LCAL_SUCCESS; +} + +int LcalComm::InitCommMem() +{ + int ret = InitMem(); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "InitMem error! ret: " << ret; + return ret; + } + + uint32_t pids[LCAL_MAX_RANK_SIZE] = {0}; + ret = GetPid(pids); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "GetPid error! ret: " << ret; + return ret; + } + + int64_t sdids[LCAL_MAX_RANK_SIZE] = {0}; + ret = GetSidId(sdids, rankSize_); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "GetSidId error! ret: " << ret; + return ret; + } + + string name; + if (SetMemoryName(name) != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "SetMemoryName error!"; + return LCAL_ERROR_INTERNAL; + } + + if (SetIpcPidSdid(name, pids, sdids) != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "SetIpcPidSdid error!"; + return LCAL_ERROR_INTERNAL; + } + + MKI_LOG(DEBUG) << "rank " << rank_ << " mem name: " << name; + char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]; + ret = GetName(name, names); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "GetName error! ret: " << ret; + return ret; + } + + if (OpenIpcMem(names) != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "rank: " << rank_ << " OpenIpcMem failed!"; + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalComm::OpenIpcMem(const char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) +{ + static mutex mut; + lock_guard lock(mut); + for (int i = 0; i < rankSize_; ++i) { + if (i == rank_) { + continue; + } + if (SkipUnusedChannel910B2C(rank_, i, GetChipName())) { + continue; + } + int ret = rtIpcOpenMemory(reinterpret_cast(&peerMem_[i]), names[i]); + if (ret != RT_ERROR_NONE) { + CloseIpcMem(); + MKI_LOG(ERROR) << "rank : " << rank_ << " localRank : " << localRank_ << " peerMem: " << i << + " IpcOpenMemory err " << ret; + return LCAL_ERROR_INTERNAL; + } + } + ipcMemInited_ = true; + return LCAL_SUCCESS; +} + +int LcalComm::SetMemoryName(string &name) +{ + char nameModified[IPC_NAME_SIZE] = {}; + int memRank = rank_; + constexpr int32_t bufferSizeUnit = 1024 * 1024; + int lcalBuffSize = bufferSize_ * bufferSizeUnit + LCAL_FLAG_BUFF_BYTES; + if (rtIpcSetMemoryName(peerMem_[memRank, lcalBuffSize, nameModified, IPC_NAME_SIZE]) != RT_ERROR_NONE) { + return LCAL_ERROR_INTERNAL; + } + name = nameModified; + return LCAL_SUCCESS; +} + } -- Gitee From aeabfb892ccd913f3f74b0c2b556013c0ac32285 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 17:48:35 +0800 Subject: [PATCH 159/414] dfaft --- comm/lcal/src/lcal_comm.cpp | 154 ++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 0252e7ee..f6bedaa4 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -635,5 +635,159 @@ int LcalComm::SetMemoryName(string &name) return LCAL_SUCCESS; } +int LcalComm::SetIpcPidSdid(string &name, const uint32_t *pids, const int64_t *sdids) const +{ + for (int i =0; i < rankSize_; ++i) { + if (i == rank_) { + continue; + } + + if (physicalInfo_.chipName < ChipName::RESERVED) { + int32_t pidInt32 = pids[i]; + int rtRet = rtSetIpcMemPid(name.c_str(), &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); + if (rtRet != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "err " << rtRet; + return LCAL_ERROR_INTERNAL; + } + } else { + int32_t pidInt32 = pids[i]; + int rtRet = 
rtSetIpcMemorySuperPodPid(name.c_str(), sdids[i], &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); + if (rtRet != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "err " << rtRet; + return LCAL_ERROR_INTERNAL; + } + } + } + return LCAL_SUCCESS; +} + +LcalComm::~LcalComm() +{ + { + lock_guard lock(g_mtx); + if (g_localPeerMemMap.find(uid_) != g_localPeerMemMap.end()) { + g_localPeerMemMap.erase(uid_); + } + } + + if (ipcMemInited_) { +#ifndef USE_MSSANITIZER + CloseIpcMem(); +#endif + ipcMemInited_ = false; + } + if (socketExchange_) { + delete socketExchange_; + socketExchange_ = nullptr; + } + FreePeerMem(commArgs_.dumpAddr); + FreePeerMem(peerMem_[rank_]); + FreePeerMem(commArgsPtr_); +} + +LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) +{ +} + +LcalComm::LcalComm(int rank, int rankSize, int bufferSize) : rank_(rank), rankSize_(rankSize), bufferSize_(bufferSize) +{ +} + +LcalComm::LcalComm(int rank, int rankSize, int bufferSize, int isEnableMagic) + : rank_(rank), rankSize_(rankSize), bufferSize_(bufferSize), isEnableMagic_(isEnableMagic) +{ +} + +LcalComm::LcalComm(int rank, int rankSize, LcalUniqueId commId) + : rank_(rank), rankSize_(rankSize), commId_(commId) +{ +} + +int LcalComm::GetRank() const +{ + return rank_; +} + +int LcalComm::GetRankSize() const +{ + return rankSize_; +} + +int LcalComm::GetCommSize() const +{ + return commSize_; +} + +int LcalComm::GetBufferSize() const +{ + return bufferSize_; +} + +const PhysicalInfo &LcalComm::GetPhysicalInfo() const +{ + return physicalInfo_; +} + +GM_ADDR LcalComm::GetCommArgsPtr() const +{ + return commArgsPtr_; +} + +CommArgs* LcalComm::GetCommArgs() const +{ + return &commArgs_; +} + + +std::string LcalComm::PrintDFX() +{ + if (commArgsPtr_ == nullptr) { + return "no comm args"; + } + int ret = aclrtMemcpy(&commArgs, sizeof(commArgs_), commArgsPtr_, sizeof(commArgs_), + ACL_MEMCPY_DEVICE_TO_HOST); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtMemCpy err " << __LINE__ << " " << ret; + return "aclrtMemcpy failed"; + } + stringstream ss; + ss << "CommArgs {" + << "\n rank: " << commArgs_.rank + << "\n localRank: " << commArgs_.localRank + << "\n rankSize: " << commArgs_.rankSize + << "\n localRankSize: " << commArgs_.localRankSize + << "\n extraFlag: 0x" << std::hex << std::setfill('0') << commArgs_.extraFlag << std::dec; + + ss << "\n peerMems: ["; + for (int i = 0; i < LCAL_MAX_RANK_SIZE; ++i) { + if (commArgs_.peerMems[i] == nullptr) { + continue; + } + if (i > 0) { + ss << ", "; + } + ss << "{id: " << static_cast(commArgs_.peerMems[i]) << "}"; + } + ss << "]"; + + ss << "\n magics: ["; + for (int i = 0; i < rankSize_; ++i) { + ss << std::dec << commArgs_.magics[i] << ","; + } + ss << "] \n"; + + ss << "\n dfx: ["; + const int dfxGroupCount = 5; + for (int i = 0; i < DFX_COUNT; ++i) { + if (i % dfxGroupCount == 0) { + ss << "\n " << std::dec << setw(dfxGroupCount) << i << ": "; + } + ss << "0x"<< std::hex << commArgs_.dfx[i] << std::dec << ", "; + } + ss << "\n ]"; + + ss << "\n }"; + return ss.str(); } +} \ No newline at end of file -- Gitee From 6a28b30deb8fd69e495e82f4c52de648138c8dec Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 19:21:10 +0800 Subject: [PATCH 160/414] fix --- comm/lcal/src/lcal_comm.cpp | 142 ++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index f6bedaa4..7c9ee7da 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -14,7 +14,7 @@ #include 
#include "mki/utils/log/log.h" #include "mki/utils/env/env.h" -#include "tools/socket/lcal_sock_excahnge.h" +#include "tools/socket/lcal_sock_exchange.h" #include "runtime/kernel.h" #include "runtime/mem.h" @@ -67,26 +67,26 @@ static const std::unordered_map CHIP_MAP = { ChipName GetChipName() { - static curChipName = ChipName::RESERVED; + static ChipName curChipName = ChipName::RESERVED; if (curChipName != ChipName::RESERVED) { return curChipName; } constexpr int socVerLength = 100; char ver[socVerLength]; - auto ret - rtGetSocVersion(ver, socVerLength); + auto ret = rtGetSocVersion(ver, socVerLength); if (ret != RT_ERROR_NONE) { MKI_LOG(ERROR) << "rtGetSocVersion failed, not sure whether the function is normal, please use it with caution"; return ChipName::RESERVED; } string chipName(ver); - MKI_LOG(DEBUG) << "rtGetSocVersion: -- The result after converting ver to string is :" << chipName; + MKI_LOG(DEBUG) << "rtGetSocVersion: -- The result after converting ver to string is : " << chipName; auto it = CHIP_MAP.find(chipName); if (it != CHIP_MAP.end()) { curChipName = it->second; } else { MKI_LOG(WARN) << "There is no commitment to the supported chip types yet," << - " and it is not certain whether the functions will work properly." + " and it is not certain whether the functions will work properly."; } return curChipName; } @@ -122,7 +122,7 @@ bool SkipUnusedChannel910B2C(int curRank, int peerRank, ChipName chipName) { if (chipName == ChipName::CHIP_910B2C) { constexpr int rankSizePerNode = 8; - if ((curRank / rankSizePerNode) != (peerRank / rankSizePerNode)) + if ((curRank / rankSizePerNode != peerRank / rankSizePerNode) && (std::abs(curRank - peerRank) != rankSizePerNode)) { return true; } @@ -139,7 +139,7 @@ int LcalComm::InitDumpAddr() int ret = 0; ret = aclrtMalloc(reinterpret_cast(&dumpAddr), dumpWorkspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { - MKI_LOG(ERROR) << "aclrtMalloc err " << __LINE__; + MKI_LOG(ERROR) << "aclrtMalloc err " << __LINE__ << " " << ret; return LCAL_ERROR_INTERNAL; } aclrtMemset(dumpAddr, dumpWorkspaceSize, 0, dumpWorkspaceSize); @@ -149,11 +149,11 @@ int LcalComm::InitDumpAddr() MKI_LOG(ERROR) << "std::malloc err " << __LINE__; return LCAL_ERROR_INTERNAL; } - errno_t result = memcpy_s(memory, dumpWorkspaceSize, 0, dumpWorkspaceSize); + errno_t result = memset_s(memory, dumpWorkspaceSize, 0, dumpWorkspaceSize); if (result != 0) { - MKI_LOG(ERROR) << "memcpy_s err " << __LINE__; + MKI_LOG(ERROR) << "memset_s err " << result; } - for (uint32_t i = 0; i < dumpCoreCnt; i++) { + for (uint32_t i = 0; i < dumpCoreCnt; ++i) { GM_ADDR block_start = memory + i * dumpSizePerCore; GM_ADDR deviceBlockStart = dumpAddr + i * dumpSizePerCore; @@ -183,8 +183,8 @@ int LcalComm::SyncCommArgs() commArgs_.localRank = localRank_; commArgs_.rankSize = rankSize_; commArgs_.localRankSize = localRankSize_; - for (int i = 0; i < rankSize_; i++) { - commArgs_.peerMems[i] = peerMems_[i]; + for (int i = 0; i < rankSize_; ++i) { + commArgs_.peerMems[i] = peerMem_[i]; } if (isEnableMsprofOp_ && InitDumpAddr() != LCAL_SUCCESS) { @@ -225,19 +225,19 @@ int LcalComm::InitCommon() const char *lcclDeterministic = Mki::GetEnv("LCCL_DETERMINISTIC"); if (lcclDeterministic && (string(lcclDeterministic) == "1" || string(lcclDeterministic) == "true")) { deterministic_ = true; - commArgs.extraFlag |= ExtraFlag::DETERMINiSTIC; + commArgs_.extraFlag |= ExtraFlag::DETERMINISTIC; } if (GetChipName() == ChipName::CHIP_910B2C) { - commArgs.extraFlag |= ExtraFlag::TOPO_910B2C; + 
commArgs_.extraFlag |= ExtraFlag::TOPO_910B2C; } if (GetChipName() >= ChipName::CHIP_910_9391) { - commArgs.extraFlag |= ExtraFlag::TOPO_910_93; + commArgs_.extraFlag |= ExtraFlag::TOPO_910_93; } - if (GetChipName() >= ChipName::CHIP_910_9362) { - commArgs.extraFlag |= ExtraFlag::TOPO_910A5; + if (GetChipName() > ChipName::CHIP_910_9362) { + commArgs_.extraFlag |= ExtraFlag::TOPO_910A5; } if (GetCoreNum(GetChipName()) > AI_CORE_NUM_20) { - commArgs.extraFlag |= ExtraFlag::IS_GREATER_THAN_40_AIV; + commArgs_.extraFlag |= ExtraFlag::IS_GREATER_THAN_40_AIV; } ReportTiming report("LcclReporting", rank_, false, nullptr, nullptr); @@ -265,14 +265,14 @@ int LcalComm::InitCommon() void LcalComm::CloseIpcMem() { - for (int i = 0; i < rankSize_; i++) { + for (int i = 0; i < rankSize_; ++i) { if (i == rank_ || peerMem_[i] == nullptr) { continue; } int ret = rtIpcCloseMemory(static_cast(peerMem_[i])); if (ret != RT_ERROR_NONE) { - MKI_LOG(ERROR) << "Close ipc[" << i << "] memory failed! ret = " << ret; + MKI_LOG(WARN) << "Close ipc[" << i << "] memory failed! ret: " << ret; } peerMem_[i] = nullptr; @@ -284,7 +284,7 @@ void LcalComm::FreePeerMem(GM_ADDR &mem) const if (mem != nullptr) { aclError aclRet = aclrtFree(mem); if (aclRet != ACL_SUCCESS) { - MKI_LOG(ERROR) << "Free share memory failed! ret " << aclRet; + MKI_LOG(ERROR) << "Free share memory failed! ret: " << aclRet; } } mem = nullptr; @@ -295,8 +295,8 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { - MKI_LOG(ERROR) << "The rank is invalid! rank: " << rank_ << ", rankSize: " << rankSize_; + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + MKI_LOG(ERROR) << "The rank is invalid! rank: " << rank_ << " rankSize: " << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } if (LcalSockExchange::CheckValid(commId_)) { @@ -317,7 +317,7 @@ int LcalComm::Init() MKI_LOG(INFO) << "rank " << rank_ << "/" << rankSize_ << " running devId:" << devId_; if (InitCommon() != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "Init common failed!"; + MKI_LOG(ERROR) << "init common failed!"; return LCAL_ERROR_INTERNAL; } @@ -328,9 +328,9 @@ int LcalComm::Init() } MKI_LOG(DEBUG) << "InitCommMem " << rank_ << "/" << rankSize_ << ", localRank_ : " << localRank_ << ", localRankSize_ : " << localRankSize_ << " success"; - + SyncCommArgs(); - MKI_LOG(INFO) << "LcalCommInit " << rank_ << "/" << rankSize_ << " success and extraFlag: " << commArgs.extraFlag << + MKI_LOG(INFO) << "LcalCommInit " << rank_ << "/" << rankSize_ << " success and extraFlag: " << commArgs_.extraFlag << " commArgs_.localRank : " << commArgs_.localRank << " commArgs_.localRankSize : " << commArgs_.localRankSize; inited_ = true; delete socketExchange_; @@ -343,8 +343,8 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { - MKI_LOG(ERROR) << "The rank is invalid! rank: " << rank_ << ", rankSize: " << rankSize_; + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << " rankSize: " << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } if (GetDevThread(uid) != LCAL_SUCCESS) { @@ -354,13 +354,13 @@ int LcalComm::InitThread(const std::string &uid) MKI_LOG(INFO) << "rank " << rank_ << "/" << rankSize_ << " running devId:" << devId_ << "uid: " << uid; if (InitCommon() != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "Init common failed!"; + MKI_LOG(ERROR) << "init common failed!"; return LCAL_ERROR_INTERNAL; } { lock_guard lock(g_mtx); if (g_localPeerMemMap.find(uid) == g_localPeerMemMap.end()) { - for (int i = 0; i < rankSize_; i++) { + for (int i = 0; i < rankSize_; ++i) { g_localPeerMemMap[uid][i] = nullptr; } } @@ -377,7 +377,7 @@ int LcalComm::InitThread(const std::string &uid) if (elapsed.count() > LCAL_INIT_TIMEOUT) { MKI_LOG(ERROR) << "Lccl Init timeout!"; FreePeerMem(g_localPeerMemMap[uid][rank_]); - return LCAL_ERROR_INTERNAL; + return LCAL_ERROR_TIMEOUT; } } peerMem_[i] = g_localPeerMemMap[uid][i]; @@ -411,10 +411,10 @@ int LcalComm::EnablePeerAccess() if (value == TOPOLOGY_HCCS || value == TOPOLOGY_SIO || value == TOPOLOGY_HCCS_SW || GetChipName() == ChipName::CHIP_910B2C) { physicalInfo_.physicalLink = PhysicalLink::HCCS; - commArgs.extraFlag &= ~(ExtraFlag::TOPO_PCIE); - } else if (physicalInfo_.physicalLink = PhysicalLink::RESERVED) { + commArgs_.extraFlag &= ~(ExtraFlag::TOPO_PCIE); + } else if (physicalInfo_.physicalLink == PhysicalLink::RESERVED) { physicalInfo_.physicalLink = PhysicalLink::PCIE; - commArgs.extraFlag |= ExtraFlag::TOPO_PCIE; + commArgs_.extraFlag |= ExtraFlag::TOPO_PCIE; if (rankSize_ > PING_PONG_SIZE) { MKI_LOG(ERROR) << "do not support pcie > 2 rank! rankSize_ = " << rankSize_; return LCAL_ERROR_INTERNAL; @@ -430,7 +430,7 @@ int LcalComm::EnablePeerAccess() aclError ret = aclrtDeviceEnablePeerAccess(dev, 0); if (ret != ACL_SUCCESS) { - MKI_LOG(ERROR) << "err aclrtDeviceEnablePeerAccess failed peerDeviceId = " << dev << ",rank = " << rank_ + MKI_LOG(ERROR) << "err aclrtDeviceEnablePeerAccess failed peerDeviceId = " << dev << " ,rank = " << rank_ << ", value = " << value << ", flags = " << 0 << "," << __LINE__ << ": " << ret; return LCAL_ERROR_INTERNAL; } @@ -450,12 +450,12 @@ int LcalComm::GetDevThread(const std::string &uid) { std::lock_guard lock(g_mtx); if (g_devList.find(uid) == g_devList.end()) { - for (int i = 0; i < rankSize_; i++) { + for (int i = 0; i < rankSize_; ++i) { g_devList[uid][i] = 0; } } } - g_devList[uid][rank_] = devid_ + 1; + g_devList[uid][rank_] = devId_ + 1; auto start = high_resolution_clock::now(); for (int i = 0; i < rankSize_; ++i) { while (g_devList[uid][i] == 0) { @@ -473,8 +473,8 @@ int LcalComm::GetDevThread(const std::string &uid) int LcalComm::InitMem() { - constexpr int32_t bufferSizeUnit = 1024 * 1024; - int lcalBuffSize = bufferSize_ * bufferSizeUnit + LCAL_FLAG_BUFF_BYTES; + constexpr int32_t bufferSizeUint = 1024 * 1024; + int lcalBuffSize = bufferSize_ * bufferSizeUint + LCAL_FLAG_BUFF_BYTES; MKI_LOG(DEBUG) << "lcal buffer size " << lcalBuffSize; aclError ret = aclrtMalloc( @@ -491,8 +491,8 @@ int LcalComm::InitMem() int LcalComm::GetPid(uint32_t *pids) { - if (rtDeviceGetBaraTgid(&pids[rank_]) != RT_ERROR_NONE) { - MKI_LOG(ERROR) << "DeviceGetBaraTgid err " << __LINE__; + if (rtDeviceGetBareTgid(&pids[rank_]) != RT_ERROR_NONE) { + MKI_LOG(ERROR) << "DeviceGetBareTgid err " << __LINE__; return LCAL_ERROR_INTERNAL; } int ret = socketExchange_->AllGather(&pids[rank_], 1, pids); @@ -501,7 +501,7 @@ int LcalComm::GetPid(uint32_t *pids) return ret; } for (int i = 0; i < rankSize_; 
++i) { - MKI_LOG(DEBUG) << "rank: " << rank_ << ", otherRank : " << i << " pid[" << i << "]: " << pids[i]; + MKI_LOG(DEBUG) << "rank : " << rank_ << ", otherRank : " << i << " pid[" << i << "]: " << pids[i]; } MKI_LOG(DEBUG) << "AllGather: Get other rank pid"; return LCAL_SUCCESS; @@ -509,7 +509,7 @@ int LcalComm::GetPid(uint32_t *pids) int LcalComm::GetSidId(int64_t sdids[LCAL_MAX_RANK_SIZE], int rankSize) { - if (rank_ > rankSize) { + if (rank_ >= rankSize) { MKI_LOG(ERROR) << "LcalComm::GetSidId err rank_ >= rankSize " << rank_ << ">=" << rankSize; return LCAL_ERROR_INTERNAL; } @@ -520,7 +520,7 @@ int LcalComm::GetSidId(int64_t sdids[LCAL_MAX_RANK_SIZE], int rankSize) MKI_LOG(ERROR) << "DeviceGetDeviceInfo err " << __LINE__; return LCAL_ERROR_INTERNAL; } - MKI_LOG(DEBUG) << "rank" << rank_ << " dev id: " << devList_[rank_] + MKI_LOG(DEBUG) << "rank " << rank_ << " dev id: " << devList_[rank_] << " rtGetDeviceInfo sdid: " << sdids[rank_]; int ret = socketExchange_->AllGather(&sdids[rank_], 1, sdids); @@ -529,7 +529,7 @@ int LcalComm::GetSidId(int64_t sdids[LCAL_MAX_RANK_SIZE], int rankSize) return ret; } for (int i = 0; i < rankSize_; ++i) { - MKI_LOG(DEBUG) << "rank: " << i << " sdid: " << sdids[i]; + MKI_LOG(DEBUG) << "rank " << i << " sdid: " << sdids[i]; } MKI_LOG(DEBUG) << "AllGather: Get other rank sdid"; } @@ -541,11 +541,11 @@ int LcalComm::GetName(string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE int ret = socketExchange_->AllGather(name.c_str(), IPC_NAME_SIZE, names[0]); if (ret != LCAL_SUCCESS) { MKI_LOG(ERROR) << "LcalSockExchange AllGather error! ret: " << ret; - return ret; + return LCAL_ERROR_INTERNAL } for (int i = 0; i < rankSize_; ++i) { names[i][IPC_NAME_SIZE - 1] = '\0'; - MKI_LOG(DEBUG) << "rank: " << i << " mem name: " << names[i]; + MKI_LOG(DEBUG) << "rank " << i << " mem name: " << names[i]; } MKI_LOG(DEBUG) << "AllGather: Get other rank mem name"; return LCAL_SUCCESS; @@ -575,12 +575,12 @@ int LcalComm::InitCommMem() string name; if (SetMemoryName(name) != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "SetMemoryName error!"; + MKI_LOG(ERROR) << "SetMemoryName err "; return LCAL_ERROR_INTERNAL; } if (SetIpcPidSdid(name, pids, sdids) != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "SetIpcPidSdid error!"; + MKI_LOG(ERROR) << "SetIpcPidSdid failed!"; return LCAL_ERROR_INTERNAL; } @@ -626,9 +626,9 @@ int LcalComm::SetMemoryName(string &name) { char nameModified[IPC_NAME_SIZE] = {}; int memRank = rank_; - constexpr int32_t bufferSizeUnit = 1024 * 1024; - int lcalBuffSize = bufferSize_ * bufferSizeUnit + LCAL_FLAG_BUFF_BYTES; - if (rtIpcSetMemoryName(peerMem_[memRank, lcalBuffSize, nameModified, IPC_NAME_SIZE]) != RT_ERROR_NONE) { + constexpr int32_t bufferSizeUint = 1024 * 1024; + int lcalBuffSize = bufferSize_ * bufferSizeUint + LCAL_FLAG_BUFF_BYTES; + if (rtIpcSetMemoryName(peerMem_[memRank], lcalBuffSize, nameModified, IPC_NAME_SIZE) != RT_ERROR_NONE) { return LCAL_ERROR_INTERNAL; } name = nameModified; @@ -637,12 +637,12 @@ int LcalComm::SetMemoryName(string &name) int LcalComm::SetIpcPidSdid(string &name, const uint32_t *pids, const int64_t *sdids) const { - for (int i =0; i < rankSize_; ++i) { + for (int i = 0; i < rankSize_; ++i) { if (i == rank_) { continue; } - if (physicalInfo_.chipName < ChipName::RESERVED) { + if (physicalInfo_.chipName < ChipName::CHIP_910_9391) { int32_t pidInt32 = pids[i]; int rtRet = rtSetIpcMemPid(name.c_str(), &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); if (rtRet != RT_ERROR_NONE) { @@ -693,8 +693,8 @@ LcalComm::LcalComm(int rank, int rankSize, int 
bufferSize) : rank_(rank), rankSi { } -LcalComm::LcalComm(int rank, int rankSize, int bufferSize, int isEnableMagic) - : rank_(rank), rankSize_(rankSize), bufferSize_(bufferSize), isEnableMagic_(isEnableMagic) +LcalComm::LcalComm(int rank, int rankSize, int commDomain, int bufferSize, int isEnableMagic) + : rank_(rank), rankSize_(rankSize), commDomain_(commDomain), bufferSize_(bufferSize), isEnableMix_(isEnableMagic) { } @@ -713,7 +713,7 @@ int LcalComm::GetRankSize() const return rankSize_; } -int LcalComm::GetCommSize() const +int LcalComm::GetCommSize() { return commSize_; } @@ -744,21 +744,21 @@ std::string LcalComm::PrintDFX() if (commArgsPtr_ == nullptr) { return "no comm args"; } - int ret = aclrtMemcpy(&commArgs, sizeof(commArgs_), commArgsPtr_, sizeof(commArgs_), + int ret = aclrtMemcpy(&commArgs_, sizeof(commArgs_), commArgsPtr_, sizeof(commArgs_), ACL_MEMCPY_DEVICE_TO_HOST); if (ret != ACL_SUCCESS) { - MKI_LOG(ERROR) << "aclrtMemCpy err " << __LINE__ << " " << ret; - return "aclrtMemcpy failed"; + MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret; + return "acl mem copy error"; } stringstream ss; ss << "CommArgs {" - << "\n rank: " << commArgs_.rank - << "\n localRank: " << commArgs_.localRank - << "\n rankSize: " << commArgs_.rankSize - << "\n localRankSize: " << commArgs_.localRankSize - << "\n extraFlag: 0x" << std::hex << std::setfill('0') << commArgs_.extraFlag << std::dec; + << "\n rank: " << commArgs_.rank + << "\n localRank: " << commArgs_.localRank + << "\n rankSize: " << commArgs_.rankSize + << "\n localRankSize: " << commArgs_.localRankSize + << "\n extraFlag: 0x" << std::hex << std::setfill('0') << commArgs_.extraFlag << std::dec; - ss << "\n peerMems: ["; + ss << "\n peerMems: ["; for (int i = 0; i < LCAL_MAX_RANK_SIZE; ++i) { if (commArgs_.peerMems[i] == nullptr) { continue; @@ -770,23 +770,23 @@ std::string LcalComm::PrintDFX() } ss << "]"; - ss << "\n magics: ["; + ss << "\n magics: ["; for (int i = 0; i < rankSize_; ++i) { ss << std::dec << commArgs_.magics[i] << ","; } ss << "] \n"; - ss << "\n dfx: ["; + ss << "\n dfx: ["; const int dfxGroupCount = 5; for (int i = 0; i < DFX_COUNT; ++i) { if (i % dfxGroupCount == 0) { ss << "\n " << std::dec << setw(dfxGroupCount) << i << ": "; } - ss << "0x"<< std::hex << commArgs_.dfx[i] << std::dec << ", "; + ss << "0x"<< std::hex << commArgs_.dfx[i] << std::dec << ", "; } - ss << "\n ]"; + ss << "\n ]"; - ss << "\n }"; + ss << "\n}"; return ss.str(); } -- Gitee From 16d69bc153a27c072d58fd8b06e768d4b6438f34 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 19:25:43 +0800 Subject: [PATCH 161/414] fix --- comm/lcal/src/lcal_comm.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 7c9ee7da..16f41917 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -63,7 +63,7 @@ static const std::unordered_map CHIP_MAP = { {"Ascend910_9372", ChipName::CHIP_910_9372}, {"Ascend910_9361", ChipName::CHIP_910_9361}, {"Ascend910_9362", ChipName::CHIP_910_9362} -} +}; ChipName GetChipName() { @@ -79,7 +79,7 @@ ChipName GetChipName() return ChipName::RESERVED; } string chipName(ver); - MKI_LOG(DEBUG) << "rtGetSocVersion: -- The result after converting ver to string is : " << chipName; + MKI_LOG(DEBUG) << "rtGetSocVersion -- The result after converting ver to string is:" << chipName; auto it = CHIP_MAP.find(chipName); if (it != CHIP_MAP.end()) { @@ -259,7 +259,7 @@ int 
LcalComm::InitCommon() MKI_LOG(INFO) << "LcalComm::InitCommon RegistKernel opGroup " << opGroup; RegistKernel(opGroup); - lcalRank_ = rank % localRankSize_; + localRank_ = rank_ % localRankSize_; return LCAL_SUCCESS; } @@ -296,7 +296,7 @@ int LcalComm::Init() return LCAL_SUCCESS; } if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { - MKI_LOG(ERROR) << "The rank is invalid! rank: " << rank_ << " rankSize: " << rankSize_; + MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } if (LcalSockExchange::CheckValid(commId_)) { @@ -327,10 +327,10 @@ int LcalComm::Init() return LCAL_ERROR_INTERNAL; } MKI_LOG(DEBUG) << "InitCommMem " << rank_ << "/" << rankSize_ << ", localRank_ : " << localRank_ << - ", localRankSize_ : " << localRankSize_ << " success"; + ", localRankSize_ : " << localRankSize_ << " success"; SyncCommArgs(); - MKI_LOG(INFO) << "LcalCommInit " << rank_ << "/" << rankSize_ << " success and extraFlag: " << commArgs_.extraFlag << + MKI_LOG(INFO) << "LcalCommInit " << rank_ << "/" << rankSize_ << " success and extraFlag:" << commArgs_.extraFlag << " commArgs_.localRank : " << commArgs_.localRank << " commArgs_.localRankSize : " << commArgs_.localRankSize; inited_ = true; delete socketExchange_; @@ -344,7 +344,7 @@ int LcalComm::InitThread(const std::string &uid) return LCAL_SUCCESS; } if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { - MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << " rankSize: " << rankSize_; + MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } if (GetDevThread(uid) != LCAL_SUCCESS) { @@ -541,7 +541,7 @@ int LcalComm::GetName(string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE int ret = socketExchange_->AllGather(name.c_str(), IPC_NAME_SIZE, names[0]); if (ret != LCAL_SUCCESS) { MKI_LOG(ERROR) << "LcalSockExchange AllGather error! 
ret: " << ret; - return LCAL_ERROR_INTERNAL + return LCAL_ERROR_INTERNAL; } for (int i = 0; i < rankSize_; ++i) { names[i][IPC_NAME_SIZE - 1] = '\0'; @@ -713,7 +713,7 @@ int LcalComm::GetRankSize() const return rankSize_; } -int LcalComm::GetCommSize() +int LcalComm::GetCommSize() const { return commSize_; } @@ -733,7 +733,7 @@ GM_ADDR LcalComm::GetCommArgsPtr() const return commArgsPtr_; } -CommArgs* LcalComm::GetCommArgs() const +CommArgs* LcalComm::GetCommArgs() { return &commArgs_; } @@ -756,7 +756,7 @@ std::string LcalComm::PrintDFX() << "\n localRank: " << commArgs_.localRank << "\n rankSize: " << commArgs_.rankSize << "\n localRankSize: " << commArgs_.localRankSize - << "\n extraFlag: 0x" << std::hex << std::setfill('0') << commArgs_.extraFlag << std::dec; + << "\n extraFlag: 0x" << std::hex << std::setfill('0') << commArgs_.extraFlag << std::dec; ss << "\n peerMems: ["; for (int i = 0; i < LCAL_MAX_RANK_SIZE; ++i) { -- Gitee From 3d6be118383ca787903780480440b4ca9fce43db Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 19:34:05 +0800 Subject: [PATCH 162/414] fix --- comm/lcal/src/lcal_comm.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 16f41917..5c4f3c1f 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -439,6 +439,38 @@ int LcalComm::EnablePeerAccess() return LCAL_SUCCESS; } +int LcalComm::GetDev() +{ + int nodeNum = socketExchange_->GetNodeNum(); + if (nodeNum <= 0 || nodeNum > rankSize_) { + MKI_LOG(ERROR) << "error! node num : " << nodeNum << " rank size: " << rankSize_; + return LCAL_ERROR_INTERNAL; + } + localRankSize_ = rankSize_ / nodeNum; + localRank_ = rank_ % localRankSize_; + MKI_LOG(DEBUG) << "GetDev : localRankSize_ : " << localRankSize_ << " localRank_: " << localRank_ + << " rank :" << rank_ << " rankSize :" << rankSize_; + devList_.resize(rankSize_); + aclError aclRet = aclrtGetDevice(&devId_); + if (aclRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtGetDevice error! ret: " << aclRet; + return LCAL_ERROR_INTERNAL; + } + int ret = socketExchange_->AllGather(&devId_, 1, devList_.data()); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "LcalSockExchange AllGather error! ret: " << ret; + return LCAL_ERROR_INTERNAL; + } + std::string devIdStr = ""; + for (int i = 0; i < rankSize_; ++i) { + devIdStr += (i == 0 ? "" : ", "); + devIdStr += to_string(devList_[i]); + } + MKI_LOG(DEBUG) << "rank " << rank_ << " devId: " << devId_ << ", otherDevList : " << devIdStr; + MKI_LOG(INFO) << "AllGather: Get other rank dev id success"; + return LCAL_SUCCESS; +} + int LcalComm::GetDevThread(const std::string &uid) { devList_.resize(rankSize_); -- Gitee From 082aab38f0b632a96ec831c8df72761c9ad812e2 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 19:37:40 +0800 Subject: [PATCH 163/414] fix --- comm/lcal/src/lcal_comm.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 5c4f3c1f..ae964964 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -1,3 +1,12 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ #include #include #include "lcal_internal.h" -- Gitee From 1590beb4fdbc4092449fcfc81c5e08bdd7ad0e8b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 19:58:22 +0800 Subject: [PATCH 164/414] draft --- comm/lcal/src/lcal_internal.cpp | 50 +++++++++++++++++++++++++++++++++ comm/lcal/src/lcal_internal.h | 32 +++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 comm/lcal/src/lcal_internal.cpp create mode 100644 comm/lcal/src/lcal_internal.h diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp new file mode 100644 index 00000000..2027a988 --- /dev/null +++ b/comm/lcal/src/lcal_internal.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "lcal_internal.h" +#include +#include +#include +#include +#include +#include +#include "ccl_kernel_args.h" +#include "coc_kernel_args.h" +#include "lcoc.h" + +using namespace std; +using namespace Mki; + +extern const int LCAL_CCE_BIN_STR[]; +asm(R"(.section .rodata, "a", @progbits +LCAL_CCE_BIN_STR:.incbin "/tmp/lcal_cce.o" +.byte 0 +.previous)"); + +constexpr int LCCL_RT_DEV_BINARY_MAGIC_ELF_AIVEC = 0x41415246; +constexpr int COC_RT_DEV_BINARY_MAGIC_ELF = 0x43554245; + +namespcae Lcal { +const std::map DATATYPE2NAME = { + {HCCL_DATA_TYPE_INT32, "int"}, + {HCCL_DATA_TYPE_INT16, "int16_t"}, + {HCCL_DATA_TYPE_INT8, "int8_t"}, + {HCCL_DATA_TYPE_INT64, "int64_t"}, + {HCCL_DATA_TYPE_FP32, "float"}, + {HCCL_DATA_TYPE_FP16, "float16_t"}, + {HCCL_DATA_TYPE_BFP16, "bfloat16_t"}, +}; + + +template +int RegisterBinaryKernel() + + + +} \ No newline at end of file diff --git a/comm/lcal/src/lcal_internal.h b/comm/lcal/src/lcal_internal.h new file mode 100644 index 00000000..6106a2a9 --- /dev/null +++ b/comm/lcal/src/lcal_internal.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_INTERNAL_H +#define LCAL_INTERNAL_H + +#include +#include +#include "lcal_types.h" +#include "coc_kernel_args.h" +#include "ccl_kernel_args.h" + +namespace Lcal { + +int RegisterKernel(const int32_t opGroup = 0); + +int64_t Count2Size(int64_t count, const HcclDataType &dataType); + +int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream); + +int LoadMTE(LocalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream); + +int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dataType, aclrtStream stream); +} + +#endif \ No newline at end of file -- Gitee From 6482a63d302cb71061520e886c40abd3c53e2428 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:00:04 +0800 Subject: [PATCH 165/414] draft --- comm/lcal/src/lcal_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/lcal_internal.h b/comm/lcal/src/lcal_internal.h index 6106a2a9..2a0d3897 100644 --- a/comm/lcal/src/lcal_internal.h +++ b/comm/lcal/src/lcal_internal.h @@ -18,13 +18,13 @@ namespace Lcal { -int RegisterKernel(const int32_t opGroup = 0); +int RegistKernel(const int32_t opGroup = 0); int64_t Count2Size(int64_t count, const HcclDataType &dataType); int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream); -int LoadMTE(LocalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream); +int LoadMTE(LcalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream); int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dataType, aclrtStream stream); } -- Gitee From 4b7f7a8fa0f4ec5af915301038787566265f37e6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:18:19 +0800 Subject: [PATCH 166/414] draft --- comm/lcal/src/lcal_internal.cpp | 90 +++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 2027a988..32557e8d 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -30,7 +30,7 @@ LCAL_CCE_BIN_STR:.incbin "/tmp/lcal_cce.o" constexpr int LCCL_RT_DEV_BINARY_MAGIC_ELF_AIVEC = 0x41415246; constexpr int COC_RT_DEV_BINARY_MAGIC_ELF = 0x43554245; -namespcae Lcal { +namespace Lcal { const std::map DATATYPE2NAME = { {HCCL_DATA_TYPE_INT32, "int"}, {HCCL_DATA_TYPE_INT16, "int16_t"}, @@ -38,13 +38,97 @@ const std::map DATATYPE2NAME = { {HCCL_DATA_TYPE_INT64, "int64_t"}, {HCCL_DATA_TYPE_FP32, "float"}, {HCCL_DATA_TYPE_FP16, "float16_t"}, - {HCCL_DATA_TYPE_BFP16, "bfloat16_t"}, + {HCCL_DATA_TYPE_BFP16, "bfloat16_t"} }; template -int RegisterBinaryKernel() +int RegisterBinaryKernel(const string &funcName, int8_t *funSig, const T *binStrPtr, int magic, int len = 0) +{ + rtDevBinary_t binary; + void *binHandle = nullptr; + binary.data = binStrPtr; + binary.length = (len == 0 ? LCAL_1OP_BIN_SIZE : len); + binary.magic = magic; + binary.version = 0; + rtError_t rtRet = rtDevBinaryRegister(&binary, &binHandle); + if (rtRet != RT_ERROR_NONE) { + MKI_LOG(WARN) << "rtDevBinaryRegister failed! " << to_string(rtRet) << ", funcName = " << funcName; + return LCAL_ERROR_INTERNAL; + } + rtRet = rtFunctionRegister(binHandle, funSig, funcName.c_str(), funcName.c_str(), 0); + if (rtRet != RT_ERROR_NONE) { + MKI_LOG(WARN) << "rtFunctionRegister failed! 
" << to_string(rtRet) << ", funcName = " << funcName; + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int8_t *GetFunSig(LcalType type, HcclDataType dataType, uint64_t devType = 0) +{ + constexpr int sigOffset = 16; + constexpr int sigSkew = 0x1000; + return reinterpret_cast((static_cast(type) << sigOffset << sigOffset) + + (static_cast(dataType)<< sigOffset) + devType + sigSkew); +} + + +const int* FindNextOpStart(const int opStartMagic, const int* cclBinEndPtr, const int* cclBinPtr) +{ + if (cclBinPtr == nullptr) { + MKI_LOG(ERROR) << "FindNextOpStart failed! cclBinPtr is nullptr"; + return nullptr; + } + while (*cclBinPtr != opStartMagic and cclBinPtr < cclBinEndPtr) { + cclBinPtr++; + } + if (*cclBinPtr == opStartMagic) { + cclBinPtr++; + } + return cclBinPtr; +} + +int RegistCCLOp2Kernel(const int* cclBinPtr, const int* nextPtr) +{ + vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, + HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, + HCCL_DATA_TYPE_INT64 }; + std::vector registerCCLTypesOp2 = { + LcalType::ALL_GATHER, LcalType::REDUCE_SCATTER, LcalType::ALL2ALL, + }; + int res = LCAL_SUCCESS; + for (auto ccl : registerCCLTypesOp2) { + for (auto t : registerTypes) { + res = RegisterBinaryKernel(LCAL_TYPE2NAME.at(ccl) + "_" + DATATYPE2NAME.at(t), GetFunSig(ccl, t), + cclBinPtr, LCCL_RT_DEV_BINARY_MAGIC_ELF_AIVEC, (nextPtr - cclBinPtr) * sizeof(int)); + } + } + if (res != LCAL_SUCCESS) { + return res; + } + res = RegisterBinaryKernel(LCAL_TYPE2NAME.at(LcalType::BROADCAST), + GetFunSig(LcalType::BROADCAST, HCCL_DATA_TYPE_RESERVED), cclBinPtr, LCCL_RT_DEV_BINARY_MAGIC_ELF_AIVEC); + return res; +} + +int RegistCCLOp1Kernel(const int* cclBinPtr, const int* nextPtr) +{ + vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, + HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, + HCCL_DATA_TYPE_INT64 }; + std::vector registerCCLTypesOp1 = { + LcalType::ALL_REDUCE, + }; + int res = LCAL_SUCCESS; + for (auto ccl : registerCCLTypesOp1) { + for (auto t : registerTypes) { + res = RegisterBinaryKernel(LCAL_TYPE2NAME.at(ccl) + "_" + DATATYPE2NAME.at(t), GetFunSig(ccl, t), + cclBinPtr, LCCL_RT_DEV_BINARY_MAGIC_ELF_AIVEC, (nextPtr - cclBinPtr) * sizeof(int)); + } + } + return res; +} } \ No newline at end of file -- Gitee From fd9281019771e5812e387bc0288744d97e6f0238 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:19:26 +0800 Subject: [PATCH 167/414] draft --- comm/lcal/src/lcal_internal.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 32557e8d..ce59ebb2 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -74,16 +74,16 @@ int8_t *GetFunSig(LcalType type, HcclDataType dataType, uint64_t devType = 0) } -const int* FindNextOpStart(const int opStartMagic, const int* cclBinEndPtr, const int* cclBinPtr) +const int* FindNextOpStart(const int opStartMaigc, const int* cclBinEndPtr, const int* cclBinPtr) { if (cclBinPtr == nullptr) { MKI_LOG(ERROR) << "FindNextOpStart failed! 
cclBinPtr is nullptr"; return nullptr; } - while (*cclBinPtr != opStartMagic and cclBinPtr < cclBinEndPtr) { + while (*cclBinPtr != opStartMaigc and cclBinPtr < cclBinEndPtr) { cclBinPtr++; } - if (*cclBinPtr == opStartMagic) { + if (*cclBinPtr == opStartMaigc) { cclBinPtr++; } return cclBinPtr; -- Gitee From c7cad14e7212d2ad2e70a6065146b6dc02eef276 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:20:34 +0800 Subject: [PATCH 168/414] draft --- comm/lcal/src/lcal_internal.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index ce59ebb2..16b973f2 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -32,13 +32,13 @@ constexpr int COC_RT_DEV_BINARY_MAGIC_ELF = 0x43554245; namespace Lcal { const std::map DATATYPE2NAME = { - {HCCL_DATA_TYPE_INT32, "int"}, - {HCCL_DATA_TYPE_INT16, "int16_t"}, - {HCCL_DATA_TYPE_INT8, "int8_t"}, - {HCCL_DATA_TYPE_INT64, "int64_t"}, - {HCCL_DATA_TYPE_FP32, "float"}, - {HCCL_DATA_TYPE_FP16, "float16_t"}, - {HCCL_DATA_TYPE_BFP16, "bfloat16_t"} + { HCCL_DATA_TYPE_INT32, "int" }, + { HCCL_DATA_TYPE_INT16, "int16_t" }, + { HCCL_DATA_TYPE_INT8, "int8_t" }, + { HCCL_DATA_TYPE_INT64, "int64_t" }, + { HCCL_DATA_TYPE_FP32, "float" }, + { HCCL_DATA_TYPE_FP16, "float16_t" }, + { HCCL_DATA_TYPE_BFP16, "bfloat16_t "} }; -- Gitee From d8b700e903777c8904dade9eb845d26ec014a7fe Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:20:52 +0800 Subject: [PATCH 169/414] draft --- comm/lcal/src/lcal_internal.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 16b973f2..5cd35545 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -41,7 +41,6 @@ const std::map DATATYPE2NAME = { { HCCL_DATA_TYPE_BFP16, "bfloat16_t "} }; - template int RegisterBinaryKernel(const string &funcName, int8_t *funSig, const T *binStrPtr, int magic, int len = 0) { @@ -73,7 +72,6 @@ int8_t *GetFunSig(LcalType type, HcclDataType dataType, uint64_t devType = 0) (static_cast(dataType)<< sigOffset) + devType + sigSkew); } - const int* FindNextOpStart(const int opStartMaigc, const int* cclBinEndPtr, const int* cclBinPtr) { if (cclBinPtr == nullptr) { -- Gitee From b7d215e2062f816e6a480fad5267f8930c01c10a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:21:22 +0800 Subject: [PATCH 170/414] draft --- comm/lcal/src/lcal_internal.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 5cd35545..94860b79 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -38,7 +38,7 @@ const std::map DATATYPE2NAME = { { HCCL_DATA_TYPE_INT64, "int64_t" }, { HCCL_DATA_TYPE_FP32, "float" }, { HCCL_DATA_TYPE_FP16, "float16_t" }, - { HCCL_DATA_TYPE_BFP16, "bfloat16_t "} + { HCCL_DATA_TYPE_BFP16, "bfloat16_t" } }; template -- Gitee From d3f5d4b3c425abe78b9de850e4ecc10d97bbcc2a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:21:49 +0800 Subject: [PATCH 171/414] draft --- comm/lcal/src/lcal_internal.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 94860b79..248726bd 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -90,7 +90,7 @@ const int* FindNextOpStart(const int opStartMaigc, 
const int* cclBinEndPtr, cons int RegistCCLOp2Kernel(const int* cclBinPtr, const int* nextPtr) { vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, - HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, + HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, HCCL_DATA_TYPE_INT64 }; std::vector registerCCLTypesOp2 = { LcalType::ALL_GATHER, LcalType::REDUCE_SCATTER, LcalType::ALL2ALL, @@ -113,7 +113,7 @@ int RegistCCLOp2Kernel(const int* cclBinPtr, const int* nextPtr) int RegistCCLOp1Kernel(const int* cclBinPtr, const int* nextPtr) { vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, - HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, + HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, HCCL_DATA_TYPE_INT64 }; std::vector registerCCLTypesOp1 = { LcalType::ALL_REDUCE, -- Gitee From e16ddfec7bdda101f1f184537f33671beb3ffcd3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:39:24 +0800 Subject: [PATCH 172/414] draft --- comm/lcal/src/lcal_internal.cpp | 95 +++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 248726bd..7a58757e 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -128,5 +128,100 @@ int RegistCCLOp1Kernel(const int* cclBinPtr, const int* nextPtr) return res; } +int RegistCCLKernel(const int32_t opGroup) +{ + const int* cclBinStr = LCAL_CCE_BIN_STR; + auto cclBinEndPtr = cclBinStr + LCAL_1OP_BIN_SIZE / sizeof(int); + const int* cclBinPtr = cclBinStr + 1; + constexpr int opStartMaigc = 0x44444444; + const int* nextPtr = FindNextOpStart(opStartMaigc, cclBinEndPtr, cclBinPtr); + if (nextPtr == nullptr) { + return LCAL_ERROR_INTERNAL; + } + + constexpr int32_t smallGroupNum = 2; + + for (int32_t opGroupIdx = 0; opGroupIdx < opGroup; ++opGroupIdx) { + for (int32_t opIdx = 0; opIdx < smallGroupNum; ++opIdx) { + cclBinPtr = nextPtr; + nextPtr = FindNextOpStart(opStartMaigc, cclBinEndPtr, nextPtr); + if (cclBinPtr == nullptr || cclBinPtr == cclBinEndPtr || nextPtr == nullptr) { + return LCAL_ERROR_INTERNAL; + } + } + } + + int ret = 0; + ret = RegistCCLOp1Kernel(cclBinPtr, nextPtr); + if (ret != LCAL_SUCCESS) { + return LCAL_ERROR_INTERNAL; + } + + cclBinPtr = nextPtr; + nextPtr = FindNextOpStart(opStartMaigc, cclBinEndPtr, nextPtr); + if (cclBinPtr == nullptr || cclBinPtr == cclBinEndPtr || nextPtr == nullptr) { + return LCAL_ERROR_INTERNAL; + } + + ret = RegistCCLOp2Kernel(cclBinPtr, nextPtr); + if (ret != LCAL_SUCCESS) { + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +void RegistCoCKernel() +{ + vector registerTypes = { HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16 }; + vector registerCOCTypes = { + { LcalType::MATMUL_ALL_REDUCE }, + { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER}, + }; + + auto cocCceBinStr = LCAL_CCE_BIN_STR + LCAL_CCE_BIN_STR / sizeof(int); + for (auto lcalTypeGroup : registerCOCTypes) { + for (auto lcalType : lcalTypeGroup) { + for (auto t : registerTypes) { + RegisterBinaryKernel(LCAL_TYPE2NAME.at(lcalType) + "_" + DATATYPE2NAME.at(t), GetFunSig(lcalType, t), + cocCceBinStr, COC_RT_DEV_BINARY_MAGIC_ELF); + } + } + cocCceBinStr += LCAL_CCE_BIN_STR / sizeof(int); + } +} + +int RegistKernel(const int32_t opGroup) +{ + static bool init = false; + static mutex mut; + lock_guard guard(mut); + if (init) { + return 0; + } + RegistCoCKernel(); + RegistCCLKernel(opGroup); + init = true; + 
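+    // Registration is once-per-process: the static init flag and mutex above make later LcalComm instances return early instead of re-registering the embedded kernel binary.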
return LCAL_SUCCESS; +} + +int64_t Count2Size(int64_t count, const HcclDataType &dataType) +{ + int64_t dataSize = LCAL_INVALID_VALUE; + if (dataType == HCCL_DATA_TYPE_INT8 || dataType == HCCL_DATA_TYPE_UINT8) { + dataSize = count; + } else if (dataType == HCCL_DATA_TYPE_INT16 || dataType == HCCL_DATA_TYPE_FP16 || + dataType == HCCL_DATA_TYPE_BFP16 || dataType == HCCL_DATA_TYPE_UINT16) { + dataSize = count * sizeof(int16_t); + } else if (dataType == HCCL_DATA_TYPE_FP32 || dataType == HCCL_DATA_TYPE_INT32 || + dataType == HCCL_DATA_TYPE_UINT32) { + dataSize = count * sizeof(int32_t); + } else if (dataType == HCCL_DATA_TYPE_INT64 || dataType == HCCL_DATA_TYPE_UINT64) { + dataSize = count * sizeof(int64_t); + } else { + MKI_LOG(ERROR) << "unknown datatype"; + } + return dataSize; +} + } \ No newline at end of file -- Gitee From 5fb7ccebd52d33f8878870aec7fdc9f34ee777b7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 18 Aug 2025 20:40:55 +0800 Subject: [PATCH 173/414] draft --- comm/lcal/src/lcal_internal.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 7a58757e..50318ca8 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -173,12 +173,12 @@ int RegistCCLKernel(const int32_t opGroup) void RegistCoCKernel() { vector registerTypes = { HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16 }; - vector registerCOCTypes = { + vector> registerCOCTypes = { { LcalType::MATMUL_ALL_REDUCE }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER}, }; - auto cocCceBinStr = LCAL_CCE_BIN_STR + LCAL_CCE_BIN_STR / sizeof(int); + auto cocCceBinStr = LCAL_CCE_BIN_STR + LCAL_1OP_BIN_SIZE / sizeof(int); for (auto lcalTypeGroup : registerCOCTypes) { for (auto lcalType : lcalTypeGroup) { for (auto t : registerTypes) { @@ -186,7 +186,7 @@ void RegistCoCKernel() cocCceBinStr, COC_RT_DEV_BINARY_MAGIC_ELF); } } - cocCceBinStr += LCAL_CCE_BIN_STR / sizeof(int); + cocCceBinStr += LCAL_1OP_BIN_SIZE / sizeof(int); } } -- Gitee From d1625cd4483cc5270f6aed0755089682fbbb50c7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 10:23:02 +0800 Subject: [PATCH 174/414] draft --- comm/lcal/src/lcal_internal.cpp | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 50318ca8..8f43e840 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -223,5 +223,82 @@ int64_t Count2Size(int64_t count, const HcclDataType &dataType) return dataSize; } +int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream) +{ + int error = 0; + MKI_LOG(DEBUG) << "LoadMTE" << LCAL_TYPE2NAME.at(cclType) << " count:" << args.count << " dataType:" << dataType + << " op:" << args.op << " blockDim:" << blockDim << " rootRank:" << args.root + << " magic: " << args.magic; + int64_t dataSize = Count2Size(args.count, dataType); + if (dataSize == LCAL_INVALID_VALUE || blockDim == 0) { + MKI_LOG(ERROR) << ("LoadMTE args are invalid"); + return LCAL_ERROR_PARA_CHECK_FAIL; + } + + static const char *ENV = Mki::GetEnv("LCCL_PARALLEL"); + if (ENV && (string(ENV) == "1" || string(ENV) == "true") && dataSize >= IPC_BUFF_MAX_SIZE) { + MKI_LOG(ERROR) << ("LoadMTE args are invalid. 
From d1625cd4483cc5270f6aed0755089682fbbb50c7 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Tue, 19 Aug 2025 10:23:02 +0800
Subject: [PATCH 174/414] draft

---
 comm/lcal/src/lcal_internal.cpp | 77 +++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp
index 50318ca8..8f43e840 100644
--- a/comm/lcal/src/lcal_internal.cpp
+++ b/comm/lcal/src/lcal_internal.cpp
@@ -223,5 +223,82 @@ int64_t Count2Size(int64_t count, const HcclDataType &dataType)
     return dataSize;
 }
 
+int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream)
+{
+    int error = 0;
+    MKI_LOG(DEBUG) << "LoadMTE" << LCAL_TYPE2NAME.at(cclType) << " count:" << args.count << " dataType:" << dataType
+        << " op:" << args.op << " blockDim:" << blockDim << " rootRank:" << args.root
+        << " magic: " << args.magic;
+    int64_t dataSize = Count2Size(args.count, dataType);
+    if (dataSize == LCAL_INVALID_VALUE || blockDim == 0) {
+        MKI_LOG(ERROR) << ("LoadMTE args are invalid");
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+
+    static const char *ENV = Mki::GetEnv("LCCL_PARALLEL");
+    if (ENV && (string(ENV) == "1" || string(ENV) == "true") && dataSize >= IPC_BUFF_MAX_SIZE) {
+        MKI_LOG(ERROR) << ("LoadMTE args are invalid. because LCCL_PARALLEL is open, and dataSize is too big.");
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+
+    rtTaskCfgInfo_t cfgInfo{};
+    cfgInfo.schemMode = 1;
+
+    rtArgsEx_t argsInfo{};
+    argsInfo.args = &args;
+    argsInfo.argsSize = sizeof(args);
+
+    if (cclType == LcalType::BROADCAST || cclType == LcalType::BANDWIDTH) {
+        args.count = dataSize;
+        error = rtKernelLaunchWithFlagV2(GetFunSig(cclType, HCCL_DATA_TYPE_RESERVED),
+            blockDim, &argsInfo, nullptr, stream, 0, &cfgInfo);
+    } else {
+        error = rtKernelLaunchWithFlagV2(GetFunSig(cclType, dataType),
+            blockDim, &argsInfo, nullptr, stream, 0, &cfgInfo);
+    }
+    if (error != RT_ERROR_NONE) {
+        MKI_LOG(ERROR) << "rtKernelLaunch -:" << LCAL_TYPE2NAME.at(cclType) << to_string(error);
+        return LCAL_ERROR_INTERNAL;
+    }
+    return error;
+}
+
+int LoadMTE(LcalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream)
+{
+    int error = 0;
+    MKI_LOG(DEBUG) << "LoadMTE" << LCAL_TYPE2NAME.at(cclType) << " embTableLen:" << args.embTableLen
+        << " embTableDim:" << args.embTableDim
+        << " loopupLen:" << args.lookupLen;
+
+    rtTaskCfgInfo_t cfgInfo{};
+    cfgInfo.schemMode = 1;
+
+    rtArgsEx_t argsInfo{};
+    argsInfo.args = &args;
+    argsInfo.argsSize = sizeof(args);
+
+    if (cclType == LcalType::GATHER) {
+        error = rtKernelLaunchWithFlagV2(GetFunSig(cclType, dataType),
+            blockDim, &argsInfo, nullptr, stream, 0, &cfgInfo);
+    }
+    if (error != RT_ERROR_NONE) {
+        MKI_LOG(ERROR) << "rtKernelLaunch -:" << to_string(error);
+        return LCAL_ERROR_INTERNAL;
+    }
+    return error;
+}
+
+template <typename T, typename M>
+size_t OffsetOf(M T::*member, T obj)
+{
+    return reinterpret_cast<size_t>(&(obj.*member)) - reinterpret_cast<size_t>(&obj);
+}
+
+int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dataType, aclrtStream stream)
+{
+    int error = LCAL_SUCCESS;
+
+    // size_t tilingAddrOffset = OffsetOf(&CoCKernelArgs::pCoc)
+}
 }
\ No newline at end of file
-- Gitee

From c00957b225b9494d9d341ef32c0ef20c728fe901 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Tue, 19 Aug 2025 10:32:05 +0800
Subject: [PATCH 175/414] draft

---
 comm/lcal/src/lcal_internal.cpp | 34 +++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp
index 8f43e840..76e70346 100644
--- a/comm/lcal/src/lcal_internal.cpp
+++ b/comm/lcal/src/lcal_internal.cpp
@@ -298,7 +298,37 @@ int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dat
 {
     int error = LCAL_SUCCESS;
 
-    // size_t tilingAddrOffset = OffsetOf(&CoCKernelArgs::pCoc)
-}
+    size_t tilingAddrOffset = OffsetOf(&CoCKernelArgs::pCocTiling, kernelArgs);
+    size_t tilingDataOffset = OffsetOf(&CoCKernelArgs::cocKernelParam, kernelArgs +
+        OffsetOf(&CoCKernelArgs::cocTilingData, kernelArgs.cocKernelParam));
+
+    auto &cocTilingData = kernelArgs.cocKernelParam.cocTilingData;
+    if (cocTilingData.withSerialMode != 0) {
+        static std::vector<int32_t> serialTags(LCAL_MAX_RANK_SIZE, 1);
+        cocTilingData.tag = serialTags[cocTilingData.rank];
+        serialTags[cocTilingData.rank] = serialTags[cocTilingData.rank] % TAG_MOD + 1;
+    }
+
+    rtTaskCfgInfo_t cfgInfo{};
+    cfgInfo.schemMode = 1;
+    rtArgsEx_t argsInfo{};
+    argsInfo.args = static_cast<void *>(&kernelArgs);
+    argsInfo.hostInputInfoPtr = nullptr;
+    argsInfo.argsSize = sizeof(kernelArgs);
+    argsInfo.tilingAddrOffset = tilingAddrOffset;
+    argsInfo.tilingDataOffset = tilingDataOffset;
+    argsInfo.hostInputInfoNum = 0;
+    argsInfo.hasTiling = 1;
+    argsInfo.isNoNeedH2DCopy = 0;
+
+    error = rtKernelLaunchWithFlagV2(GetFunSig(cocType, dataType),
+        kernelArgs.cocKernelParam.cocTilingData.blockDim,
+        &argsInfo, nullptr, stream, 0, &cfgInfo);
+    if (error != RT_ERROR_NONE) {
+        MKI_LOG(ERROR) << "rtKernelLaunch -:" << to_string(error);
+        return LCAL_ERROR_INTERNAL;
+    }
+    return error;
+}
 }
\ No newline at end of file
-- Gitee
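[Editor's note: illustrative sketch, not one of the series' patches. The
OffsetOf helper added in PATCH 174, and used above to compute the tiling
offsets, recomputes a member offset the way the standard offsetof macro does,
but from a member pointer. Demo is a hypothetical struct used only for the
check:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    template <typename T, typename M>
    size_t OffsetOf(M T::*member, T obj)
    {
        return reinterpret_cast<size_t>(&(obj.*member)) - reinterpret_cast<size_t>(&obj);
    }

    struct Demo {
        int64_t a;
        int32_t b;
    };

    int main()
    {
        Demo d{};
        assert(OffsetOf(&Demo::b, d) == offsetof(Demo, b)); // 8 on typical ABIs
        return 0;
    }
]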
From 495d4082990d6b69f28608f96277c7d7af7e266b Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Tue, 19 Aug 2025 10:40:47 +0800
Subject: [PATCH 176/414] fix

---
 comm/lcal/src/lcal_internal.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp
index 76e70346..c0ad71dd 100644
--- a/comm/lcal/src/lcal_internal.cpp
+++ b/comm/lcal/src/lcal_internal.cpp
@@ -226,9 +226,9 @@ int64_t Count2Size(int64_t count, const HcclDataType &dataType)
 int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream)
 {
     int error = 0;
-    MKI_LOG(DEBUG) << "LoadMTE" << LCAL_TYPE2NAME.at(cclType) << " count:" << args.count << " dataType:" << dataType
+    MKI_LOG(DEBUG) << "LoadMTE " << LCAL_TYPE2NAME.at(cclType) << " count:" << args.count << " dataType:" << dataType
         << " op:" << args.op << " blockDim:" << blockDim << " rootRank:" << args.root
-        << " magic: " << args.magic;
+        << ", magic: " << args.magic;
     int64_t dataSize = Count2Size(args.count, dataType);
     if (dataSize == LCAL_INVALID_VALUE || blockDim == 0) {
         MKI_LOG(ERROR) << ("LoadMTE args are invalid");
@@ -237,7 +237,7 @@ int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, Hccl
 
     static const char *ENV = Mki::GetEnv("LCCL_PARALLEL");
     if (ENV && (string(ENV) == "1" || string(ENV) == "true") && dataSize >= IPC_BUFF_MAX_SIZE) {
-        MKI_LOG(ERROR) << ("LoadMTE args are invalid. because LCCL_PARALLEL is open, and dataSize is too big.");
+        MKI_LOG(ERROR) << ("LoadMTE args are invalid, because LCCL_PARALLEL is open, and dataSize is too big.");
         return LCAL_ERROR_PARA_CHECK_FAIL;
     }
 
@@ -257,8 +257,8 @@ int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, Hccl
         blockDim, &argsInfo, nullptr, stream, 0, &cfgInfo);
     }
     if (error != RT_ERROR_NONE) {
-        MKI_LOG(ERROR) << "rtKernelLaunch -:" << LCAL_TYPE2NAME.at(cclType) << to_string(error);
-        return LCAL_ERROR_INTERNAL;
+        MKI_LOG(ERROR) << "AsdRtFunctionLaunch -:" << LCAL_TYPE2NAME.at(cclType) << to_string(error);
+        return LCAL_ERROR_MKIRT;
     }
     return error;
 }
@@ -266,9 +266,9 @@ int LoadMTE(LcalType cclType, AscendCCLKernelArgs &args, uint32_t blockDim, Hccl
 int LoadMTE(LcalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataType dataType, aclrtStream stream)
 {
     int error = 0;
-    MKI_LOG(DEBUG) << "LoadMTE" << LCAL_TYPE2NAME.at(cclType) << " embTableLen:" << args.embTableLen
+    MKI_LOG(DEBUG) << "LoadMTE " << LCAL_TYPE2NAME.at(cclType) << " embTableLen:" << args.embTableLen
         << " embTableDim:" << args.embTableDim
-        << " loopupLen:" << args.lookupLen;
+        << " lookupLen:" << args.lookupLen;
 
     rtTaskCfgInfo_t cfgInfo{};
     cfgInfo.schemMode = 1;
@@ -282,8 +282,8 @@ int LoadMTE(LcalType cclType, CCLGatherArgs &args, uint32_t blockDim, HcclDataTy
         blockDim, &argsInfo, nullptr, stream, 0, &cfgInfo);
     }
     if (error != RT_ERROR_NONE) {
-        MKI_LOG(ERROR) << "rtKernelLaunch -:" << to_string(error);
-        return LCAL_ERROR_INTERNAL;
+        MKI_LOG(ERROR) << "AsdRtFunctionLaunch -:" << to_string(error);
+        return LCAL_ERROR_MKIRT;
     }
     return error;
 }
@@ -299,8 +299,8 @@ int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dat
     int error = LCAL_SUCCESS;
 
     size_t tilingAddrOffset = OffsetOf(&CoCKernelArgs::pCocTiling, kernelArgs);
-    size_t tilingDataOffset = OffsetOf(&CoCKernelArgs::cocKernelParam, kernelArgs +
-        OffsetOf(&CoCKernelArgs::cocTilingData, kernelArgs.cocKernelParam));
+    size_t tilingDataOffset = OffsetOf(&CoCKernelArgs::cocKernelParam, kernelArgs) +
+        OffsetOf(&CoCKernelParam::cocTilingData, kernelArgs.cocKernelParam);
 
     auto &cocTilingData = kernelArgs.cocKernelParam.cocTilingData;
     if (cocTilingData.withSerialMode != 0) {
         static std::vector<int32_t> serialTags(LCAL_MAX_RANK_SIZE, 1);
@@ -326,7 +326,7 @@ int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dat
         kernelArgs.cocKernelParam.cocTilingData.blockDim,
         &argsInfo, nullptr, stream, 0, &cfgInfo);
     if (error != RT_ERROR_NONE) {
-        MKI_LOG(ERROR) << "rtKernelLaunch -:" << to_string(error);
+        MKI_LOG(ERROR) << "AsdRtFunctionLaunch -:" << to_string(error);
         return LCAL_ERROR_INTERNAL;
     }
     return error;
-- Gitee

From 04a7364f3837e1a0ec7121d1368b17a5b35b3f68 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Tue, 19 Aug 2025 10:41:42 +0800
Subject: [PATCH 177/414] fix

---
 comm/lcal/src/lcal_internal.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp
index c0ad71dd..fd608c5f 100644
--- a/comm/lcal/src/lcal_internal.cpp
+++ b/comm/lcal/src/lcal_internal.cpp
@@ -327,7 +327,7 @@ int ComputeOverComm(LcalType cocType, CoCKernelArgs kernelArgs, HcclDataType dat
         &argsInfo, nullptr, stream, 0, &cfgInfo);
     if (error != RT_ERROR_NONE) {
         MKI_LOG(ERROR) << "AsdRtFunctionLaunch -:" << to_string(error);
-        return LCAL_ERROR_INTERNAL;
+        return LCAL_ERROR_MKIRT;
     }
     return error;
 }
-- Gitee

From 452ed556e4e6f65aa2d037535a465a5cd3cfe170 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Tue, 19 Aug 2025 11:20:44 +0800
Subject: [PATCH 
178/414] draft --- comm/lcal/src/lcal_wrap.cpp | 278 ++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 comm/lcal/src/lcal_wrap.cpp diff --git a/comm/lcal/src/lcal_wrap.cpp b/comm/lcal/src/lcal_wrap.cpp new file mode 100644 index 00000000..84dba12c --- /dev/null +++ b/comm/lcal/src/lcal_wrap.cpp @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include +#include +#include +#include "mki/utils/log/log.h" +#include "lcal.h" +#include "tools/socket/lcal_socket_exchange.h" + +using namespace std; +using namespace Lcal; + +int LcalCommInitRankLocal(int rankSize, int rank, LcalCommPtr *comm) +{ + MKI_LOG(INFO) << "using lcal c++ api! rank" << rank; + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm ptr is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + auto *c = new (std::nothrow) LcalCommLocal(rank, rankSize); + if (c == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; + return LCAL_ERROR_INTERNAL; + } + *comm = c; + int ret = c->Init(); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "lccl init failed!" + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalGetUniqueId(LcalUniqueId *uniqueId, int commDomain) +{ + if (uniqueId == nullptr) { + MKI_LOG(ERROR) << "uniqueId is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + int res = BootstrapGetUniqueId(reinterpret_cast(uniqueId), commDomain); + if (res != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "lcal BootstrapGetUniqueId failed!"; + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalCommInitRank(LcalUniqueId commId, int rankSize, int rank, LcalCommPtr *comm) +{ + MKI_LOG(INFO) << "using lcal c++ api! rank" << rank; + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm ptr is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + auto *c = new (std::nothrow) LcalComm(rank, rankSize, commId); + if (c == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; + return LCAL_ERROR_INTERNAL; + } + *comm = c; + int ret = c->Init(); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "lccl init failed!" + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalCommInitRankWithCustDomainSize(int commDomain, int bufferSize, int rankSize, int rank, LcalCommPtr *comm, + const bool isEnableAutoMagicNum) +{ + MKI_LOG(INFO) << "using lcal c++ api! 
rank" << rank << ", rankSize : " << rankSize << ", commDomain : " << + commDomain << ", bufferSize : " << bufferSize << ", isEnableAutoMagicNum : " << isEnableAutoMagicNum; + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm ptr is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + + constexpr int minBufferSize = LCAL_COMM_BUFFER_SIZE; + if (bufferSize < minBufferSize) { + MKI_LOG(ERROR) << "lcal comm buffer size " << bufferSize << " MBytes should not be less than " << + minBufferSize << " MBytes."; + return LCAL_ERROR_INTERNAL; + } + + auto *c = new (std::nothrow) LcalComm(rank, rankSize, commDomain, bufferSize, isEnableAutoMagicNum); + if (c == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize << ", commDomain : " << + commDomain << ", bufferSize : " << bufferSize << ", isEnableAutoMagicNum : " << isEnableAutoMagicNum; + return LCAL_ERROR_INTERNAL; + } + *comm = c; + int ret = c->Init(); + if (ret != LCAL_SUCCESS) { + MKI_LOG(ERROR) << "lccl init failed!" + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalCommInitRankWithDomain(int commDomain, int rankSize, int rank, LcalCommPtr *comm) +{ + constexpr int minBufferSize = LCAL_COMM_BUFFER_SIZE; + return LcalCommInitRankWithCustDomainSize(commDomain, minBufferSize, rankSize, rank, comm); +} + +int LcalGetCommArgsDev(LcalCommPtr comm, GM_ADDR &commArgsPtr) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + auto *lccl = static_cast(comm); + commArgsPtr = lccl->GetCommArgsPtr(); + return LCAL_SUCCESS; +} + +int LcalGetCommArgsDev(LcalCommPtr comm, GM_ADDR &commArgsPtr) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + auto *c = static_cast(comm); + commArgsPtr = c->GetCommArgs(); + return LCAL_SUCCESS; +} + +void LcalPrintDFX2Log(LcalCommPtr comm) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "lcal comm is nullptr!"; + return; + } + auto *lcal = static_cast(comm); + MKI_LOG(INFO) << lcal->PrintDFX(); +} + +int LcalCommInit(int rank, int rankSize, LcalCommPtr *comms) +{ + if (comms == nullptr) { + MKI_LOG(ERROR) << "lcal comms ptr is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + *comms = new (std::nothrow) LcalComm(rank, rankSize); + if (*comms == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; + return LCAL_ERROR_INTERNAL; + } + return LCAL_SUCCESS; +} + +int LcalCommInitAll(uint32_t ndev, int32_t* devices, LcalCommPtr *comms) +{ + if (comms == nullptr) { + MKI_LOG(ERROR) << "lcal comms is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + if (devices == nullptr) { + MKI_LOG(ERROR) << "lcal devices is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + static int commDomain = 0; + commDomain++; + for (uint32_t i = 0; i < ndev; ++i) { + comms[i] = new (std::nothrow) LcalComm(i, ndev, commDomain, LCAL_COMM_BUFFER_SIZE, false); + if (comms[i] == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. 
dev : " << i << ", rankSize : " << ndev; + return LCAL_ERROR_INTERNAL; + } + } + static atomic uid; + uid++; + vector> threads; + int error = LCAL_SUCCESS; + for (uint32_t r = 0; r < ndev; r++) { + threads.emplace_back(make_unique( + [&](int rank) { + aclrtSetDevice(devices[rank]); + auto *c = static_cast(comms[rank]); + int ret = c->InitThread("uid" + to_string(uid)); + if (ret != LCAL_SUCCESS) { + error = ret; + } + }, + r)); + } + for (auto &t : threads) { + t->join(); + } + threads.clear(); + return error; +} + +int LcalCommInitThread(int rank, int rankSize, const char *uid, LcalCommPtr *comms) +{ + if (uid == nullptr) { + MKI_LOG(ERROR) << "lcal uid is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + if (comms == nullptr) { + MKI_LOG(ERROR) << "lcal comms is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + if (rank > rankSize) { + MKI_LOG(ERROR) << "lcal rank : " << rank << " rankSize : " << rankSize; + return LCAL_ERROR_INTERNAL; + } + *comms = new (std::nothrow) LcalComm(rank, rankSize); + if (*comms == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; + return LCAL_ERROR_INTERNAL; + } + auto *c = static_cast(*comms); + return c->InitThread(string(uid)); +} + +int LcclAllReduce(void *sendBuf, void *recvBuf, int64_t count, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "LcclAllReduce comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + Lccl lccl(static_cast(comm)); + return lccl.AllReduce(sendBuf, recvBuf, count, dataType, op, stream); +} + +int LcclAllGather(void *sendBuf, void *recvBuf, int64_t sendCount, HcclDataType dataType, LcalCommPtr comm, + aclrtStream stream) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "LcclAllGather comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + Lccl lccl(static_cast(comm)); + return lccl.AllGather(sendBuf, recvBuf, sendCount, dataType, stream); +} + +int LcclReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "LcclReduceScatter comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + Lccl lccl(static_cast(comm)); + return lccl.ReduceScatter(sendBuf, recvBuf, recvCount, dataType, op, stream); +} + +int LcclBroadcast(void *buf, int64_t count, HcclDataType dataType, int root, LcalCommPtr comm, + aclrtStream stream) +{ + if (comm == nullptr) { + MKI_LOG(ERROR) << "LcclBroadcast comm is nullptr!"; + return LCAL_ERROR_INTERNAL; + } + Lccl lccl(static_cast(comm)); + return lccl.Broadcast(buf, count, dataType, root, stream); +} + +int LcclCommDestroy(LcalCommPtr comm) +{ + if (comm == nullptr) { + return LCAL_INVALID_VALUE; + } + auto *c = static_cast(comm); + delete c; + return LCAL_SUCCESS; +} \ No newline at end of file -- Gitee From 554688b3114625ecdd4b7b295da1a4163bef51e4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 11:24:26 +0800 Subject: [PATCH 179/414] draft --- comm/lcal/src/lcal_wrap.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/lcal_wrap.cpp b/comm/lcal/src/lcal_wrap.cpp index 84dba12c..0bf96fec 100644 --- a/comm/lcal/src/lcal_wrap.cpp +++ b/comm/lcal/src/lcal_wrap.cpp @@ -13,7 +13,7 @@ #include #include "mki/utils/log/log.h" #include "lcal.h" -#include "tools/socket/lcal_socket_exchange.h" +#include "tools/socket/lcal_sock_exchange.h" using namespace std; using namespace Lcal; @@ 
-25,7 +25,7 @@ int LcalCommInitRankLocal(int rankSize, int rank, LcalCommPtr *comm) MKI_LOG(ERROR) << "lcal comm ptr is nullptr!"; return LCAL_ERROR_INTERNAL; } - auto *c = new (std::nothrow) LcalCommLocal(rank, rankSize); + auto *c = new (std::nothrow) LcalComm(rank, rankSize); if (c == nullptr) { MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; return LCAL_ERROR_INTERNAL; @@ -33,7 +33,7 @@ int LcalCommInitRankLocal(int rankSize, int rank, LcalCommPtr *comm) *comm = c; int ret = c->Init(); if (ret != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "lccl init failed!" + MKI_LOG(ERROR) << "lccl init failed!"; return LCAL_ERROR_INTERNAL; } return LCAL_SUCCESS; @@ -68,7 +68,7 @@ int LcalCommInitRank(LcalUniqueId commId, int rankSize, int rank, LcalCommPtr *c *comm = c; int ret = c->Init(); if (ret != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "lccl init failed!" + MKI_LOG(ERROR) << "lccl init failed!"; return LCAL_ERROR_INTERNAL; } return LCAL_SUCCESS; @@ -77,8 +77,8 @@ int LcalCommInitRank(LcalUniqueId commId, int rankSize, int rank, LcalCommPtr *c int LcalCommInitRankWithCustDomainSize(int commDomain, int bufferSize, int rankSize, int rank, LcalCommPtr *comm, const bool isEnableAutoMagicNum) { - MKI_LOG(INFO) << "using lcal c++ api! rank" << rank << ", rankSize : " << rankSize << ", commDomain : " << - commDomain << ", bufferSize : " << bufferSize << ", isEnableAutoMagicNum : " << isEnableAutoMagicNum; + MKI_LOG(INFO) << "using lcal c++ api! rank : " << rank << ", rankSize : " << rankSize << ", commDomain:" << + commDomain << ", bufferSize:" << bufferSize << ", isEnableAutoMagicNum:" << isEnableAutoMagicNum; if (comm == nullptr) { MKI_LOG(ERROR) << "lcal comm ptr is nullptr!"; return LCAL_ERROR_INTERNAL; @@ -87,20 +87,20 @@ int LcalCommInitRankWithCustDomainSize(int commDomain, int bufferSize, int rankS constexpr int minBufferSize = LCAL_COMM_BUFFER_SIZE; if (bufferSize < minBufferSize) { MKI_LOG(ERROR) << "lcal comm buffer size " << bufferSize << " MBytes should not be less than " << - minBufferSize << " MBytes."; + minBufferSize << " MBytes!"; return LCAL_ERROR_INTERNAL; } auto *c = new (std::nothrow) LcalComm(rank, rankSize, commDomain, bufferSize, isEnableAutoMagicNum); if (c == nullptr) { - MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize << ", commDomain : " << - commDomain << ", bufferSize : " << bufferSize << ", isEnableAutoMagicNum : " << isEnableAutoMagicNum; + MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize << ", commDomain:" << + commDomain << ", bufferSize:" << bufferSize << ", isEnableAutoMagicNum:" << isEnableAutoMagicNum; return LCAL_ERROR_INTERNAL; } *comm = c; int ret = c->Init(); if (ret != LCAL_SUCCESS) { - MKI_LOG(ERROR) << "lccl init failed!" 
+ MKI_LOG(ERROR) << "lccl init failed!"; return LCAL_ERROR_INTERNAL; } return LCAL_SUCCESS; @@ -123,7 +123,7 @@ int LcalGetCommArgsDev(LcalCommPtr comm, GM_ADDR &commArgsPtr) return LCAL_SUCCESS; } -int LcalGetCommArgsDev(LcalCommPtr comm, GM_ADDR &commArgsPtr) +int LcalGetCommArgsHost(LcalCommPtr comm, Lcal::CommArgs *&commArgsPtr) { if (comm == nullptr) { MKI_LOG(ERROR) << "lcal comm is nullptr!"; @@ -147,7 +147,7 @@ void LcalPrintDFX2Log(LcalCommPtr comm) int LcalCommInit(int rank, int rankSize, LcalCommPtr *comms) { if (comms == nullptr) { - MKI_LOG(ERROR) << "lcal comms ptr is nullptr!"; + MKI_LOG(ERROR) << "lcal comms is nullptr!"; return LCAL_ERROR_INTERNAL; } *comms = new (std::nothrow) LcalComm(rank, rankSize); @@ -158,7 +158,7 @@ int LcalCommInit(int rank, int rankSize, LcalCommPtr *comms) return LCAL_SUCCESS; } -int LcalCommInitAll(uint32_t ndev, int32_t* devices, LcalCommPtr *comms) +int LcalCommInitAll(uint32_t ndev, int32_t *devices, LcalCommPtr *comms) { if (comms == nullptr) { MKI_LOG(ERROR) << "lcal comms is nullptr!"; @@ -173,7 +173,7 @@ int LcalCommInitAll(uint32_t ndev, int32_t* devices, LcalCommPtr *comms) for (uint32_t i = 0; i < ndev; ++i) { comms[i] = new (std::nothrow) LcalComm(i, ndev, commDomain, LCAL_COMM_BUFFER_SIZE, false); if (comms[i] == nullptr) { - MKI_LOG(ERROR) << "LcalComm create failed. dev : " << i << ", rankSize : " << ndev; + MKI_LOG(ERROR) << "LcalComm create failed. dev : " << i << ", ndev : " << ndev; return LCAL_ERROR_INTERNAL; } } @@ -210,12 +210,12 @@ int LcalCommInitThread(int rank, int rankSize, const char *uid, LcalCommPtr *com MKI_LOG(ERROR) << "lcal comms is nullptr!"; return LCAL_ERROR_INTERNAL; } - if (rank > rankSize) { + if (rank >= rankSize) { MKI_LOG(ERROR) << "lcal rank : " << rank << " rankSize : " << rankSize; return LCAL_ERROR_INTERNAL; } *comms = new (std::nothrow) LcalComm(rank, rankSize); - if (*comms == nullptr) { + if (*comms == nullptr) { MKI_LOG(ERROR) << "LcalComm create failed. rank : " << rank << ", rankSize : " << rankSize; return LCAL_ERROR_INTERNAL; } -- Gitee From 157610e116fe7b1f535d72bf89537bb92f6943e7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 11:56:37 +0800 Subject: [PATCH 180/414] draft --- comm/lcal/src/lccl.cpp | 25 +++++ comm/lcal/src/profiling/report_timing.h | 131 ++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 comm/lcal/src/lccl.cpp create mode 100644 comm/lcal/src/profiling/report_timing.h diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp new file mode 100644 index 00000000..81c15e17 --- /dev/null +++ b/comm/lcal/src/lccl.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "lccl.h" +#include "lcal_internal.h" + +#include +#include +#include +#include + +#include +#include + +#include "profiling/report_timing.h" + +using namespace std; +using namespace chrono; +using namespace Mki; diff --git a/comm/lcal/src/profiling/report_timing.h b/comm/lcal/src/profiling/report_timing.h new file mode 100644 index 00000000..94e99e53 --- /dev/null +++ b/comm/lcal/src/profiling/report_timing.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef REPORT_TIMING_H +#define REPORT_TIMING_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Lcal { +class ReportTiming { +public: + static constexpr uint64_t PROF_TASK_TIME_DUMP = 0x000100000000ULL; + ReportTiming() = delete; + explicit ReportTiming(const char *opName, int commDomain, int64_t count =0, + HcclDataType dataType = HCCL_DATA_TYPE_RESERVED) + : opName_(opName), typeMix_(false), count_(count), dataType_(dataType) + { + InitProfiling(commDomain); + } + + explicit ReportTiming(const char *opName, uint32_t blockDim) + : opName_(opName), blockDim_(blockDim), typeMix_(true) + { + InitProfiling(0); + } + + explicit ReportTiming(const char *opName, const int32_t rankId, const bool isReporting, uint8_t *dumpAddr, + const aclrtStream stream) : opName_(opName), rankId_(rankId), isReporting_(isReporting), + dumpAddr_(dumpAddr), stream_(stream) + { + moduleId_ = DUMP_MODULE_ID; + InitProfiling(0); + } + + ~ReportTiming() + { + MKI_LOG(DEBUG) << "ReportTiming " << __LINE__ << " ~ReportTiming() " << + " isReporting_:" << isReporting_ << " profEnable_:" << profEnable_; + if (profEnable_ && isReporting_) { + ReportMsprofData(); + } + + if (!isReporting_) { + ProfilingStatus(RESET_STATUS); + } + } + + void InitProfiling(int commDomain) + { + if (ProfilingStatus() == -1) { + ProfilingStatus(0); + MKI_LOG(INFO) << "MsprofRegisterCallback start!"; + if (MsprofRegisterCallback(moduleId_, ProfHandle) != 0) { + MKI_LOG(ERROR) << "MsprofRegisterCallback fail!"; + } + } + + MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << "ProfilingStatus()" << ProfilingStatus() << + " isReporting_:" << isReporting_; + if (ProfilingStatus() > 0) { + ParamsInit(commDomain); + } + MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << "ProfilingStatus()" << ProfilingStatus() << + " isReporting_:" << isReporting_ << " profEnable_:" << profEnable_; + } + + + + +private: + static constexpr uint64_t PROF_TASK_TIME_L0 = 0x00000800ULL; + static constexpr uint64_t PROF_TASK_TIME_L1 = 0x00000002ULL; + static constexpr int32_t DUMP_MODULE_ID = 61; + static constexpr int32_t RESET_STATUS = -2; + uint64_t beginTime_ = 0; + uint16_t endTime_ = 0; + const char *opName_ = nullptr; + uint32_t blockDim_ = 0; + uint64_t nameHash_ = 0; + uint64_t groupHash_ = 0; + uint64_t naHash_ = 0; + bool typeMix_ = false; + long tid_ = 0; + bool profEnable_ = false; + int64_t count_ = 0; + uint8_t dataType = 
HCCL_DATA_TYPE_RESERVED; + int32_t rankId_ = 0; + bool isReporting_ = true; + uint8_t *dumpAddr_ = nullptr; + aclrtStream stream_ = nullptr; + int32_t moduleId_ = INVALID_MODULE_ID; +}; + + + +} + + + + + + + + + + + + + + + + + +#endif \ No newline at end of file -- Gitee From 5fd9eb66719d92ec6f16c72afa2063419eb7a383 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 16:07:14 +0800 Subject: [PATCH 181/414] draft --- comm/lcal/src/profiling/report_timing.h | 262 ++++++++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/comm/lcal/src/profiling/report_timing.h b/comm/lcal/src/profiling/report_timing.h index 94e99e53..acaa8799 100644 --- a/comm/lcal/src/profiling/report_timing.h +++ b/comm/lcal/src/profiling/report_timing.h @@ -81,8 +81,270 @@ public: " isReporting_:" << isReporting_ << " profEnable_:" << profEnable_; } + static int64_t ProfilingStatus(int64_t setValue = -1) + { + static int64_t profilingStatus = -1; + if (setValue == RESET_STATUS) { + profilingStatus = -1; + } else if (setValue != -1) { + profilingStatus = setValue; + } + return profilingStatus; + } + + void ParamsInit(int commDomain) + { + profEnable_ = true; + std::string groupName = std::to_string(commDomain); + groupHash_ = MsprofGethashId(groupName.c_str(), strlen(groupName.c_str())); + + std::string naStr = "NA"; + naHash_ = MsprofGethashId(naStr.c_str(), strlen(naStr.c_str())); + + nameHash_ = MsprofGethashId(opName_, strlen(opName_)); + beginTime_ = MsprofSysCycleTime(); + } + + void ReportMsprofData() + { + tid_ = GetCurrentThreadId(); + if (tid_ == -1) { + MKI_LOG(ERROR) << "GetCurrentThreadId error!" << " name: " << opName_; + return; + } + endTime_ = MsprofSysCycleTime(); + + MKI_LOG(DEBUG) << "ReportMsprofData " << ProfilingStatus() << " dumpAddr_ is " << + (dumpAddr_ == nullptr ? 
"" : "not") << " nullptr "; + + if (ProfilingStatus() != PROF_TASK_TIME_DUMP || dumpAddr_ == nullptr) { + CallMsprofReportHostNodeApi(); + CallMsprofReportHostLcclOpApi(); + CallMsprofReportHostLcclOpInfo(); + CallMsprofReportHostNodeBasicInfo(); + CallMsprofReportContextIdInfo(); + } else { + CallMsprofReportDumpApi(); + } + } + + void CallMsprofReportDumpApi() const + { + constexpr uint32_t dumpCoreCnt = 75; + constexpr uint32_t dumpSizePerCore = 1 * 1024 * 1024; + constexpr uint32_t dumpWorkspaceSize = dumpCoreCnt * dumpSizePerCore; + + MKI_LOG(DEBUG) << "LcclReporting dump rankId " << rankId_; + uint8_t *devProfData = dumpAddr_; + size_t profLen = dumpWorkspaceSize; + + std::vector buffer(profLen, 0); + int ret = 0; + ret = aclrtMemcpyAsync(&buffer[0], profLen, devProfData, profLen, ACL_MEMCPY_DEVICE_TO_HOST, stream_); + if (ret != 0) { + MKI_LOG(ERROR) << "aclrtMemcpyAsync dump data failed"; + } + ret = aclrtSynchronizeStream(stream_); + if (ret != 0) { + MKI_LOG(ERROR) << "aclrtSynchronizeStream dump data failed"; + } + + constexpr int32_t logLimit = 2; + constexpr int32_t logFirstLimit = 10; + constexpr int32_t profLevel = 3000; + MsprofAdditionalInfo t; + t.level = profLevel; + t.type = 0; + t.threadId = 0; + t.dataLen = sizeof(LcclDumpLogInfo); + t.timeStamp = 0; + for (uint32_t coreId = 0; coreId < dumpCoreCnt; ++coreId) { + LcclDumpUnion *u = reinterpret_cast(&buffer[coreId * dumpSizePerCore]); + LcclDumpBlockInfo *b = &(u->blockInfo); + LcclDumpLogInfo *l = &((u + 1)->logInfo); + + int32_t logLen = (dumpSizePerCore - b->dumpOffset) / sizeof(LcclDumpUnion) - 1; + for (int32_t logInfoIdx = 0; logInfoIdx < logLen; ++logInfoIdx) { + LcclDumpLogInfo *logInfo = l + logInfoIdx; + auto ret = memcpy_s(t.data, sizeof(LcclDumpLogInfo), logInfo, sizeof(LcclDumpLogInfo)); + if (ret != 0) { + MKI_LOG(ERROR) << "LcclReporting report memcpy_s err " << ret; + } + if ((logInfoIdx < logLimit) || (logInfoIdx < logFirstLimit && rankId_ == 0 && coreId == 0)) { + MKI_LOG(DEBUG) << "LcclReporting report: rankId=" << rankId_ << ", coreId=" << coreId << + ", curLog=" << logInfoIdx << "/" << logLen << + "; LcclDumpLogInfo: logId=" << logInfo->logId << ", blockId=" << logInfo->blockId << + ", syscyc=" << logInfo->syscyc << ", curPc=" << logInfo->curPc << + ", operationType=" << logInfo->operationType; + } + MsprofReportAdditionalInfo(0, &t, sizeof(MsprofAdditionalInfo)); + } + } + } + + void CallMsprofReportHostNodeApi() const + { + MsprofApi reporterData{}; + reporterData.level = MSPROF_REPORT_NODE_LEVEL; + reporterData.type = MSPROF_REPORT_NODE_LAUNCH_TYPE; + reporterData.threadId = static_cast(tid_); + reporterData.beginTime = beginTime_; + reporterData.endTime = endTime_; + reporterData.itemId = namehash_; + + auto ret = MsprofReportApi(true, &reporterData); + if (ret != 0) { + MKI_LOG(ERROR) << "CallMsprofReportHostNodeApi error! code: " << ret << " name: " << opName_; + } + } + + void CallMsprofReportHostLcclOpApi() const + { + if (typeMix_) { + return; + } + MsprofApi reporterData{}; + reporterData.level = MSPROF_REPORT_HCCL_NODE_LEVEL; + reporterData.type = MSPROF_REPORT_HCCL_MASTER_TYPE; + reporterData.threadId = static_cast(tid_); + reporterData.beginTime = beginTime_; + reporterData.endTime = endTime_; + reporterData.itemId = namehash_; + + auto ret = MsprofReportApi(true, &reporterData); + if (ret != 0) { + MKI_LOG(ERROR) << "CallMsprofReportHostLcclOpApi error! 
code: " << ret << " name: " << opName_; + } + } + void CallMsprofReportHostLcclOpInfo() const + { + if (typeMix_) { + return; + } + MsprofCompactInfo reporterData{}; + reporterData.level = MSPROF_REPORT_NODE_LEVEL; + reporterData.type = MSPROF_REPORT_NODE_HCCL_OP_INFO_TYPE; + reporterData.threadId = static_cast(tid_); + reporterData.dataLen = sizeof(MsprofHCCLOPInfo); + reporterData.timeStamp = beginTime_ + 1; + + reporterData.data.hcclopInfo.relay = 0; + reporterData.data.hcclopInfo.retry = 0; + reporterData.data.hcclopInfo.dataType = dataType_; + reporterData.data.hcclopInfo.algType = naHash_; + reporterData.data.hcclopInfo.count = count_; + reporterData.data.hcclopInfo.groupName = groupHash_; + + auto ret = MsprofReportCompactInfo(static_cast(true), + static_cast(&reporterData), static_cast(sizeof(MsprofCompactInfo))); + if (ret != 0) { + MKI_LOG(ERROR) << "CallMsprofReportHostLcclOpInfo error! code: " << ret << " name: " << opName_; + } + } + + void CallMsprofReportHostNodeBasicInfo() const + { + if (ProfilingStatus() == PROF_TASK_TIME_L0) { + return; + } + MsprofCompactInfo reporterData{}; + + reporterData.level = MSPROF_REPORT_NODE_LEVEL; + reporterData.type = MSPROF_REPORT_NODE_BASIC_INFO_TYPE; + reporterData.threadId = static_cast(tid_); + reporterData.dataLen = sizeof(MsprofNodeBasicInfo); + reporterData.timeStamp = endTime_; + + reporterData.data.nodeBasicInfo.opName = nameHash_; + reporterData.data.nodeBasicInfo.opType = nameHash_; + reporterData.data.nodeBasicInfo.blockDim = ((blockDim_ & 0x0000FFFU) | 0x20000U); + + auto ret = MsprofReportCompactInfo(static_cast(true), + static_cast(&reporterData), + static_cast(sizeof(MsprofCompactInfo))); + if (ret != 0) { + MKI_LOG(ERROR) << "CallMsprofReportHostNodeBasicInfo error! code: " << ret << " name: " << opName_; + } + } + + void CallMsprofReportContextIdInfo() const + { + if (!typeMix_) { + return; + } + + MsprofAdditionalInfo additionalInfo = {}; + additionalInfo.magicNumber = MSPROF_REPORT_DATA_MAGIC_NUM; + additionalInfo.level = MSPROF_REPORT_NODE_LEVEL; + additionalInfo.type = MSPROF_REPORT_NODE_CONTEXT_ID_INFO_TYPE; + additionalInfo.timeStamp = beginTime_ + 1; + additionalInfo.threadId = static_cast(tid_); + additionalInfo.dataLen = sizeof(MsprofContextIdInfo); + + MsprofContextIdInfo info = {}; + info.opName = nameHash_; + info.ctxIdNum = 1; + info.ctxIds[0] = 0; + + int ret = memcpy_s(additionalInfo.data, MSPROF_ADDITIONAL_INFO_DATA_LENGTH, &info, sizeof(MsprofContextIdInfo)); + MKI_LOG_IF(ret != EOK, ERROR) << "memcpy_s Error! Error Code: " << ret; + auto retReport = MsprofReportAdditionalInfo(static_cast(true), + static_cast(&additionalInfo), + static_cast(sizeof(MsprofAdditionalInfo))); + if (retReport != 0) { + MKI_LOG(ERROR) << "ProfReportAdditionalInfo error!" << " name: " << opName_; + } + } + + static int32_t GetCurrentThreadId() + { + int32_t tid = static_cast(syscall(SYS_gettid)); + if (tid == -1) { + MKI_LOG(ERROR) << "get tid failed, errorno: " << errno; + } + return tid; + } + + static int32_t ProfHandle(uint32_t type, void *data, uint32_t len) + { + if (data == nullptr) { + MKI_LOG(ERROR) << "ProfHandle failed! data is nullptr!"; + return -1; + } + if (type != PROF_CTRL_SWITCH) { + MKI_LOG(ERROR) << "ProfHandle failed! ProfCtrlType is not correct!"; + return -1; + } + if (len < sizeof(MsprofCommandHandle)) { + MKI_LOG(ERROR) << "ProfHandle failed! 
dataSize is not correct!"; + return -1; + } + MsprofCommandHandle *profilerConfig = static_cast(data); + const uint32_t profType = profilerConfig->type; + const uint64_t profSwitch = profilerConfig->profSwitch; + if (profSwitch == PROF_COMMANDHANDLE_TYPE_START) { + MKI_LOG(INFO) << "Open Profiling Switch " << std::hex << profSwitch << std::dec; + if ((profSwitch & PROF_TASK_TIME_L0) != PROF_CTRL_INVALID) { + ProfilingStatus(PROF_TASK_TIME_L0); + MKI_LOG(DEBUG) << "Profiling Level0 Enable"; + } + if ((profSwitch & PROF_TASK_TIME_L1) != PROF_CTRL_INVALID) { + ProfilingStatus(PROF_TASK_TIME_L1); + MKI_LOG(DEBUG) << "Profiling Level1 Enable"; + } + if ((profSwitch & PROF_TASK_TIME_DUMP) != PROF_CTRL_INVALID) { + ProfilingStatus(PROF_TASK_TIME_DUMP); + MKI_LOG(DEBUG) << "Profiling dump Enable"; + } + } + if (profType == PROF_COMMANDHANDLE_TYPE_STOP) { + MKI_LOG(INFO) << "Close Profiling Switch"; + ProfilingStatus(0); + } + return 0; + } private: static constexpr uint64_t PROF_TASK_TIME_L0 = 0x00000800ULL; -- Gitee From aa8f852a27daaef522c425e98e869c93cb9ad4a6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 16:13:05 +0800 Subject: [PATCH 182/414] draft --- comm/lcal/src/profiling/report_timing.h | 42 ++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/profiling/report_timing.h b/comm/lcal/src/profiling/report_timing.h index acaa8799..c6540fba 100644 --- a/comm/lcal/src/profiling/report_timing.h +++ b/comm/lcal/src/profiling/report_timing.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,9 +26,9 @@ namespace Lcal { class ReportTiming { public: - static constexpr uint64_t PROF_TASK_TIME_DUMP = 0x000100000000ULL; + static constexpr uint64_t PROF_TASK_TIME_DUMP = 0x0000100000000ULL; ReportTiming() = delete; - explicit ReportTiming(const char *opName, int commDomain, int64_t count =0, + explicit ReportTiming(const char *opName, int commDomain, int64_t count = 0, HcclDataType dataType = HCCL_DATA_TYPE_RESERVED) : opName_(opName), typeMix_(false), count_(count), dataType_(dataType) { @@ -72,12 +72,12 @@ public: } } - MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << "ProfilingStatus()" << ProfilingStatus() << + MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << " ProfilingStatus():" << ProfilingStatus() << " isReporting_:" << isReporting_; if (ProfilingStatus() > 0) { ParamsInit(commDomain); } - MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << "ProfilingStatus()" << ProfilingStatus() << + MKI_LOG(DEBUG) << "InitProfiling " << __LINE__ << " ProfilingStatus():" << ProfilingStatus() << " isReporting_:" << isReporting_ << " profEnable_:" << profEnable_; } @@ -96,12 +96,12 @@ public: { profEnable_ = true; std::string groupName = std::to_string(commDomain); - groupHash_ = MsprofGethashId(groupName.c_str(), strlen(groupName.c_str())); + groupHash_ = MsprofGetHashId(groupName.c_str(), strlen(groupName.c_str())); std::string naStr = "NA"; - naHash_ = MsprofGethashId(naStr.c_str(), strlen(naStr.c_str())); + naHash_ = MsprofGetHashId(naStr.c_str(), strlen(naStr.c_str())); - nameHash_ = MsprofGethashId(opName_, strlen(opName_)); + nameHash_ = MsprofGetHashId(opName_, strlen(opName_)); beginTime_ = MsprofSysCycleTime(); } @@ -171,10 +171,10 @@ public: MKI_LOG(ERROR) << "LcclReporting report memcpy_s err " << ret; } if ((logInfoIdx < logLimit) || (logInfoIdx < logFirstLimit && rankId_ == 0 && coreId == 0)) { - MKI_LOG(DEBUG) << "LcclReporting report: rankId=" << rankId_ << ", coreId=" << coreId 
<< + MKI_LOG(DEBUG) << "LcclReporting report: rankId=" << rankId_ << ", coreId=" << coreId << ", curLog=" << logInfoIdx << "/" << logLen << "; LcclDumpLogInfo: logId=" << logInfo->logId << ", blockId=" << logInfo->blockId << - ", syscyc=" << logInfo->syscyc << ", curPc=" << logInfo->curPc << + ", syscyc=" << logInfo->syscyc << ", curPc=" << logInfo->curPc << ", operationType=" << logInfo->operationType; } MsprofReportAdditionalInfo(0, &t, sizeof(MsprofAdditionalInfo)); @@ -190,7 +190,7 @@ public: reporterData.threadId = static_cast(tid_); reporterData.beginTime = beginTime_; reporterData.endTime = endTime_; - reporterData.itemId = namehash_; + reporterData.itemId = nameHash_; auto ret = MsprofReportApi(true, &reporterData); if (ret != 0) { @@ -209,7 +209,7 @@ public: reporterData.threadId = static_cast(tid_); reporterData.beginTime = beginTime_; reporterData.endTime = endTime_; - reporterData.itemId = namehash_; + reporterData.itemId = nameHash_; auto ret = MsprofReportApi(true, &reporterData); if (ret != 0) { @@ -222,7 +222,7 @@ public: if (typeMix_) { return; } - MsprofCompactInfo reporterData{}; + MsprofCompactInfo reporterData = {}; reporterData.level = MSPROF_REPORT_NODE_LEVEL; reporterData.type = MSPROF_REPORT_NODE_HCCL_OP_INFO_TYPE; reporterData.threadId = static_cast(tid_); @@ -265,7 +265,7 @@ public: static_cast(sizeof(MsprofCompactInfo))); if (ret != 0) { MKI_LOG(ERROR) << "CallMsprofReportHostNodeBasicInfo error! code: " << ret << " name: " << opName_; - } + } } void CallMsprofReportContextIdInfo() const @@ -287,7 +287,7 @@ public: info.ctxIdNum = 1; info.ctxIds[0] = 0; - int ret = memcpy_s(additionalInfo.data, MSPROF_ADDITIONAL_INFO_DATA_LENGTH, &info, sizeof(MsprofContextIdInfo)); + int ret = memcpy_s(additionalInfo.data, MSPROF_ADDTIONAL_INFO_DATA_LENGTH, &info, sizeof(MsprofContextIdInfo)); MKI_LOG_IF(ret != EOK, ERROR) << "memcpy_s Error! Error Code: " << ret; auto retReport = MsprofReportAdditionalInfo(static_cast(true), @@ -302,7 +302,7 @@ public: { int32_t tid = static_cast(syscall(SYS_gettid)); if (tid == -1) { - MKI_LOG(ERROR) << "get tid failed, errorno: " << errno; + MKI_LOG(ERROR) << "get tid failed, errno: " << errno; } return tid; } @@ -321,10 +321,10 @@ public: MKI_LOG(ERROR) << "ProfHandle failed! 
dataSize is not correct!"; return -1; } - MsprofCommandHandle *profilerConfig = static_cast(data); + MsprofCommandHandle *profilerConfig = static_cast(data); const uint32_t profType = profilerConfig->type; const uint64_t profSwitch = profilerConfig->profSwitch; - if (profSwitch == PROF_COMMANDHANDLE_TYPE_START) { + if (profType == PROF_COMMANDHANDLE_TYPE_START) { MKI_LOG(INFO) << "Open Profiling Switch " << std::hex << profSwitch << std::dec; if ((profSwitch & PROF_TASK_TIME_L0) != PROF_CTRL_INVALID) { ProfilingStatus(PROF_TASK_TIME_L0); @@ -352,7 +352,7 @@ private: static constexpr int32_t DUMP_MODULE_ID = 61; static constexpr int32_t RESET_STATUS = -2; uint64_t beginTime_ = 0; - uint16_t endTime_ = 0; + uint64_tendTime_ = 0; const char *opName_ = nullptr; uint32_t blockDim_ = 0; uint64_t nameHash_ = 0; @@ -362,12 +362,12 @@ private: long tid_ = 0; bool profEnable_ = false; int64_t count_ = 0; - uint8_t dataType = HCCL_DATA_TYPE_RESERVED; + uint8_t dataType_ = HCCL_DATA_TYPE_RESERVED; int32_t rankId_ = 0; bool isReporting_ = true; uint8_t *dumpAddr_ = nullptr; aclrtStream stream_ = nullptr; - int32_t moduleId_ = INVALID_MODULE_ID; + int32_t moduleId_ = INVLID_MOUDLE_ID; }; -- Gitee From 855caea66cf11ad51856ea6fdfddbda9babf0d8e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 16:13:44 +0800 Subject: [PATCH 183/414] draft --- comm/lcal/src/profiling/report_timing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/profiling/report_timing.h b/comm/lcal/src/profiling/report_timing.h index c6540fba..a8f813f4 100644 --- a/comm/lcal/src/profiling/report_timing.h +++ b/comm/lcal/src/profiling/report_timing.h @@ -352,7 +352,7 @@ private: static constexpr int32_t DUMP_MODULE_ID = 61; static constexpr int32_t RESET_STATUS = -2; uint64_t beginTime_ = 0; - uint64_tendTime_ = 0; + uint64_t endTime_ = 0; const char *opName_ = nullptr; uint32_t blockDim_ = 0; uint64_t nameHash_ = 0; -- Gitee From cc53a5e8d4fea9054267a31901ea27246cd5ee36 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 16:14:50 +0800 Subject: [PATCH 184/414] draft --- comm/lcal/src/profiling/report_timing.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/comm/lcal/src/profiling/report_timing.h b/comm/lcal/src/profiling/report_timing.h index a8f813f4..bbcc3e36 100644 --- a/comm/lcal/src/profiling/report_timing.h +++ b/comm/lcal/src/profiling/report_timing.h @@ -369,25 +369,5 @@ private: aclrtStream stream_ = nullptr; int32_t moduleId_ = INVLID_MOUDLE_ID; }; - - - } - - - - - - - - - - - - - - - - - #endif \ No newline at end of file -- Gitee From 02e77587c157ced2960ed026ee2cc5096bea4165 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 19 Aug 2025 16:31:17 +0800 Subject: [PATCH 185/414] draft --- comm/lcal/src/lccl.cpp | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index 81c15e17..83eddac4 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -23,3 +23,46 @@ using namespace std; using namespace chrono; using namespace Mki; + +namespace Lcal { + +uint32_t GetLocalReduceBlockDum(int64_t dataSize) +{ + constexpr int oneDataSize = 190 * 1024; + constexpr int maxBlockDim = 8; + int blockDim = dataSize / oneDataSize + 1; + return blockDim <= maxBlockDim ? 
blockDim : maxBlockDim; +} + +bool GetParallel() +{ + static int parallel = -1; + if (parallel == -1) { + static const char *ENV = Mki::GetEnv("LCCL_PARALLEL"); + parallel = (ENV && (string(ENV) == "1" || string(ENV) == "true")) ? 1 : 0; + MKI_LOG(INFO) << "LCCL_PARALLEL is " << parallel; + } + return static_cast(parallel); +} + +uint32_t GetAllReduceDetermBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extraFlag) +{ + constexpr uint32_t quickOneshotRankSize = 2; + constexpr uint32_t twoBlockNum = 2; + constexpr uint32_t treeBlockNum = 3; + constexpr uint32_t rankSize910a3 = 16; + constexpr uint32_t dbRingBlockNum = 34; + constexpr int64_t smallDataSize = 1 * 1024 * 1024; + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { + + } + + uint32_t blockDim = GetLocalReduceBlockDum(dataSize); + return (rankSize + blockDim - 1) / blockDim; +} + + + + +} \ No newline at end of file -- Gitee From ced6746afd1cd964b52ab4c222792ca609142d0c Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 16:03:25 +0800 Subject: [PATCH 186/414] draft --- comm/lcal/src/lccl.cpp | 439 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 435 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index 83eddac4..d19f509a 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -49,20 +49,451 @@ uint32_t GetAllReduceDetermBlockNum(uint32_t rankSize, int64_t dataSize, uint32_ { constexpr uint32_t quickOneshotRankSize = 2; constexpr uint32_t twoBlockNum = 2; - constexpr uint32_t treeBlockNum = 3; + constexpr uint32_t threeStepNum = 3; constexpr uint32_t rankSize910a3 = 16; constexpr uint32_t dbRingBlockNum = 34; constexpr int64_t smallDataSize = 1 * 1024 * 1024; constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { - + constexpr uint32_t maxAivNum = 40; + const bool isAivNumSupport = ((extraFlag & ExtraFlag::IS_GREATER_THAN_40_AIV) != 0 || + rankSize * threeStepNum <= maxAivNum); + if (rankSize % quickOneshotRankSize == 1 || rankSize == quickOneshotRankSize || + (rankSize << rankSize910a3 && dataSize <= smallDataSize910a3 && isAivNumSupport)) { + return rankSize * threeStepNum; + } else { + return dbRingBlockNum; + } } + if (dataSize < smallDataSize) { + return rankSize * twoBlockNum; + } + return rankSize * threeStepNum; +} + +uint32_t GetAllReduceBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extraFlag) +{ + constexpr uint32_t twoBlockNum = 2; + constexpr uint32_t threeStepNum = 3; + constexpr uint32_t dbRingBlockNum = 34; + constexpr int64_t smallDataSize = 1 * 1024 * 1024; + constexpr uint32_t smallRankSize = 8; + constexpr uint32_t cceSmallDataSize = 2 * 1024 * 1024; + constexpr uint32_t quickOneshotRankSize = 2; + const int64_t quantSmallDataSize = ((extraFlag & ExtraFlag::QUANT_FP16) != 0) ? (smallDataSize / 2) : smallDataSize; + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; + + if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { + return rankSize * twoBlockNum; + } else if ((extraFlag & ExtraFlag::QUANT_FP16) != 0) { + return dataSize <= quantSmallDataSize ? rankSize : rankSize * twoBlockNum; + } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { + return dataSize < cceSmallDataSize ? 
rankSize : (rankSize / twoBlockNum * threeStepNum + twoBlockNum); + } else if ((extraFlag & ExtraFlag::DETERMINISTIC) != 0) { + return GetAllReduceDetermBlockNum(rankSize, dataSize, extraFlag); + } + + if (GetParallel()) { + return rankSize; + } + + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && + (rankSize != quickOneshotRankSize)) { + return rankSize % quickOneshotRankSize == 0 ? dbRingBlockNum : rankSize * threeStepNum; + } + return (rankSize == quickOneshotRankSize || dataSize >= cceSmallDataSize) ? rankSize * twoBlockNum : rankSize; +} + +uint32_t GetReduceScatterBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extraFlag) +{ + constexpr uint32_t twoBlockNum = 2; + constexpr int64_t smallDataSize = 1 * 1024 * 1024; + constexpr uint32_t quickOneshotRankSize = 2; + constexpr int64_t cceSmallDataSize = 2 * 1024 * 1024; + constexpr int64_t a3BigDataSize = 32 * 1024 * 1024; + constexpr uint32_t fourStepBlockNum = 34; + constexpr uint32_t a3SupportRankSize = 4; + constexpr uint32_t smallRankSize = 8; + constexpr uint32_t dbRingBlockNum = 36; + + const bool isDbRing = (rankSize == a3SupportRankSize || rankSize == smallRankSize) && + (dataSize * smallRankSize > cceSmallDataSize && dataSize * smallRankSize <= a3BigDataSize); - uint32_t blockDim = GetLocalReduceBlockDum(dataSize); - return (rankSize + blockDim - 1) / blockDim; + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && (rankSize > smallRankSize || isDbRing)) { + if (isDbRing) { + return dbRingBlockNum; + } else { + return dataSize < smallDataSize ? rankSize : fourStepBlockNum; + } + } else { + return (rankSize == quickOneshotRankSize || dataSize >= cceSmallDataSize) ? rankSize * twoBlockNum : rankSize; + } } +uint32_t GetAll2AllBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extraFlag) +{ + constexpr uint32_t twoStepBlockNum = 16; + constexpr uint32_t twoBlockNum = 2; + constexpr int64_t smallDataSize = 1 * 1024 * 1024; + constexpr uint32_t smallRankSize = 8; + + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { + if (rankSize <= smallRankSize && dataSize > smallDataSize && + dataSize % (smallRankSize * smallRankSize * rankSize) == 0) { + return twoStepBlockNum * twoBlockNum; + } else { + return rankSize <= twoStepBlockNum ? rankSize * twoBlockNum : twoStepBlockNum * twoBlockNum; + } + } + return rankSize * twoBlockNum; +} +uint32_t GetAllGatherBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extraFlag) +{ + constexpr uint32_t axRankSize = 16; + constexpr uint32_t twoBlockNum = 2; + constexpr uint32_t quickOneshotRankSize = 2; + constexpr uint32_t allGatherHDBRingBlockNum = 32; + constexpr uint32_t cceSmallDataSize = 2 * 1024 * 1024; + constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; + constexpr uint32_t smallRankSize = 8; + + if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && (rankSize == axRankSize)) { + constexpr uint32_t axBlockNum = 10; + return axBlockNum; + } else if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { + return rankSize * twoBlockNum; + } + + if (GetParallel()) { + return rankSize; + } + + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && + (dataSize > smallDataSize910a3 || rankSize > smallRankSize) && + rankSize > quickOneshotRankSize && rankSize % quickOneshotRankSize == 0) { + return allGatherHDBRingBlockNum; + } + return (rankSize == quickOneshotRankSize || dataSize >= cceSmallDataSize) ? 
rankSize * twoBlockNum : rankSize;
+}
+
+uint32_t GetKernelBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize, int localRankSize, uint32_t extraFlag)
+{
+    constexpr uint32_t twoStepBlockNum = 16;
+    constexpr uint32_t twoBlockNum = 2;
+    constexpr int64_t smallDataSize = 1 * 1024 * 1024;
+    constexpr uint32_t gatherDefaultBlockNum = 4;
+    const uint32_t rankSizeLocal = static_cast<uint32_t>(localRankSize);
+
+    if (cclType == LcalType::LOCAL_REDUCE) {
+        return GetLocalReduceBlockDum(dataSize);
+    }
+
+    if (cclType == LcalType::BROADCAST) {
+        return rankSize;
+    }
+
+    if (cclType == LcalType::ALL2ALL_V_C) {
+        return twoStepBlockNum * twoBlockNum;
+    }
+    if (cclType == LcalType::ALL2ALL) {
+        return GetAll2AllBlockNum(rankSize, dataSize, extraFlag);
+    }
+    if (cclType == LcalType::BANDWIDTH) {
+        return twoStepBlockNum * twoBlockNum;
+    }
+    if (cclType == LcalType::ALL_REDUCE) {
+        return GetAllReduceBlockNum(rankSize, dataSize, extraFlag);
+    }
+    if (cclType == LcalType::REDUCE_SCATTER) {
+        return GetReduceScatterBlockNum(rankSize, dataSize, extraFlag);
+    }
+    if (cclType == LcalType::ALL_GATHER) {
+        return GetAllGatherBlockNum(rankSize, dataSize, extraFlag);
+    }
+    if (cclType == LcalType::GATHER) {
+        return gatherDefaultBlockNum;
+    }
+    bool sendOrRecv = cclType == LcalType::RECV || cclType == LcalType::SEND;
+    if (sendOrRecv) {
+        return dataSize <= smallDataSize ? rankSizeLocal : rankSizeLocal * twoBlockNum;
+    }
+    return twoBlockNum;
+}
+
+uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize,
+    int localRankSize, uint32_t extraFlag) const
+{
+    uint32_t blockNum = GetKernelBlockNum(cclType, rankSize, dataSize, localRankSize, extraFlag);
+    if (comm_->isEnableMix_) {
+        constexpr uint32_t aivNumPerAic = 2;
+        if (blockNum % aivNumPerAic == 1) {
+            MKI_LOG(ERROR) << "Lccl not support odd block number at msprof op enabled!";
+            return 0;
+        }
+        return blockNum / aivNumPerAic;
+    } else {
+        return blockNum;
+    }
+}
+
+int Lccl::LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const
+{
+    if (sendBuff != recvBuff) {
+        auto ret = aclrtMemcpyAsync(recvBuff, Count2Size(count, dataType), sendBuff, Count2Size(count, dataType),
+            ACL_MEMCPY_DEVICE_TO_DEVICE, stream);
+        if (ret != 0) {
+            MKI_LOG(ERROR) << "LoopBack failed!";
+            return LCAL_ERROR_INTERNAL;
+        }
+    }
+    return LCAL_SUCCESS;
+}
+
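+// [Review note, added in editing, not by the patch author] AllReduce below
+// picks one of three launch paths: the int8 -> fp16 quantized path, the
+// QUANT_DELAY / QUANT_CURRENT path, and the plain path, on top of the
+// single-rank LoopBack shortcut. All three build an AscendCCLKernelArgs and
+// end in the same LoadMTE call; only the scale/offset payload differs.
+// Minimal caller sketch, assuming an initialized LcalComm *comm and a stream:
+//     Lccl lccl(comm);
+//     int rc = lccl.AllReduce(sendBuf, recvBuf, count,
+//         HCCL_DATA_TYPE_FP16, HCCL_REDUCE_SUM, stream);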
+int Lccl::AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, HcclReduceOp op,
+    aclrtStream stream, HcclDataType outputDataType, const void *scale, int64_t scaleCount, const void *offset) const
+{
+    if (!CheckBuff(sendBuff, recvBuff)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (!CheckDataType(dataType) || op == HCCL_REDUCE_PROD ||
+        (outputDataType != HCCL_DATA_TYPE_RESERVED && !CheckDataType(outputDataType))) {
+        MKI_LOG(ERROR) << "Lccl not support.";
+        return LCAL_ERROR_NOT_INITIALIZED;
+    }
+    std::unique_ptr<ReportTiming> report;
+    if (comm_->isEnableMsprofOp_) {
+        report = std::make_unique<ReportTiming>("LcclAllReduce", comm_->rank_, true,
+            comm_->commArgs_.dumpAddr, stream);
+    } else {
+        report = std::make_unique<ReportTiming>("LcclAllReduce", comm_->commDomain_, count, dataType);
+    }
+    if ((dataType == HCCL_DATA_TYPE_INT8 && outputDataType == HCCL_DATA_TYPE_FP16) !=
+        static_cast<bool>(comm_->commArgs_.extraFlag & ExtraFlag::QUANT_FP16)) {
+        if (dataType == HCCL_DATA_TYPE_INT8 && outputDataType == HCCL_DATA_TYPE_FP16) {
+            comm_->commArgs_.extraFlag |= ExtraFlag::QUANT_FP16;
+        } else {
+            comm_->commArgs_.extraFlag &= ~ExtraFlag::QUANT_FP16;
+        }
+
+        auto ret = aclrtMemcpyAsync(comm_->commArgsPtr_, sizeof(CommArgs), &(comm_->commArgs_), sizeof(CommArgs),
+            ACL_MEMCPY_HOST_TO_DEVICE, stream);
+        if (ret != ACL_SUCCESS) {
+            MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret;
+            return LCAL_ERROR_INTERNAL;
+        }
+    }
+
+    if ((comm_->commArgs_.extraFlag & ExtraFlag::QUANT_FP16) != 0 &&
+        (comm_->commArgs_.extraFlag & (ExtraFlag::QUANT_DELAY | ExtraFlag::QUANT_CURRENT)) == 0) {
+        uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType),
+            comm_->localRankSize_, comm_->commArgs_.extraFlag);
+        AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0,
+            scale, scaleCount, offset};
+        comm_->magic_++;
+        return LoadMTE(LcalType::ALL_REDUCE, ascendArgs, blockDim, dataType, stream);
+    }
+
+    if (rankSize_ <= 1) {
+        return LoopBack(sendBuff, recvBuff, count, dataType, stream);
+    }
+
+    if ((comm_->commArgs_.extraFlag & (ExtraFlag::QUANT_DELAY | ExtraFlag::QUANT_CURRENT)) != 0) {
+        uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType),
+            comm_->localRankSize_, comm_->commArgs_.extraFlag);
+        AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale,
+            scaleCount};
+        comm_->magic_++;
+        return LoadMTE(LcalType::ALL_REDUCE, ascendArgs, blockDim, dataType, stream);
+    }
+
+    uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType),
+        comm_->localRankSize_, comm_->commArgs_.extraFlag);
+    AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0};
+    comm_->magic_++;
+    return LoadMTE(LcalType::ALL_REDUCE, ascendArgs, blockDim, dataType, stream);
+}
+
+bool Lccl::CheckDataType(const HcclDataType &dataType) const
+{
+    return (dataType == HCCL_DATA_TYPE_INT32 or dataType == HCCL_DATA_TYPE_FP16 or dataType == HCCL_DATA_TYPE_FP32 or
+        dataType == HCCL_DATA_TYPE_INT8 or dataType == HCCL_DATA_TYPE_INT16 or dataType == HCCL_DATA_TYPE_BFP16 or
+        dataType == HCCL_DATA_TYPE_INT64);
+}
+
+bool Lccl::CheckBuff(const void *sendBuff, const void *recvBuff) const
+{
+    bool res = true;
+    if (sendBuff == nullptr) {
+        MKI_LOG(ERROR) << "Lccl sendBuff is nullptr";
+        res = false;
+    } else if (recvBuff == nullptr) {
+        MKI_LOG(ERROR) << "Lccl recvBuff is nullptr";
+        res = false;
+    }
+    return res;
+}
+
+int Lccl::ReduceScatter(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, HcclReduceOp op,
+    aclrtStream stream) const
+{
+    if (!CheckBuff(sendBuff, recvBuff)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (rankSize_ <= 1) {
+        return LoopBack(sendBuff, recvBuff, count, dataType, stream);
+    }
+    std::unique_ptr<ReportTiming> report;
+    if (comm_->isEnableMsprofOp_) {
+        report = std::make_unique<ReportTiming>("LcclReduceScatter", comm_->rank_, true,
+            comm_->commArgs_.dumpAddr, stream);
+    } else {
+        report = std::make_unique<ReportTiming>("LcclReduceScatter", comm_->commDomain_, count, dataType);
+    }
+    if (CheckDataType(dataType) and op != HCCL_REDUCE_PROD) {
+        uint32_t blockDim = GetBlockNum(LcalType::REDUCE_SCATTER, rankSize_, Count2Size(count, dataType),
+            comm_->localRankSize_, comm_->commArgs_.extraFlag);
+        AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op};
+        comm_->magic_++;
+        return LoadMTE(LcalType::REDUCE_SCATTER, ascendArgs, blockDim, dataType, stream);
+    }
+    MKI_LOG(ERROR) << "Lccl not support.";
+    return LCAL_ERROR_NOT_INITIALIZED;
+}
+
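+// [Review note, added in editing, not by the patch author] ReduceScatter above
+// and AllGather below share one call shape: CheckBuff -> single-rank LoopBack
+// shortcut -> ReportTiming scope -> GetBlockNum -> LoadMTE; only the LcalType
+// and the AscendCCLKernelArgs payload differ. E.g. gathering 1024 fp16
+// elements per rank (assuming comm and stream are initialized):
+//     Lccl lccl(comm);
+//     int rc = lccl.AllGather(sendBuf, recvBuf, 1024, HCCL_DATA_TYPE_FP16, stream);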
+    int64_t count, HcclDataType dataType, aclrtStream stream) const
+{
+    if (!CheckBuff(sendBuff, recvBuff)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (rankSize_ <= 1) {
+        return LoopBack(sendBuff, recvBuff, count, dataType, stream);
+    }
+    std::unique_ptr<ReportTiming> report;
+    if (comm_->isEnableMsprofOp_) {
+        report = std::make_unique<ReportTiming>("LcclAllGahter", comm_->rank_, true,
+            comm_->commArgs_.dumpAddr, stream);
+    } else {
+        report = std::make_unique<ReportTiming>("LcclAllGahter", comm_->commDomain_, count, dataType);
+    }
+    AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0};
+    comm_->magic_++;
+    uint32_t blockDim = GetBlockNum(LcalType::ALL_GATHER, rankSize_, Count2Size(count, dataType),
+        comm_->localRankSize_, comm_->commArgs_.extraFlag);
+    return LoadMTE(LcalType::ALL_GATHER, ascendArgs, blockDim, dataType, stream);
+}
+
+int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const
+{
+    constexpr int32_t supportRankNum = 2;
+    if (!CheckBuff(sendBuff, recvBuff) || (rankSize_ > 1 && rankSize_ % supportRankNum != 0)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (rankSize_ <= 1) {
+        return LoopBack(sendBuff, recvBuff, count, dataType, stream);
+    }
+    ReportTiming report("LcclAll2All", comm_->commDomain_, count, dataType);
+    AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0, 0};
+    comm_->magic_++;
+    uint32_t blockDim = GetBlockNum(LcalType::ALL2ALL, rankSize_, Count2Size(count, dataType),
+        comm_->localRankSize_, comm_->commArgs_.extraFlag);
+    return LoadMTE(LcalType::ALL2ALL, ascendArgs, blockDim, dataType, stream);
+}
+
+int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, int burstLen,
+    int stride, HcclDataType dataType, aclrtStream stream) const
+{
+    if (!CheckBuff(sendBuff, recvBuff)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (rankSize_ <= 1) {
+        return LoopBack(sendBuff, recvBuff, count, dataType, stream);
+    }
+    ReportTiming report("LcclAll2AllTranspose", comm_->commDomain_, count, dataType);
+
+    AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, burstLen, stride};
+    comm_->magic_++;
+    uint32_t blockDim = GetBlockNum(LcalType::ALL2ALL, rankSize_, Count2Size(count, dataType),
+        comm_->localRankSize_, comm_->commArgs_.extraFlag);
+    return LoadMTE(LcalType::ALL2ALL, ascendArgs, blockDim, dataType, stream);
+}
+
+int64_t GetSizeByHcclDataType(const HcclDataType &dataType)
+{
+    int64_t dataSize = sizeof(int);
+    switch (dataType) {
+        case HCCL_DATA_TYPE_INT8:
+        case HCCL_DATA_TYPE_UINT8:
+            dataSize = sizeof(int8_t);
+            break;
+        case HCCL_DATA_TYPE_INT16:
+        case HCCL_DATA_TYPE_FP16:
+        case HCCL_DATA_TYPE_BFP16:
+        case HCCL_DATA_TYPE_UINT16:
+            dataSize = sizeof(int16_t);
+            break;
+        case HCCL_DATA_TYPE_FP32:
+        case HCCL_DATA_TYPE_INT32:
+        case HCCL_DATA_TYPE_UINT32:
+            dataSize = sizeof(int32_t);
+            break;
+        case HCCL_DATA_TYPE_INT64:
+        case HCCL_DATA_TYPE_UINT64:
+            dataSize = sizeof(int64_t);
+            break;
+        default:
+            MKI_LOG(ERROR) << "unknown datatype";
+    }
+    return dataSize;
+}
+
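// Worked example (editorial sketch, not code from this patch):
// GetSizeByHcclDataType turns an element count into a byte size, falling back
// to sizeof(int) with an error log for unrecognized enum values. For instance:
//     int64_t bytes = 1024 * GetSizeByHcclDataType(HCCL_DATA_TYPE_BFP16);
//     // 1024 BF16 elements -> 1024 * 2 = 2048 bytes
+int Lccl::Broadcast(void *buff, int64_t count, HcclDataType dataType, int32_t root, aclrtStream stream) const
+{
+    constexpr int supportRankSize = 8;
+    if (rankSize_ <= 1) {
+        return LCAL_SUCCESS;
+    }
+    if (rankSize_ > supportRankSize) {
+        MKI_LOG(ERROR) << "Broadcast does not support ranksize over 8";
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (!CheckBuff(sendBuff, recvBuff)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+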
ReportTiming report("LcclBroadcast", comm_->commDomain_, count, dataType); + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_}; + comm_->magic_++; + uint32_t blockDim = GetBlockNum(LcalType::BROADCAST, rankSize_, Count2Size(count, dataType), + comm_->localRankSize_, comm_->commArgs_.extraFlag); + return LoadMTE(LcalType::BROADCAST, args, blockDim, dataType, stream); +} + +Lccl::~Lccl() +{ + if (rankSize_ == -1 and comm_ != nullptr) { + delete comm_; + } +} + +Lccl::Lccl(LcalComm *comm) : comm_(comm) +{ + if (comm != nullptr) { + rank_ = comm->rank_; + rankSize_ = comm->rankSize_; + } else { + MKI_LOG(ERROR) << "com is nullptr."; + comm_ = new (std::nothrow) LcalComm(0, 0); + if (comm_ == nullptr) { + MKI_LOG(ERROR) << "LcalComm create failed " << __LINE__; + } + rankSize_ = -1; + } +} + +Lccl::Lccl(LcalComm &comm) : comm_(&comm) +{ + rank_ = comm.rank_; + rankSize_ = comm.rankSize_; +} } \ No newline at end of file -- Gitee From 3e0ead2ecd7989d90dc2d455bbdbad9333e0af98 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 16:10:41 +0800 Subject: [PATCH 187/414] draft --- comm/lcal/src/lccl.cpp | 64 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index d19f509a..bed56fb6 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -59,7 +59,7 @@ uint32_t GetAllReduceDetermBlockNum(uint32_t rankSize, int64_t dataSize, uint32_ const bool isAivNumSupport = ((extraFlag & ExtraFlag::IS_GREATER_THAN_40_AIV) != 0 || rankSize * threeStepNum <= maxAivNum); if (rankSize % quickOneshotRankSize == 1 || rankSize == quickOneshotRankSize || - (rankSize << rankSize910a3 && dataSize <= smallDataSize910a3 && isAivNumSupport)) { + (rankSize <= rankSize910a3 && dataSize <= smallDataSize910a3 && isAivNumSupport)) { return rankSize * threeStepNum; } else { return dbRingBlockNum; @@ -97,7 +97,7 @@ uint32_t GetAllReduceBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extr return rankSize; } - if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && + if ((extraFlag & ExtraFlag::TOPO_910_93) != 0 && dataSize > smallDataSize910a3 && (rankSize != quickOneshotRankSize)) { return rankSize % quickOneshotRankSize == 0 ? dbRingBlockNum : rankSize * threeStepNum; } @@ -123,7 +123,7 @@ uint32_t GetReduceScatterBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t if (isDbRing) { return dbRingBlockNum; } else { - return dataSize < smallDataSize ? rankSize : fourStepBlockNum; + return dataSize <= smallDataSize ? rankSize : fourStepBlockNum; } } else { return (rankSize == quickOneshotRankSize || dataSize >= cceSmallDataSize) ? 
rankSize * twoBlockNum : rankSize; @@ -156,7 +156,7 @@ uint32_t GetAllGatherBlockNum(uint32_t rankSize, int64_t dataSize, uint32_t extr constexpr uint32_t quickOneshotRankSize = 2; constexpr uint32_t allGatherHDBRingBlockNum = 32; constexpr uint32_t cceSmallDataSize = 2 * 1024 * 1024; - constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; + constexpr int64_t smallDataSize910a3 = 32 * 1024 * 1024; constexpr uint32_t smallRankSize = 8; if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && (rankSize == axRankSize)) { @@ -215,11 +215,11 @@ uint32_t GetKernelBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize if (cclType == LcalType::GATHER) { return gatherDefaultBlockNum; } - bool sendOrRecv = cclType == LcalType == LcalType::RECV || cclType == LcalType::SEND; + bool sendOrRecv = cclType == LcalType::RECV || cclType == LcalType::SEND; if (sendOrRecv) { return dataSize <= smallDataSize ? rankSizeLocal : rankSizeLocal * twoBlockNum; } - return twoBlockNum; + return twoStepBlockNum; } uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize, @@ -251,8 +251,8 @@ int Lccl::LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclData return LCAL_SUCCESS; } -int AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, HcclReduceOp op, - aclrtStream stream, HcclDataType outputDataType, const void *scale, int64_t scaleCount, const void *offset = nullptr) const +int Lccl::AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, HcclReduceOp op, + aclrtStream stream, HcclDataType outputDataType, const void *scale, int64_t scaleCount, const void *offset) const { if (!CheckBuff(sendBuff, recvBuff)) { return LCAL_ERROR_PARA_CHECK_FAIL; @@ -270,14 +270,14 @@ int AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataTy report = std::make_unique("LcclAllReduce", comm_->commDomain_, count, dataType); } if ((dataType == HCCL_DATA_TYPE_INT8 && outputDataType == HCCL_DATA_TYPE_FP16) != - static_cast(comm->commArgs_.extraFlag & ExtraFlag::QUANT_FP16)) { + static_cast(comm_->commArgs_.extraFlag & ExtraFlag::QUANT_FP16)) { if (dataType == HCCL_DATA_TYPE_INT8 && outputDataType == HCCL_DATA_TYPE_FP16) { - comm_->commArgs_.extraFlag != ExtraFlag::QUANT_FP16; + comm_->commArgs_.extraFlag |= ExtraFlag::QUANT_FP16; } else { comm_->commArgs_.extraFlag &= ~ExtraFlag::QUANT_FP16; } - auto ret = aclrtMemcpyAsync(comm->commArgsPtr_, sizeof(CommArgs), &(comm_->commArgs_), sizeof(CommArgs), + auto ret = aclrtMemcpyAsync(comm_->commArgsPtr_, sizeof(CommArgs), &(comm_->commArgs_), sizeof(CommArgs), ACL_MEMCPY_HOST_TO_DEVICE, stream); if (ret != ACL_SUCCESS) { MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret; @@ -302,17 +302,17 @@ int AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataTy if ((comm_->commArgs_.extraFlag & (ExtraFlag::QUANT_DELAY | ExtraFlag::QUANT_CURRENT)) != 0) { uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale, - scaleCount}; + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale, + scaleCount}; comm_->magic_++; - return LoadMTE(LcalType::ALL_REDUCE, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL_REDUCE, args, blockDim, dataType, stream); } uint32_t blockDim = 
GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0}; + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0}; comm_->magic_++; - return LoadMTE(LcalType::ALL_REDUCE, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL_REDUCE, args, blockDim, dataType, stream); } bool Lccl::CheckDataType(const HcclDataType &dataType) const @@ -354,9 +354,9 @@ int Lccl::ReduceScatter(void *sendBuff, void *recvBuff, int64_t count, HcclDataT if (CheckDataType(dataType) and op != HCCL_REDUCE_PROD) { uint32_t blockDim = GetBlockNum(LcalType::REDUCE_SCATTER, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op}; + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op}; comm_->magic_++; - return LoadMTE(LcalType::REDUCE_SCATTER, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::REDUCE_SCATTER, args, blockDim, dataType, stream); } MKI_LOG(ERROR) << "Lccl not support."; return LCAL_ERROR_NOT_INITIALIZED; @@ -372,16 +372,16 @@ int Lccl::AllGather(void *sendBuff, void *recvBuff, int64_t count, HcclDataType } std::unique_ptr report; if (comm_->isEnableMsprofOp_) { - report = std::make_unique("LcclAllGahter", comm_->rank_, true, + report = std::make_unique("LcclAllGather", comm_->rank_, true, comm_->commArgs_.dumpAddr, stream); } else { - report = std::make_unique("LcclAllGahter", comm_->commDomain_, count, dataType); + report = std::make_unique("LcclAllGather", comm_->commDomain_, count, dataType); } - AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0}; + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0}; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::ALL_GATHER, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - return LoadMTE(LcalType::ALL_GATHER, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL_GATHER, args, blockDim, dataType, stream); } int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const @@ -394,15 +394,15 @@ int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, HcclDataType da return LoopBack(sendBuff, recvBuff, count, dataType, stream); } ReportTiming report("LcclAll2All", comm_->commDomain_, count, dataType); - AscendCCLKernelArgs ascendArgs = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0, 0}; + AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0, 0}; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::ALL2ALL, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - return LoadMTE(LcalType::ALL2ALL, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL2ALL, args, blockDim, dataType, stream); } -int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, int burstLen, - int stride, HcclDataType dataType, aclrtStream stream) const +int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, int32_t burstLen, + int32_t stride, HcclDataType dataType, aclrtStream stream) const { if 
(!CheckBuff(sendBuff, recvBuff)) { return LCAL_ERROR_PARA_CHECK_FAIL; @@ -412,11 +412,11 @@ int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, int burstLen, } ReportTiming report("LcclAll2AllTranspose", comm_->commDomain_, count, dataType); - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, burstLen, stride}; + AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, burstLen, stride}; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::ALL2ALL, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - return LoadMTE(LcalType::ALL2ALL, ascendArgs, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL2ALL, args, blockDim, dataType, stream); } int64_t GetSizeByHcclDataType(const HcclDataType &dataType) @@ -458,15 +458,15 @@ int Lccl::Broadcast(void *buff, int64_t count, HcclDataType dataType, int32_t ro MKI_LOG(ERROR) << "Broadcast does not support ranksize over 8"; return LCAL_ERROR_PARA_CHECK_FAIL; } - if (!CheckBuff(sendBuff, recvBuff)) { + if (!CheckBuff(buff, buff)) { return LCAL_ERROR_PARA_CHECK_FAIL; } ReportTiming report("LcclBroadcast", comm_->commDomain_, count, dataType); - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_}; + AscendCCLKernelArgs args = {buff, buff, comm_->commArgsPtr_, count, comm_->magic_, 0, root}; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::BROADCAST, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - return LoadMTE(LcalType::BROADCAST, args, blockDim, dataType, stream); + return LoadMTE(LcalType::BROADCAST, args, blockDim, dataType, stream); } Lccl::~Lccl() @@ -482,7 +482,7 @@ Lccl::Lccl(LcalComm *comm) : comm_(comm) rank_ = comm->rank_; rankSize_ = comm->rankSize_; } else { - MKI_LOG(ERROR) << "com is nullptr."; + MKI_LOG(ERROR) << "comm is nullptr."; comm_ = new (std::nothrow) LcalComm(0, 0); if (comm_ == nullptr) { MKI_LOG(ERROR) << "LcalComm create failed " << __LINE__; -- Gitee From c0ae039cdad22dc3a9131e6694e580ed506179fc Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 16:13:14 +0800 Subject: [PATCH 188/414] draft --- comm/lcal/src/lccl.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index bed56fb6..a92d54e6 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -302,8 +302,8 @@ int Lccl::AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType if ((comm_->commArgs_.extraFlag & (ExtraFlag::QUANT_DELAY | ExtraFlag::QUANT_CURRENT)) != 0) { uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale, - scaleCount}; + AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale, + scaleCount }; comm_->magic_++; return LoadMTE(LcalType::ALL_REDUCE, args, blockDim, dataType, stream); } @@ -354,9 +354,9 @@ int Lccl::ReduceScatter(void *sendBuff, void *recvBuff, int64_t count, HcclDataT if (CheckDataType(dataType) and op != HCCL_REDUCE_PROD) { uint32_t blockDim = GetBlockNum(LcalType::REDUCE_SCATTER, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, 
count, comm_->magic_, op}; + AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0 }; comm_->magic_++; - return LoadMTE(LcalType::REDUCE_SCATTER, args, blockDim, dataType, stream); + return LoadMTE(LcalType::REDUCE_SCATTER, args, blockDim, dataType, stream); } MKI_LOG(ERROR) << "Lccl not support."; return LCAL_ERROR_NOT_INITIALIZED; @@ -377,7 +377,7 @@ int Lccl::AllGather(void *sendBuff, void *recvBuff, int64_t count, HcclDataType } else { report = std::make_unique("LcclAllGather", comm_->commDomain_, count, dataType); } - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0}; + AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0 }; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::ALL_GATHER, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); @@ -394,11 +394,11 @@ int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, HcclDataType da return LoopBack(sendBuff, recvBuff, count, dataType, stream); } ReportTiming report("LcclAll2All", comm_->commDomain_, count, dataType); - AscendCCLKernelArgs args = {sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0, 0}; + AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, 0, 0, 0 }; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::ALL2ALL, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); - return LoadMTE(LcalType::ALL2ALL, args, blockDim, dataType, stream); + return LoadMTE(LcalType::ALL2ALL, args, blockDim, dataType, stream); } int Lccl::All2All(void *sendBuff, void *recvBuff, int64_t count, int32_t burstLen, @@ -462,7 +462,7 @@ int Lccl::Broadcast(void *buff, int64_t count, HcclDataType dataType, int32_t ro return LCAL_ERROR_PARA_CHECK_FAIL; } ReportTiming report("LcclBroadcast", comm_->commDomain_, count, dataType); - AscendCCLKernelArgs args = {buff, buff, comm_->commArgsPtr_, count, comm_->magic_, 0, root}; + AscendCCLKernelArgs args = { buff, buff, comm_->commArgsPtr_, count, comm_->magic_, 0, root }; comm_->magic_++; uint32_t blockDim = GetBlockNum(LcalType::BROADCAST, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); -- Gitee From 26e4cf39995a6f999357970e21daa45e0a5487d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:26:56 +0800 Subject: [PATCH 189/414] draft --- comm/lcal/src/kernels/collectives.cce | 268 ++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index e69de29b..3fb680f4 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_COLLECTIVES_H +#define LCAL_COLLECTIVES_H + +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M220_VEC__) && !defined(__DAV_C220_CUBE__) +#define __aicore__ +#define __ubuf__ +#define __gm__ +#endif + +#include +#include +#include "kernel_operator.h" +#include "comm_args.h" +#include "../ascendc_kernels/datacopy_gm2gm.h" +#include "coc_internal.cce" +using namespace AscendC; +using namespace Lcal; +constexpr int64_t UB_MAX_SIZE = 196608; + +constexpr int64_t MEM_DMA_UNIT_BYTE = 32; + +constexpr int64_t DMA_SIZE_PER_FLAG = UB_SINGLE_DMA_SIZE_MAX; + +constexpr int64_t EXCEPTION_VALUE = -11327; + +constexpr int64_t SIZE_OF_2M = 2 * 1024 * 1024; + +constexpr int64_t SIZE_OF_8M = 8 * 1024 * 1024; + +constexpr int64_t SIZE_OF_1M = 1 * 1024 * 1024; + +constexpr int64_t MAX_RANK_NUM_OF_ONE_910B2C = 16; + +constexpr int64_t MAX_SEND_COUNT_MATRIX_SIZ_OF_ONE_910B2C = MAX_RANK_NUM_OF_ONE_910B2C * MAX_RANK_NUM_OF_ONE_910B2C; + +constexpr int64_t ALL2ALL_V_C_BUFF_SIZE_PER_PARAGRAPH_910B2C = IPC_BUFF_MAX_SIZE / MAX_RANK_NUM_OF_ONE_910B2C / 2 * 2; + +constexpr int64_t DETERMINISTIC_BUFF_SIZE = (IPC_BUFF_MAX_SIZE >> 1) - 4 * 1024; + +#define ALLREDUCE_ARGS_FUN(T) \ +__gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ +int localrankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ +__gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4,\ +__gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7 + +#define ALLREDUCE_ARGS_CALL(type) \ +(__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ +magic, op, root, localrankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ +shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7] + +#define ALLREDUCE_ARGS_FUN_16P(T) \ +__gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ +int localrankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ +__gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4,\ +__gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7, __gm__ T *buff8, __gm__ T *buff9, \ +__gm__ T *buff10, __gm__ T *buff11, __gm__ T *buff12, __gm__ T *buff13, __gm__ T *buff14, __gm__ T *buff15 + +#define ALLREDUCE_ARGS_CALL_16P(type) \ +(__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ +magic, op, root, localrankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ +shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7], shareAddrs[8], shareAddrs[9], \ +shareAddrs[10], shareAddrs[11], shareAddrs[12], shareAddrs[13], shareAddrs[14], shareAddrs[15] + +#define ALLREDUCE_ARGS_FUN_16P_Origin(T) \ +__gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ +int localrankSize, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, __gm__ T* buff[MAX_RANK_NUM_OF_ONE_910B2C] + +#define ALLREDUCE_ARGS_CALL_16P_Origin(T) \ +input, output, rank, rankSize, len, magic, op, root, localrankSize, sendCountMatrix, dumpAddr, buff + +#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P_Origin(processdNum, remainNum, magic) \ +(input + (processedNum)), (output + (processedNum)), rank, rankSize, (remainNum), (maigc), op, root, \ +localRankSize, sendCountMatrix, dumpAddr, buff + +#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P(magic) \ +input, output, rank, rankSize, len, (magic), op, 
root, localrankSize, sendCountMatrix, dumpAddr, \ +buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7, buff8, buff9, buff10, buff11, \ +buff12, buff13, buff14, buff15 + +__attribute__((always_inline)) inline __aicore__ int64_t CeilDiv(int64_t source, int64_t cardinality) +{ + return (((source) + (cardinality) - 1) / (cardinality)); +} + +constexpr int64_t UB_SINGLE_ADD_SIZE_MAX = UB_SINGLE_ADD_SIZE_MAX; + +__attribute__((always_inline)) inline __aicore__ void CpUB2GMAlignB16(__gm__ void* gmAddr, __ubuf__ void* ubAddr, uint32_t size) +{ + CopyUbfugToGmAlignB16(gmAddr, ubAddr, 1, size, 0, 0); +} + +__attribute__((always_inline)) inline __aicore__ void CpGM2UBAlignB16(__ubuf__ void* ubAddr, __gm__ void* gmAddr, uint32_t size) +{ + CopyGmToUbfugAlignB16(ubAddr, gmAddr, 1, size, 0, 0); +} + +__attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR workspaceDumpAddr, LogId logId, Op operationType) +{ +#ifdef ENABLE_LCCL_DUMP + constexpr int32_t UB_HEAD_OFFSET = 96; + + AscendC::PipeBarrier(); + GM_ADDR blockGm = (GM_ADDR)(workspaceDumpAddr + LCCL_DUMP_UINT_SIZE * GetBlockIdx()); + __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); + __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); + + CpGm2UB((__ubuf__ uint8_t *)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); + AscendC::PipeBarrier(); + + if (blockUb->dumpOffset < sizeof(LcclDumpLogInfo)) { + return; + } + + logUb->logId = logId; + logUb->blockId = GetBlockIdx(); + logUb->syscyc = static_cast(GetSystemCycle()); + logUb->curPc = static_cast(get_pc()); + logUb->operationType = operationType; + logUb->rsv = 0; + CpUB2GM((GM_ADDR) blockUb->dumpAddr, (GM_ADDR) logUb, sizeof(LcclDumpLogInfo)); + + blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); + blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); + CpUB2GM(blockGm, (__ubuf__ uint8_t*) blockUb, sizeof(LcclDumpBlockInfo)); + AscendC::PipeBarrier(); +#endif +} + +__attribute__((always_inline)) inline __aicore__ void SetFlag(__ubuf__ int64_t *ctrlFlagsUB, __gm__ int64_t *ctrlFlagGM, + int64_t checkValue) +{ + AscendC::PipeBarrier(); + *ctrlFlagsUB = checkValue; + AscendC::SetFlag(ENENT_ID1); + AscendC::WaitFlag(ENENT_ID1); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); + AscendC::PipeBarrier(); +} + +__attribute__((always_inline)) inline __aicore__ void SetFlagNonPipeBarrier(__ubuf__ int64_t *ctrlFlagsUB, __gm__ int64_t *ctrlFlagGM, + int64_t checkValue) +{ + *ctrlFlagsUB = checkValue; + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); +} + +__attribute__((always_inline)) inline __aicore__ void SetFlag(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *ctrlFlagGM1, __gm__ int64_t *ctrlFlagGM2, int64_t checkValue) +{ + AscendC::PipeBarrier(); + *ctrlFlagsUB = checkValue; + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + CpUB2GM(ctrlFlagGM1, ctrlFlagsUB, sizeof(int64_t)); + CpUB2GM(ctrlFlagGM2, ctrlFlagsUB, sizeof(int64_t)); + AscendC::PipeBarrier(); +} + +__attribute__((always_inline)) inline __aicore__ void SetFlagNonPipeBarrier(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *ctrlFlagGM1, __gm__ int64_t *ctrlFlagGM2, int64_t checkValue) +{ + *ctrlFlagsUB = checkValue; + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + CpUB2GM(ctrlFlagGM1, ctrlFlagsUB, sizeof(int64_t)); + CpUB2GM(ctrlFlagGM2, ctrlFlagsUB, sizeof(int64_t)); +} + +__attribute__((always_inline)) inline __aicore__ void 
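/*
 * CheckFlag below is the consumer half of the control-flag handshake: it
 * spin-waits by re-reading an 8-byte flag word from GM into UB until the
 * value published by a producer's SetFlag (derived from the per-launch magic
 * number) appears, signalling that the payload written ahead of the flag
 * should be visible as well.
 */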
CheckFlag(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *ctrlFlagGM, int64_t checkValue) +{ + while (true) { + AscendC::PipeBarrier(); + CpGM2UB(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + if (*ctrlFlagsUB == checkValue) { + break; + } + } +} + +__attribute__((always_inline)) inline __aicore__ void CheckFlagNew(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *ctrlFlagGM, int64_t checkValue) +{ + while (true) { + AscendC::PipeBarrier(); + CpGM2UB(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); + AscendC::PipeBarrier(); + if (*ctrlFlagsUB == checkValue || (*ctrlFlagsUB) == (checkValue + 1)) { + break; + } + } +} + +__attribute__((always_inline)) inline __aicore__ int64_t GetLcalBlockNum() { + #ifdef ENABLE_LCCL_MIX + constexpr int32_t aivNumPerAic = 2; + return GetBlockNum() * aivNumPerAic; + #else + return GetBlockNum(); + #endif +} + +__attribute__((always_inline)) inline __aicore__ void SyncWithNPU(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t *buffRank, __gm__ int64_t magic) { + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buffRank + (GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); + for (int64_t i = 0; i < GetLcalBlockNum(); i++) { + if (i == GetBlockIdx()) { + continue; + } + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buffRank + i * MEM_DMA_UNIT_INT_NUM, magic); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void GM2GM( + int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, + int64_t revBuffOffsetNum, __gm__ T *sendBuff, int64_t sendBuffOffsetNum) +{ + int64_t times = 0; + while (dataSizeRemain > UB_SINGLE_DMA_SIZE_MAX) { + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + UB_SINGLE_DMA_SIZE_MAX); + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + CpUB2GM( + (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, UB_SINGLE_DMA_SIZE_MAX); + AscendC::SetFlag(ENENT_ID1); + AscendC::WaitFlag(ENENT_ID1); + times += 1; + dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; + } + if (dataSizeRemain <= 0) { + return; + } + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + dataSizeRemain); + AscendC::SetFlag(ENENT_ID0); + AscendC::WaitFlag(ENENT_ID0); + CpUB2GM( + (__gm__ T*)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + inputUB, dataSizeRemain); + AscendC::PipeBarrier(); +} + +template +__attribute__((always_inline)) inline __aicore__ void GM2GMPingPong( + int64_t dataSizeRemain, __ubuf__ T *inputUB[2], __gm__ T *receiveBuff, + int64_t revBuffOffsetNum, __gm__ T *sendBuff, int64_t sendBuffOffsetNum) +{ + if (dataSizeRemain <= 0) { + return; + } +} + +#endif \ No newline at end of file -- Gitee From 5a7efce92179cf0ad6a8891100bfee3856944349 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:29:46 +0800 Subject: [PATCH 190/414] draft --- comm/lcal/src/kernels/collectives.cce | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 3fb680f4..9bfd4043 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -10,7 +10,7 @@ #ifndef LCAL_COLLECTIVES_H #define LCAL_COLLECTIVES_H -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M220_VEC__) && !defined(__DAV_C220_CUBE__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && 
!defined(__DAV_C220_CUBE__) #define __aicore__ #define __ubuf__ #define __gm__ @@ -48,41 +48,41 @@ constexpr int64_t DETERMINISTIC_BUFF_SIZE = (IPC_BUFF_MAX_SIZE >> 1) - 4 * 1024; #define ALLREDUCE_ARGS_FUN(T) \ __gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ -int localrankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ +int localRankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ __gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4,\ __gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7 #define ALLREDUCE_ARGS_CALL(type) \ (__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ -magic, op, root, localrankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ +magic, op, root, localRankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7] #define ALLREDUCE_ARGS_FUN_16P(T) \ __gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ -int localrankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ +int localRankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ __gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4,\ __gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7, __gm__ T *buff8, __gm__ T *buff9, \ __gm__ T *buff10, __gm__ T *buff11, __gm__ T *buff12, __gm__ T *buff13, __gm__ T *buff14, __gm__ T *buff15 #define ALLREDUCE_ARGS_CALL_16P(type) \ (__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ -magic, op, root, localrankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ +magic, op, root, localRankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7], shareAddrs[8], shareAddrs[9], \ shareAddrs[10], shareAddrs[11], shareAddrs[12], shareAddrs[13], shareAddrs[14], shareAddrs[15] #define ALLREDUCE_ARGS_FUN_16P_Origin(T) \ __gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ -int localrankSize, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, __gm__ T* buff[MAX_RANK_NUM_OF_ONE_910B2C] +int localRankSize, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, __gm__ T* buff[MAX_RANK_NUM_OF_ONE_910B2C] #define ALLREDUCE_ARGS_CALL_16P_Origin(T) \ -input, output, rank, rankSize, len, magic, op, root, localrankSize, sendCountMatrix, dumpAddr, buff +input, output, rank, rankSize, len, magic, op, root, localRankSize, sendCountMatrix, dumpAddr, buff #define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P_Origin(processdNum, remainNum, magic) \ (input + (processedNum)), (output + (processedNum)), rank, rankSize, (remainNum), (maigc), op, root, \ localRankSize, sendCountMatrix, dumpAddr, buff #define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P(magic) \ -input, output, rank, rankSize, len, (magic), op, root, localrankSize, sendCountMatrix, dumpAddr, \ +input, output, rank, rankSize, len, (magic), op, root, localRankSize, sendCountMatrix, dumpAddr, \ buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7, buff8, buff9, buff10, buff11, \ buff12, buff13, buff14, buff15 -- Gitee From e9c7d283ff4a84e7eabd59e19bffe7bf2d1e171b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:33:37 +0800 
Subject: [PATCH 191/414] draft --- comm/lcal/src/kernels/collectives.cce | 54 +++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 9bfd4043..77e7a9ba 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -68,20 +68,20 @@ __gm__ T *buff10, __gm__ T *buff11, __gm__ T *buff12, __gm__ T *buff13, __gm__ T (__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ magic, op, root, localRankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7], shareAddrs[8], shareAddrs[9], \ -shareAddrs[10], shareAddrs[11], shareAddrs[12], shareAddrs[13], shareAddrs[14], shareAddrs[15] +shareAddrs[10], shareAddrs[11], shareAddrs[12], shareAddrs[13], shareAddrs[14], shareAddrs[15] \ #define ALLREDUCE_ARGS_FUN_16P_Origin(T) \ __gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ int localRankSize, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, __gm__ T* buff[MAX_RANK_NUM_OF_ONE_910B2C] -#define ALLREDUCE_ARGS_CALL_16P_Origin(T) \ +#define ALLREDUCE_ARGS_CALL_16P_Origin() \ input, output, rank, rankSize, len, magic, op, root, localRankSize, sendCountMatrix, dumpAddr, buff -#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P_Origin(processdNum, remainNum, magic) \ -(input + (processedNum)), (output + (processedNum)), rank, rankSize, (remainNum), (maigc), op, root, \ +#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_ARGS_CALL_16P_Origin(processdNum, remainNum, magic) \ +(input + (processedNum)), (output + (processedNum)), rank, rankSize, (remainNum), (magic), op, root, \ localRankSize, sendCountMatrix, dumpAddr, buff -#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_AGRS_CALL_16P(magic) \ +#define MODIFIABLE_MAGIC_ALLREDUCE_ARGS_CALL_16P(magic) \ input, output, rank, rankSize, len, (magic), op, root, localRankSize, sendCountMatrix, dumpAddr, \ buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7, buff8, buff9, buff10, buff11, \ buff12, buff13, buff14, buff15 @@ -91,16 +91,16 @@ __attribute__((always_inline)) inline __aicore__ int64_t CeilDiv(int64_t source, return (((source) + (cardinality) - 1) / (cardinality)); } -constexpr int64_t UB_SINGLE_ADD_SIZE_MAX = UB_SINGLE_ADD_SIZE_MAX; +constexpr int64_t UB_SINGLE_ADD_SIZE_MAX = UB_SINGLE_DMA_SIZE_MAX; __attribute__((always_inline)) inline __aicore__ void CpUB2GMAlignB16(__gm__ void* gmAddr, __ubuf__ void* ubAddr, uint32_t size) { - CopyUbfugToGmAlignB16(gmAddr, ubAddr, 1, size, 0, 0); + CopyUbufToGmAlignB16(gmAddr, ubAddr, 1, size, 0, 0); } __attribute__((always_inline)) inline __aicore__ void CpGM2UBAlignB16(__ubuf__ void* ubAddr, __gm__ void* gmAddr, uint32_t size) { - CopyGmToUbfugAlignB16(ubAddr, gmAddr, 1, size, 0, 0); + CopyGmToUbufAlignB16(ubAddr, gmAddr, 1, size, 0, 0); } __attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR workspaceDumpAddr, LogId logId, Op operationType) @@ -113,7 +113,7 @@ __attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR wo __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); - CpGm2UB((__ubuf__ uint8_t *)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); + CpGM2UB((__ubuf__ uint8_t *)blockUb, blockGm, 
sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); if (blockUb->dumpOffset < sizeof(LcclDumpLogInfo)) { @@ -126,11 +126,11 @@ __attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR wo logUb->curPc = static_cast(get_pc()); logUb->operationType = operationType; logUb->rsv = 0; - CpUB2GM((GM_ADDR) blockUb->dumpAddr, (GM_ADDR) logUb, sizeof(LcclDumpLogInfo)); + CpUB2GM((GM_ADDR) blockUb->dumpAddr, (__ubuf__ uint8_t*) logUb, sizeof(LcclDumpLogInfo)); blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); - CpUB2GM(blockGm, (__ubuf__ uint8_t*) blockUb, sizeof(LcclDumpBlockInfo)); + CpUB2GM(blockGm, (__ubuf__ uint8_t*)blockUb, sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); #endif } @@ -140,8 +140,8 @@ __attribute__((always_inline)) inline __aicore__ void SetFlag(__ubuf__ int64_t * { AscendC::PipeBarrier(); *ctrlFlagsUB = checkValue; - AscendC::SetFlag(ENENT_ID1); - AscendC::WaitFlag(ENENT_ID1); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); AscendC::PipeBarrier(); } @@ -150,8 +150,8 @@ __attribute__((always_inline)) inline __aicore__ void SetFlagNonPipeBarrier(__ub int64_t checkValue) { *ctrlFlagsUB = checkValue; - AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); } @@ -160,8 +160,8 @@ __attribute__((always_inline)) inline __aicore__ void SetFlag(__ubuf__ int64_t * { AscendC::PipeBarrier(); *ctrlFlagsUB = checkValue; - AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); CpUB2GM(ctrlFlagGM1, ctrlFlagsUB, sizeof(int64_t)); CpUB2GM(ctrlFlagGM2, ctrlFlagsUB, sizeof(int64_t)); AscendC::PipeBarrier(); @@ -171,8 +171,8 @@ __attribute__((always_inline)) inline __aicore__ void SetFlagNonPipeBarrier(__ub __gm__ int64_t *ctrlFlagGM1, __gm__ int64_t *ctrlFlagGM2, int64_t checkValue) { *ctrlFlagsUB = checkValue; - AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); CpUB2GM(ctrlFlagGM1, ctrlFlagsUB, sizeof(int64_t)); CpUB2GM(ctrlFlagGM2, ctrlFlagsUB, sizeof(int64_t)); } @@ -183,8 +183,8 @@ __attribute__((always_inline)) inline __aicore__ void CheckFlag(__ubuf__ int64_t while (true) { AscendC::PipeBarrier(); CpGM2UB(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); - AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); if (*ctrlFlagsUB == checkValue) { break; } @@ -232,13 +232,13 @@ __attribute__((always_inline)) inline __aicore__ void GM2GM( while (dataSizeRemain > UB_SINGLE_DMA_SIZE_MAX) { CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, UB_SINGLE_DMA_SIZE_MAX); - AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); CpUB2GM( (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); - AscendC::SetFlag(ENENT_ID1); - AscendC::WaitFlag(ENENT_ID1); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); times += 1; dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; } @@ -247,8 +247,8 @@ __attribute__((always_inline)) inline __aicore__ void GM2GM( } CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), dataSizeRemain); - 
AscendC::SetFlag(ENENT_ID0); - AscendC::WaitFlag(ENENT_ID0); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); CpUB2GM( (__gm__ T*)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), inputUB, dataSizeRemain); -- Gitee From ab85f6fe64bc638fe8eecf7ce05a6c56c42c1398 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:35:30 +0800 Subject: [PATCH 192/414] draft --- comm/lcal/src/kernels/collectives.cce | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 77e7a9ba..d5960408 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -77,7 +77,7 @@ int localRankSize, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, __gm__ T* #define ALLREDUCE_ARGS_CALL_16P_Origin() \ input, output, rank, rankSize, len, magic, op, root, localRankSize, sendCountMatrix, dumpAddr, buff -#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_ARGS_CALL_16P_Origin(processdNum, remainNum, magic) \ +#define MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_ARGS_CALL_16P_Origin(processedNum, remainNum, magic) \ (input + (processedNum)), (output + (processedNum)), rank, rankSize, (remainNum), (magic), op, root, \ localRankSize, sendCountMatrix, dumpAddr, buff @@ -213,7 +213,7 @@ __attribute__((always_inline)) inline __aicore__ int64_t GetLcalBlockNum() { #endif } -__attribute__((always_inline)) inline __aicore__ void SyncWithNPU(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t *buffRank, __gm__ int64_t magic) { +__attribute__((always_inline)) inline __aicore__ void SyncWithINNPU(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* buffRank, int64_t magic) { SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buffRank + (GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); for (int64_t i = 0; i < GetLcalBlockNum(); i++) { if (i == GetBlockIdx()) { -- Gitee From dc27e532035fd42c578734acb736b62461a341d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:36:38 +0800 Subject: [PATCH 193/414] draft --- comm/lcal/src/kernels/collectives.cce | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index d5960408..44627061 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -213,7 +213,7 @@ __attribute__((always_inline)) inline __aicore__ int64_t GetLcalBlockNum() { #endif } -__attribute__((always_inline)) inline __aicore__ void SyncWithINNPU(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* buffRank, int64_t magic) { +__attribute__((always_inline)) inline __aicore__ void SyncWithinNPU(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* buffRank, int64_t magic) { SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buffRank + (GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); for (int64_t i = 0; i < GetLcalBlockNum(); i++) { if (i == GetBlockIdx()) { @@ -223,6 +223,16 @@ __attribute__((always_inline)) inline __aicore__ void SyncWithINNPU(__ubuf__ int } } +__attribute__((always_inline)) inline __aicore__ void SyncWithinNPUNew(__ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* buffRank, int64_t magic) { + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buffRank + (GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); + for (int64_t i = 0; i < GetLcalBlockNum(); i++) { + if (i == GetBlockIdx()) { + continue; + } + CheckFlagNew((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buffRank + i * MEM_DMA_UNIT_INT_NUM, magic); + } +} + template __attribute__((always_inline)) 
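/*
 * GM2GM below stages a GM-to-GM copy through one UB buffer in
 * UB_SINGLE_DMA_SIZE_MAX chunks; the SetFlag/WaitFlag pair between the
 * inbound CpGM2UB and the outbound CpUB2GM orders the two DMA directions on
 * the shared buffer, while the ping-pong variants added later overlap them
 * by alternating between two UB buffers.
 */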
inline __aicore__ void GM2GM( int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, -- Gitee From 671b9a9f1f804624717b1e9cc7d41ea19777a7b5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 20 Aug 2025 17:40:39 +0800 Subject: [PATCH 194/414] draft --- comm/lcal/src/kernels/collectives.cce | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 44627061..19045e40 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -53,19 +53,19 @@ __gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *bu __gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7 #define ALLREDUCE_ARGS_CALL(type) \ -(__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ +(__gm__ type *)input, (__gm__ type *) output, rank, rankSize, len, \ magic, op, root, localRankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7] #define ALLREDUCE_ARGS_FUN_16P(T) \ __gm__ T *input, __gm__ T *output, int rank, int rankSize, int64_t len, int64_t magic, int op, int root, \ -int localRankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ -__gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4,\ -__gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7, __gm__ T *buff8, __gm__ T *buff9, \ +int localRankSize, int64_t loopTime, __gm__ int64_t *sendCountMatrix, GM_ADDR dumpAddr, \ +__gm__ T *buff0, __gm__ T *buff1, __gm__ T *buff2, __gm__ T *buff3, __gm__ T *buff4, \ +__gm__ T *buff5, __gm__ T *buff6, __gm__ T *buff7, __gm__ T *buff8, __gm__ T *buff9, \ __gm__ T *buff10, __gm__ T *buff11, __gm__ T *buff12, __gm__ T *buff13, __gm__ T *buff14, __gm__ T *buff15 #define ALLREDUCE_ARGS_CALL_16P(type) \ -(__gm__ type *) input, (__gm__ type *) output, rank, rankSize, len, \ +(__gm__ type *)input, (__gm__ type *) output, rank, rankSize, len, \ magic, op, root, localRankSize, 0, nullptr, dumpAddr, shareAddrs[0], shareAddrs[1], shareAddrs[2], \ shareAddrs[3], shareAddrs[4], shareAddrs[5], shareAddrs[6], shareAddrs[7], shareAddrs[8], shareAddrs[9], \ shareAddrs[10], shareAddrs[11], shareAddrs[12], shareAddrs[13], shareAddrs[14], shareAddrs[15] \ @@ -113,7 +113,7 @@ __attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR wo __ubuf__ LcclDumpBlockInfo *blockUb = (__ubuf__ LcclDumpBlockInfo*)(UB_HEAD_OFFSET); __ubuf__ LcclDumpLogInfo *logUb = (__ubuf__ LcclDumpLogInfo*)(UB_HEAD_OFFSET + sizeof(LcclDumpBlockInfo)); - CpGM2UB((__ubuf__ uint8_t *)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); + CpGM2UB((__ubuf__ uint8_t*)blockUb, blockGm, sizeof(LcclDumpBlockInfo)); AscendC::PipeBarrier(); if (blockUb->dumpOffset < sizeof(LcclDumpLogInfo)) { @@ -126,7 +126,7 @@ __attribute__((always_inline)) inline __aicore__ void DumpLcclLogInfo(GM_ADDR wo logUb->curPc = static_cast(get_pc()); logUb->operationType = operationType; logUb->rsv = 0; - CpUB2GM((GM_ADDR) blockUb->dumpAddr, (__ubuf__ uint8_t*) logUb, sizeof(LcclDumpLogInfo)); + CpUB2GM((GM_ADDR) blockUb->dumpAddr, (__ubuf__ uint8_t*)logUb, sizeof(LcclDumpLogInfo)); blockUb->dumpAddr += sizeof(LcclDumpBlockInfo); blockUb->dumpOffset -= sizeof(LcclDumpLogInfo); @@ -140,7 +140,7 @@ __attribute__((always_inline)) inline __aicore__ void SetFlag(__ubuf__ int64_t * { AscendC::PipeBarrier(); *ctrlFlagsUB = checkValue; - AscendC::SetFlag(EVENT_ID1); + 
AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); AscendC::PipeBarrier(); @@ -239,7 +239,7 @@ __attribute__((always_inline)) inline __aicore__ void GM2GM( int64_t revBuffOffsetNum, __gm__ T *sendBuff, int64_t sendBuffOffsetNum) { int64_t times = 0; - while (dataSizeRemain > UB_SINGLE_DMA_SIZE_MAX) { + while (dataSizeRemain >= UB_SINGLE_DMA_SIZE_MAX) { CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, UB_SINGLE_DMA_SIZE_MAX); AscendC::SetFlag(EVENT_ID0); @@ -247,8 +247,8 @@ __attribute__((always_inline)) inline __aicore__ void GM2GM( CpUB2GM( (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); - AscendC::SetFlag(EVENT_ID1); - AscendC::WaitFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); times += 1; dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; } @@ -256,7 +256,7 @@ __attribute__((always_inline)) inline __aicore__ void GM2GM( return; } CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), - dataSizeRemain); + dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM( -- Gitee From ea5469356615b926550b880c2982831ea980558f Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 12:00:20 +0800 Subject: [PATCH 195/414] draft --- comm/lcal/src/kernels/collectives.cce | 75 +++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 19045e40..537d30ce 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -273,6 +273,81 @@ __attribute__((always_inline)) inline __aicore__ void GM2GMPingPong( if (dataSizeRemain <= 0) { return; } + AscendC::PipeBarrier(); + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 ? EVENT_ID1; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)sendBuff + sendBuffOffsetNum, size); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? inputUB[1] : inputUB[0], size); + AscendC::SetFlag(eventId); + dataSizeRemain -= size; + sendBuffOffsetNum += (size / sizeof(T)); + revBuffOffsetNum += (size / sizeof(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + if (dataSizeRemain <= 0) { + return; + } +} + +template +__attribute__((always_inline)) inline __aicore__ void GM2GMPingPongNonPipeBarrier( + int64_t dataSizeRemain, __ubuf__ T *inputUB[2], __gm__ T *receiveBuff, + int64_t revBuffOffsetNum, __gm__ T *sendBuff, int64_t sendBuffOffsetNum) +{ + if (dataSizeRemain <= 0) { + return; + } + const int64_t offsetNumPerLoop = UB_SINGLE_PING_PONG_ADD_SIZE_MAX / sizeof(T); + uint32_t size = 0; + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 ? EVENT_ID1; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? 
inputUB[0] : inputUB[1], (__gm__ T*)sendBuff + sendBuffOffsetNum, size); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? inputUB[1] : inputUB[0], size); + AscendC::SetFlag(eventId); + dataSizeRemain -= size; + sendBuffOffsetNum += offsetNumPerLoop; + revBuffOffsetNum += offsetNumPerLoop; + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + if (dataSizeRemain <= 0) { + return; + } } +template +__attribute__((always_inline)) inline __aicore__ void input2BuffRankMagic( + int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *ipcReceiveBuff, int64_t revBuffOffsetNum, + __gm__ T *sendBuff, int64_t sendBuffOffsetNum, __ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* ctrlFlagGM, + int64_t magic) +{ + int64_t times = 0; + int64_t flag = 0; + + while (dataSizeRemain >= UB_SINGLE_DMA_SIZE_MAX) { + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + UB_SINGLE_DMA_SIZE_MAX); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T*)ipcReceiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, UB_SINGLE_DMA_SIZE_MAX); + times += 1; + flag = times; +) + + #endif \ No newline at end of file -- Gitee From 997e9700ca0aa6dfcf588c2133e9627a75f771d9 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 12:03:00 +0800 Subject: [PATCH 196/414] draft --- comm/lcal/src/kernels/collectives.cce | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 537d30ce..8ad47d6c 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -278,12 +278,12 @@ __attribute__((always_inline)) inline __aicore__ void GM2GMPingPong( AscendC::SetFlag(EVENT_ID1); for (int64_t i = 0; dataSizeRemain > 0; i++) { uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; - event_t eventId = (i & 1) ? EVENT_ID0 ? EVENT_ID1; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; AscendC::WaitFlag(eventId); CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)sendBuff + sendBuffOffsetNum, size); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? inputUB[1] : inputUB[0], size); + CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? inputUB[0] : inputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; sendBuffOffsetNum += (size / sizeof(T)); @@ -310,12 +310,12 @@ __attribute__((always_inline)) inline __aicore__ void GM2GMPingPongNonPipeBarrie AscendC::SetFlag(EVENT_ID1); for (int64_t i = 0; dataSizeRemain > 0; i++) { size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; - event_t eventId = (i & 1) ? EVENT_ID0 ? EVENT_ID1; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; AscendC::WaitFlag(eventId); CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)sendBuff + sendBuffOffsetNum, size); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? inputUB[1] : inputUB[0], size); + CpUB2GM((__gm__ T*)receiveBuff + revBuffOffsetNum, (i & 1) ? 
inputUB[0] : inputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; sendBuffOffsetNum += offsetNumPerLoop; @@ -347,6 +347,7 @@ __attribute__((always_inline)) inline __aicore__ void input2BuffRankMagic( inputUB, UB_SINGLE_DMA_SIZE_MAX); times += 1; flag = times; + } ) -- Gitee From 9a3392e74a945f5a818fad8556672525ac10289a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 12:12:46 +0800 Subject: [PATCH 197/414] draft --- comm/lcal/src/kernels/collectives.cce | 86 ++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 8ad47d6c..e228e759 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -291,6 +291,7 @@ __attribute__((always_inline)) inline __aicore__ void GM2GMPingPong( } AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); + AscendC::PipeBarrier(); if (dataSizeRemain <= 0) { return; } @@ -346,9 +347,90 @@ __attribute__((always_inline)) inline __aicore__ void input2BuffRankMagic( (__gm__ T*)ipcReceiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); times += 1; - flag = times; + flag = times * UB_SINGLE_DMA_SIZE_MAX / DMA_SIZE_PER_FLAG + magic; + if (flag != *ctrlFlagsUB && flag > 0) { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + *ctrlFlagsUB = flag; + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; } -) + if (dataSizeRemain <= 0) { + return; + } + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + dataSizeRemain); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T*)ipcReceiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + inputUB, dataSizeRemain); + flag = CeilDiv(times * UB_SINGLE_DMA_SIZE_MAX + dataSizeRemain, DMA_SIZE_PER_FLAG) + magic; + AscendC::PipeBarrier(); + *ctrlFlagsUB = flag; + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); + AscendC::PipeBarrier(); +} + +template +__attribute__((always_inline)) inline __aicore__ void input2BuffRank( + int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *ipcReceiveBuff, int64_t revBuffOffsetNum, + __gm__ T *sendBuff, int64_t sendBuffOffsetNum, __ubuf__ int64_t* ctrlFlagsUB, __gm__ int64_t* ctrlFlagGM) +{ + int64_t times = 0; + int64_t flag = 0; + + while (dataSizeRemain >= UB_SINGLE_DMA_SIZE_MAX) { + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + UB_SINGLE_DMA_SIZE_MAX); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T*)ipcReceiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, UB_SINGLE_DMA_SIZE_MAX); + times += 1; + flag = times * UB_SINGLE_DMA_SIZE_MAX / DMA_SIZE_PER_FLAG; + if (flag != *ctrlFlagsUB && flag > 0) { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + *ctrlFlagsUB = flag; + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; + } + if (dataSizeRemain <= 0) { + return; + } + CpGM2UB(inputUB, 
(__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + dataSizeRemain); + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + dataSizeRemain); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T*)ipcReceiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + inputUB, dataSizeRemain); + flag = CeilDiv(times * UB_SINGLE_DMA_SIZE_MAX + dataSizeRemain, DMA_SIZE_PER_FLAG); + AscendC::PipeBarrier(); + *ctrlFlagsUB = flag; + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); + AscendC::PipeBarrier(); +} + + #endif \ No newline at end of file -- Gitee From 0eee08e00ae63a8d7c7503c4c14188bc8cfb438d Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 15:12:55 +0800 Subject: [PATCH 198/414] draft --- comm/lcal/src/kernels/collectives.cce | 303 +++++++++++++++++++++++++- 1 file changed, 301 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index e228e759..a371d86c 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -412,8 +412,6 @@ __attribute__((always_inline)) inline __aicore__ void input2BuffRank( if (dataSizeRemain <= 0) { return; } - CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), - dataSizeRemain); CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), dataSizeRemain); AscendC::SetFlag(EVENT_ID0); @@ -430,7 +428,308 @@ __attribute__((always_inline)) inline __aicore__ void input2BuffRank( AscendC::PipeBarrier(); } +template +__attribute__((always_inline)) inline __aicore__ void PostSyncBigData( + __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* buff[8], uint32_t rank, uint32_t rankSize, + int64_t dataOffsetNum, int64_t ipcBuffMaxNum, int64_t magic, int64_t i) +{ + if (i <= 0) { + return; + } + + const int64_t postSyncFlagIdx = MEM_DMA_UNIT_INT_NUM + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + + SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); + + for (int64_t targetNPU = 0; targetNPU < rankSize; targetNPU++) { + if (targetNPU == rank) { + continue; + } + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ T *)buff[targetNPU] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void PostSyncBigData910B2C( + __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* buff[8], uint32_t rank, uint32_t rankSize, + int64_t dataOffsetNum, int64_t ipcBuffMaxNum, int64_t magic, int64_t i, const int64_t peerRankId, + const int64_t singleNodeRankSize) +{ + if (i <= 0) { + return; + } + + const int64_t postSyncFlagIdx = MEM_DMA_UNIT_INT_NUM + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + + SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + 
postSyncFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); + + int64_t targetNPUBegin = rank < singleNodeRankSize ? 0 : singleNodeRankSize; + int64_t targetNPUEnd = rank < singleNodeRankSize ? singleNodeRankSize : rankSize; + for (int64_t targetNPU = targetNPUBegin; targetNPU < targetNPUEnd; targetNPU++) { + if (targetNPU == rank) { + continue; + } + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ T *)buff[targetNPU] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + } + const int64_t postSyncPeerFlagIdx = MEM_DMA_UNIT_INT_NUM + dataOffsetNum + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMPeer = + (__gm__ int64_t *)((__gm__ T *)buff[peerRankId] + ipcBuffMaxNum) + postSyncPeerFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMPeer, (int64_t)magic + i); + CheckFlagNew(ctrlFlagsUB, + (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + postSyncPeerFlagIdx, + (int64_t)magic + i); +} + +template +__attribute__((always_inline)) inline __aicore__ void PostSyncBigDataWriteAcrossCard( + __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* buff[8], uint32_t rank, uint32_t rankSize, + int64_t dataOffsetNum, int64_t ipcBuffMaxNum, int64_t magic, int64_t i) +{ + const int64_t postSyncFlagIdx = MEM_DMA_UNIT_INT_NUM + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + int64_t x = (rank == 0) ? 1 : 0; + if (i > 0) { + SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ T *)buff[x] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); + + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void SetAtomicOp(int op) +{ + switch (op) { + case 0: + AscendC::SetAtomicAdd(); + break; + case 1: + break; + case 2: + AscendC::SetAtomicMax(); + break; + case 3: + AscendC::SetAtomicMin(); + break; + default: + ; + } +} +template +__attribute__((always_inline)) inline __aicore__ void PostSync(__ubuf__ int64_t *ctrlFlagsUB, __gm__ T **buff, + int32_t rank, int32_t rankSize, int64_t magic) +{ + if (GetBlockIdx() == 0) { + AscendC::PipeBarrier(); + *ctrlFlagsUB = rank + magic; + AscendC::PipeBarrier(); + CpUB2GM(buff[rank] + 1, ctrlFlagsUB, sizeof(int64_t)); + + AscendC::PipeBarrier(); + + for (int64_t x = 0; x < rankSize; ++x) { + if (x == rank) { + continue; + } + CheckFlag(ctrlFlagsUB, buff[x] + 1, x + magic); + } + } +} + +template +__attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSizeRemain, __ubuf__ T *inputUB, + __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) +{ + if (dataSizeRemain <= 0) { + return; + } + AscendC::PipeBarrier(); + #ifdef __DAV_C220_VEC__ + SetAtomicOpType(op); + #endif + AscendC::PipeBarrier(); + + while (dataSizeRemain >= UB_SINGLE_ADD_SIZE_MAX) { + CpGM2UB(inputUB, (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, UB_SINGLE_ADD_SIZE_MAX); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T*)output + outputOffsetNum, inputUB, UB_SINGLE_ADD_SIZE_MAX); + AscendC::SetFlag(EVENT_ID0); + 
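/* [editor's note] SetAtomicOp above encodes the reduce operator as a small
 * integer: 0 arms atomic add, 2 atomic max, 3 atomic min, and 1 (plain copy)
 * arms nothing; every path later disarms with SetAtomicNone. A plain-C++
 * restatement of that contract (the enum names are illustrative, not from the
 * source):
 *
 *   enum ReduceOp : int { kAdd = 0, kCopy = 1, kMax = 2, kMin = 3 };
 *   // True when an atomic GM write mode must be armed before the CpUB2GM stores.
 *   inline bool NeedsAtomicMode(int op) {
 *       switch (op) {
 *           case kAdd: case kMax: case kMin: return true;
 *           case kCopy: default:             return false;
 *       }
 *   }
 */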
AscendC::WaitFlag(EVENT_ID0); + dataSizeRemain -= UB_SINGLE_ADD_SIZE_MAX; + buffOffsetNum += (UB_SINGLE_ADD_SIZE_MAX / sizeof(T)); + outputOffsetNum += (UB_SINGLE_ADD_SIZE_MAX / sizeof(T)); + } + if (dataSizeRemain <= 0) { + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::SetAtomicNone(); + AscendC::PipeBarrier(); + return; + } + + CpGM2UB(inputUB, (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, dataSizeRemain); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T*)output + outputOffsetNum, inputUB, dataSizeRemain); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::SetAtomicNone(); + AscendC::PipeBarrier(); +} + +template +__attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dataSizeRemain, __ubuf__ T *inputUB[2], + __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) +{ + if (dataSizeRemain <= 0) { + return; + } + + AscendC::PipeBarrier(); + #ifdef __DAV_C220_VEC__ + SetAtomicOpType(op); + #endif + AscendC::PipeBarrier(); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, size); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + CpUB2GM((__gm__ T*)output + outputOffsetNum, (i & 1) ? inputUB[0] : inputUB[1], size); + AscendC::SetFlag(eventId); + + dataSizeRemain -= size; + buffOffsetNum += (size / size(T)); + outputOffsetNum += (size / size(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); + AscendC::SetAtomicNone(); + AscendC::PipeBarrier(); +} + + +template +__attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(int64_t dataSizeRemain, __ubuf__ T *inputUB[2], + __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) +{ + if (dataSizeRemain <= 0) { + return; + } + + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + #ifdef __DAV_C220_VEC__ + SetAtomicOpType(op); + #endif + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int64_t i = 0; dataSizeRemain > 0; i++) { + uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; + event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; + AscendC::WaitFlag(eventId); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, size); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + CpUB2GM((__gm__ T*)output + outputOffsetNum, (i & 1) ? 
inputUB[0] : inputUB[1], size); + AscendC::SetFlag(eventId); + + dataSizeRemain -= size; + buffOffsetNum += (size / size(T)); + outputOffsetNum += (size / size(T)); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); + AscendC::SetAtomicNone(); + AscendC::PipeBarrier(); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); +} + +template +__attribute__((always_inline)) inline __aicore__ void CheckFlagGE(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *CtrlFlagGM, int64_t checkValue) +{ + while (true) { + AscendC::PipeBarrier(); + CpGM2UBAlignB16(ctrlFlagsUB, CtrlFlagGM, sizeof(int64_t)); + AscendC::PipeBarrier(); + if ((*ctrlFlagsUB >> 10) == (checkValue >> 10) && (*checkFlagsUB & 0x3FF) >= (checkValue & 0x3FF)) { + break; + } + } +} + + +template +__attribute__((always_inline)) inline __aicore__ void NewCheckFlagGE(__ubuf__ int64_t *ctrlFlagsUB, + __gm__ int64_t *CtrlFlagGM, int64_t checkValue, event_t eventId) +{ + AscendC::SetFlag(eventId); + while (true) { + AscendC::WaitFlag(eventId); + CpGM2UBAlignB16(ctrlFlagsUB, CtrlFlagGM, sizeof(int64_t)); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + if ((*ctrlFlagsUB >> 20) == (checkValue >> 20) && (*checkFlagsUB & 0xFFFFF) >= (checkValue & 0xFFFFF)) { + break; + } + AscendC::SetFlag(eventId); + } +} + +template +__attribute__((always_inline)) inline __aicore__ int64_t GetDeterministicRankOffset(int64_t x) +{ + int64_t count = 1; + while (!(x & 1)) { + x >> 1; + count <<= 1; + } + return count; +} + +template +__attribute__((always_inline)) inline __aicore__ void CopyInput2BuffBroadCast(__ubuf__ char* inputUB, __gm__ char* buff + __gm__ char* input, int64_t singleCoreDataNum, + int64_t blockDataOffset) +{ + if (singleCoreDataNum <= 0) { + return; + } + CpGM2UBAlignB16(inputUB, input + blockDataOffset, singleCoreDataNum * sizeof(char)); + AscendC::PipeBarrier(); + + CpUB2GMAlignB16((__gm__ char*)((__gm__ int64_t * )buff + GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM) + blockDataOffset, + inputUB, singleCoreDataNum * sizeof(char)); + AscendC::PipeBarrier(); +} #endif \ No newline at end of file -- Gitee From 1b4bef35a0673d3d8090f067e80f9aae91dba669 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 15:27:40 +0800 Subject: [PATCH 199/414] draft --- comm/lcal/src/kernels/collectives.cce | 35 +++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index a371d86c..45d7751b 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -401,12 +401,12 @@ __attribute__((always_inline)) inline __aicore__ void input2BuffRank( AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); *ctrlFlagsUB = flag; - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); CpUB2GM(ctrlFlagGM, ctrlFlagsUB, sizeof(int64_t)); } - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; } if (dataSizeRemain <= 0) { @@ -455,7 +455,7 @@ __attribute__((always_inline)) inline __aicore__ void PostSyncBigData( template __attribute__((always_inline)) inline __aicore__ void PostSyncBigData910B2C( - __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* buff[8], uint32_t rank, uint32_t rankSize, + __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* 
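/* [editor's note] CheckFlagGE above makes the flag layout explicit: a control
 * word is the per-round magic in the high bits plus a progress counter in the
 * low 10 bits (NewCheckFlagGE widens the counter to 20 bits), and "ready" means
 * same round and counter at least the expected value. A sketch of the
 * arithmetic, assuming the 10-bit layout and that writers publish
 * scaledMagic + count with scaledMagic = magic * 1024:
 *
 *   #include <cstdint>
 *   constexpr int kCountBits = 10;                   // 20 for NewCheckFlagGE
 *   constexpr int64_t kCountMask = (1LL << kCountBits) - 1;
 *   inline bool FlagReached(int64_t flag, int64_t expected) {
 *       return (flag >> kCountBits) == (expected >> kCountBits) &&  // same round
 *              (flag & kCountMask) >= (expected & kCountMask);      // enough progress
 *   }
 */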
buff[MAX_RANK_NUM_OF_ONE_910B2C], uint32_t rank, uint32_t rankSize, int64_t dataOffsetNum, int64_t ipcBuffMaxNum, int64_t magic, int64_t i, const int64_t peerRankId, const int64_t singleNodeRankSize) { @@ -481,10 +481,10 @@ __attribute__((always_inline)) inline __aicore__ void PostSyncBigData910B2C( } const int64_t postSyncPeerFlagIdx = MEM_DMA_UNIT_INT_NUM + dataOffsetNum + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; __gm__ int64_t* ctrlFlagsGMPeer = - (__gm__ int64_t *)((__gm__ T *)buff[peerRankId] + ipcBuffMaxNum) + postSyncPeerFlagIdx; + (__gm__ int64_t *)((__gm__ T *)buff[peerRankId] + ipcBuffMaxNum) + dataOffsetNum + postSyncPeerFlagIdx; SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMPeer, (int64_t)magic + i); CheckFlagNew(ctrlFlagsUB, - (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + postSyncPeerFlagIdx, + (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + postSyncPeerFlagIdx, (int64_t)magic + i); } @@ -498,10 +498,10 @@ __attribute__((always_inline)) inline __aicore__ void PostSyncBigDataWriteAcross if (i > 0) { SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); - __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ T *)buff[x] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ T *)buff[x] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); - __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ T *)buff[rank] + ipcBuffMaxNum) + dataOffsetNum + postSyncFlagIdx; CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); } } @@ -519,15 +519,14 @@ __attribute__((always_inline)) inline __aicore__ void SetAtomicOp(int op) AscendC::SetAtomicMax(); break; case 3: - AscendC::SetAtomicMin(); + AscendC::SetAtomicMin(); break; default: ; } } -template -__attribute__((always_inline)) inline __aicore__ void PostSync(__ubuf__ int64_t *ctrlFlagsUB, __gm__ T **buff, +__attribute__((always_inline)) inline __aicore__ void PostSync(__ubuf__ int64_t *ctrlFlagsUB, __gm__ int64_t **buff, int32_t rank, int32_t rankSize, int64_t magic) { if (GetBlockIdx() == 0) { @@ -547,7 +546,7 @@ __attribute__((always_inline)) inline __aicore__ void PostSync(__ubuf__ int64_t } } -template +template __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) { @@ -561,10 +560,10 @@ __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSi AscendC::PipeBarrier(); while (dataSizeRemain >= UB_SINGLE_ADD_SIZE_MAX) { - CpGM2UB(inputUB, (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, UB_SINGLE_ADD_SIZE_MAX); + CpGM2UB(inputUB, (__gm__ T *)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, UB_SINGLE_ADD_SIZE_MAX); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpUB2GM((__gm__ T*)output + outputOffsetNum, inputUB, UB_SINGLE_ADD_SIZE_MAX); + CpUB2GM((__gm__ T *)output + outputOffsetNum, inputUB, UB_SINGLE_ADD_SIZE_MAX); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); dataSizeRemain -= UB_SINGLE_ADD_SIZE_MAX; @@ -572,14 +571,14 @@ __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t 
dataSi outputOffsetNum += (UB_SINGLE_ADD_SIZE_MAX / sizeof(T)); } if (dataSizeRemain <= 0) { - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); AscendC::SetAtomicNone(); AscendC::PipeBarrier(); return; } - CpGM2UB(inputUB, (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, dataSizeRemain); + CpGM2UB(inputUB, (__gm__ T *)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM((__gm__ T*)output + outputOffsetNum, inputUB, dataSizeRemain); -- Gitee From beaf7251b4678349c4743e1385935fd2b89b91ce Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 15:34:44 +0800 Subject: [PATCH 200/414] draft --- comm/lcal/src/kernels/collectives.cce | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index 45d7751b..f5dcf479 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -581,14 +581,14 @@ __attribute__((always_inline)) inline __aicore__ void ProcessData(int64_t dataSi CpGM2UB(inputUB, (__gm__ T *)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - CpUB2GM((__gm__ T*)output + outputOffsetNum, inputUB, dataSizeRemain); - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + CpUB2GM((__gm__ T *)output + outputOffsetNum, (__ubuf__ T *)inputUB, dataSizeRemain); + AscendC::SetFlag(EVENT_ID3); + AscendC::WaitFlag(EVENT_ID3); AscendC::SetAtomicNone(); AscendC::PipeBarrier(); } -template +template __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dataSizeRemain, __ubuf__ T *inputUB[2], __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) { @@ -597,9 +597,9 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat } AscendC::PipeBarrier(); - #ifdef __DAV_C220_VEC__ +#ifdef __DAV_C220_VEC__ SetAtomicOpType(op); - #endif +#endif AscendC::PipeBarrier(); AscendC::SetFlag(EVENT_ID0); @@ -608,15 +608,15 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; AscendC::WaitFlag(eventId); - CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, size); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t*)buff + dataOffsetNum) + buffOffsetNum, size); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); CpUB2GM((__gm__ T*)output + outputOffsetNum, (i & 1) ? 
inputUB[0] : inputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; - buffOffsetNum += (size / size(T)); - outputOffsetNum += (size / size(T)); + buffOffsetNum += (size / sizeof(T)); + outputOffsetNum += (size / sizeof(T)); } AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); @@ -625,6 +625,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat AscendC::WaitFlag(EVENT_ID3); AscendC::SetAtomicNone(); AscendC::PipeBarrier(); + return; } @@ -638,9 +639,9 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(i AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - #ifdef __DAV_C220_VEC__ +#ifdef __DAV_C220_VEC__ SetAtomicOpType(op); - #endif +#endif AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); -- Gitee From 50f553280221efdb44e0c21905af8e8ac7adf7a8 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 15:39:45 +0800 Subject: [PATCH 201/414] draft --- comm/lcal/src/kernels/collectives.cce | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index f5dcf479..e2feea2f 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -629,7 +629,7 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNew(int64_t dat } -template +template __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(int64_t dataSizeRemain, __ubuf__ T *inputUB[2], __gm__ T *buff, int64_t dataOffsetNum, int64_t buffOffsetNum, __gm__ T *output, int64_t outputOffsetNum, int op) { @@ -651,15 +651,15 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(i uint32_t size = dataSizeRemain > UB_SINGLE_PING_PONG_ADD_SIZE_MAX ? UB_SINGLE_PING_PONG_ADD_SIZE_MAX : dataSizeRemain; event_t eventId = (i & 1) ? EVENT_ID0 : EVENT_ID1; AscendC::WaitFlag(eventId); - CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t *)buff + dataOffsetNum) + buffOffsetNum, size); + CpGM2UB((i & 1) ? inputUB[0] : inputUB[1], (__gm__ T*)((__gm__ int64_t*)buff + dataOffsetNum) + buffOffsetNum, size); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); CpUB2GM((__gm__ T*)output + outputOffsetNum, (i & 1) ? 
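/* [editor's note] The size / size(T) -> size / sizeof(T) fixes in these hunks
 * are not cosmetic: size is a byte count while buffOffsetNum/outputOffsetNum
 * are element offsets, and size(T) is not valid C++ at all. The conversion,
 * restated as a helper (a sketch; the name is mine):
 *
 *   #include <cstdint>
 *   template <typename T>
 *   inline int64_t BytesToElems(int64_t bytes) {
 *       return bytes / static_cast<int64_t>(sizeof(T));  // offsets advance in elements
 *   }
 *   // e.g. buffOffsetNum += BytesToElems<T>(size);
 */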
inputUB[0] : inputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; - buffOffsetNum += (size / size(T)); - outputOffsetNum += (size / size(T)); + buffOffsetNum += (size / sizeof(T)); + outputOffsetNum += (size / sizeof(T)); } AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); @@ -667,34 +667,31 @@ __attribute__((always_inline)) inline __aicore__ void ProcessDataNewNonBarrier(i AscendC::SetFlag(EVENT_ID3); AscendC::WaitFlag(EVENT_ID3); AscendC::SetAtomicNone(); - AscendC::PipeBarrier(); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); + return; } -template __attribute__((always_inline)) inline __aicore__ void CheckFlagGE(__ubuf__ int64_t *ctrlFlagsUB, - __gm__ int64_t *CtrlFlagGM, int64_t checkValue) + __gm__ int64_t *ctrlFlagGM, int64_t checkValue) { while (true) { AscendC::PipeBarrier(); - CpGM2UBAlignB16(ctrlFlagsUB, CtrlFlagGM, sizeof(int64_t)); + CpGM2UBAlignB16(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); AscendC::PipeBarrier(); - if ((*ctrlFlagsUB >> 10) == (checkValue >> 10) && (*checkFlagsUB & 0x3FF) >= (checkValue & 0x3FF)) { + if ((*ctrlFlagsUB >> 10) == (checkValue >> 10) && (*ctrlFlagsUB & 0x3FF) >= (checkValue & 0x3FF)) { break; } } } - -template __attribute__((always_inline)) inline __aicore__ void NewCheckFlagGE(__ubuf__ int64_t *ctrlFlagsUB, - __gm__ int64_t *CtrlFlagGM, int64_t checkValue, event_t eventId) + __gm__ int64_t *ctrlFlagGM, int64_t checkValue, event_t eventId) { AscendC::SetFlag(eventId); while (true) { AscendC::WaitFlag(eventId); - CpGM2UBAlignB16(ctrlFlagsUB, CtrlFlagGM, sizeof(int64_t)); + CpGM2UBAlignB16(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); if ((*ctrlFlagsUB >> 20) == (checkValue >> 20) && (*checkFlagsUB & 0xFFFFF) >= (checkValue & 0xFFFFF)) { -- Gitee From 3d0b1019097800242e8f5af494fbf6e9323940dd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 15:41:46 +0800 Subject: [PATCH 202/414] draft --- comm/lcal/src/kernels/collectives.cce | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/comm/lcal/src/kernels/collectives.cce b/comm/lcal/src/kernels/collectives.cce index e2feea2f..401a7c84 100644 --- a/comm/lcal/src/kernels/collectives.cce +++ b/comm/lcal/src/kernels/collectives.cce @@ -688,32 +688,29 @@ __attribute__((always_inline)) inline __aicore__ void CheckFlagGE(__ubuf__ int64 __attribute__((always_inline)) inline __aicore__ void NewCheckFlagGE(__ubuf__ int64_t *ctrlFlagsUB, __gm__ int64_t *ctrlFlagGM, int64_t checkValue, event_t eventId) { - AscendC::SetFlag(eventId); + AscendC::SetFlag(eventId); while (true) { - AscendC::WaitFlag(eventId); + AscendC::WaitFlag(eventId); CpGM2UBAlignB16(ctrlFlagsUB, ctrlFlagGM, sizeof(int64_t)); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - if ((*ctrlFlagsUB >> 20) == (checkValue >> 20) && (*checkFlagsUB & 0xFFFFF) >= (checkValue & 0xFFFFF)) { + if ((*ctrlFlagsUB >> 20) == (checkValue >> 20) && (*ctrlFlagsUB & 0xFFFFF) >= (checkValue & 0xFFFFF)) { break; } - AscendC::SetFlag(eventId); + AscendC::SetFlag(eventId); } } -template -__attribute__((always_inline)) inline __aicore__ int64_t GetDeterministicRankOffset(int64_t x) -{ +__attribute__((always_inline)) inline __aicore__ int64_t GetDeterministicRankOffset(int64_t x) { int64_t count = 1; while (!(x & 1)) { - x >> 1; + x >>= 1; count <<= 1; } return count; } -template -__attribute__((always_inline)) inline __aicore__ void CopyInput2BuffBroadCast(__ubuf__ char* inputUB, __gm__ char* buff +__attribute__((always_inline)) 
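/* [editor's note] The x >> 1 -> x >>= 1 change above is a real fix: without the
 * assignment the loop never terminates for even x. GetDeterministicRankOffset
 * returns the lowest set bit of x (the stride of the deterministic reduction
 * order), which can also be computed without a loop; a plain-C++ sketch:
 *
 *   #include <cstdint>
 *   inline int64_t LowestSetBit(int64_t x) {
 *       return x & -x;   // for x > 0, equals the loop's result
 *   }
 *   // e.g. x = 12 (0b1100): the loop runs 12 -> 6 -> 3 with count 1 -> 2 -> 4,
 *   // and 12 & -12 == 4.
 */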
inline __aicore__ void CopyInput2BuffBroadCast(__ubuf__ char* inputUB, __gm__ char* buff, __gm__ char* input, int64_t singleCoreDataNum, int64_t blockDataOffset) { -- Gitee From b0bec0ad5573137d4b90c0ff2cd115dc008b40be Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 16:32:03 +0800 Subject: [PATCH 203/414] draft --- .../lca_allgather_2npu_big_data_write.cce | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce diff --git a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce new file mode 100644 index 00000000..c22f341c --- /dev/null +++ b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllGather2npuBigDataWriteOrigin( + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, + uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t flagOffset2nd, + int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx) +{ + const int64_t dataBlockAllNum = len * sizeof(T) / MEM_DMA_UNIT_BYTE; + const int64_t singleCoreDataBlockNum = dataBlockAllNum / blockNumPerGroup; + const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T); + const int64_t buffDataDMAOffsetNum = coreSegmentedIdx * singleCoreDataNum; + + __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *sendBuff = input; + int64_t dataSizeRemain = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE; + if (coreSegmentedIdx == blockNumPerGroup - 1) { + dataSizeRemain = (len - singleCoreDataNum * coreSegmentedIdx) * sizeof(T); + } + if (dataSizeRemain <= 0) { + return; + } + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st; + if (GetBlockIdx() < blockNumPerGroup) { + input2BuffRankMagic( + dataSizeRemain, inputUB[0], receiveBuff, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum, + ctrlFlagsUB, ctrlFlagsGMX, magic); + return; + } + CM2GMPingPong(dataSizeRemain, inputUB, output + allLen * rank + processedNum, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum); + + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + int64_t alldataSizeNeed2Add = dataSizeRemain; + AscendC::PipeBarrier(); + while (true) { + if (*ctrlFlagsUB >= CeilDiv(alldataSizeNeed2Add, DMA_SIZE_PER_FLAG)) { + break; + } + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t)); + AscendC::PipeBarrier(); + + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) { + continue; + } + int64_t 
preparedDataGroupCount = (*ctrlFlagsUB1 - magic); + if (preparedDataGroupCount <= 0 || *ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > alldataSizeNeed2Add) { + dataSizeRemain = alldataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + + GM2GMPingPong(dataSizeRemain, inputUB, output + allLen * x + processedNum, + buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), + (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum), + buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T)); + AscendC::PipeBarrier(); + + *ctrlFlagsUB = preparedDataGroupCount; + AscendC::PipeBarrier(); + } + SetFlag(ctrlFlagsUB, ctrlFlagsGM, 0); +} + +template +inline __aicore__ void LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + magic *= 1024; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; + + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t corePerRank = blockNumPerGroup; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + int64_t x = (rank == 0 ? 1 : 0); + if (GetBlockIdx() >= blockNumPerGroup) { + flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; + } + int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st; + + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNum); i++) { + *ctrlFlagsUB = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcBuffMaxNum; + int64_t remainLen = (len - processedNum < ipcBuffMaxNum) ? 
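/* [editor's note] The while (true) loop above is the consumer half of a
 * flag-driven pipeline: the producer bumps its control flag once per
 * DMA_SIZE_PER_FLAG bytes staged into the IPC buffer, and the consumer copies
 * only the groups that became ready since its last pass, clamping the final
 * short group. The size arithmetic, restated (a sketch; names are mine):
 *
 *   #include <cstdint>
 *   // prepared/done are flag counts; returns bytes to copy in this pass.
 *   inline int64_t ReadyBytes(int64_t prepared, int64_t done,
 *                             int64_t totalBytes, int64_t bytesPerFlag) {
 *       int64_t bytes = (prepared - done) * bytesPerFlag;
 *       if (prepared * bytesPerFlag > totalBytes) {      // last group may be short
 *           bytes = totalBytes - done * bytesPerFlag;
 *       }
 *       return bytes;
 *   }
 */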
len - processedNum : ipcBuffMaxNum; + + PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + LcalAllGather2npuBigDataWriteOrigin( + buff, input + processedNum, output, processedNum, blockNumPerGroup, rank, rankSize, len, remainLen, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, + ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); +} \ No newline at end of file -- Gitee From b6b77635034d4a6d5a323a4215863a96819d0adc Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 16:35:22 +0800 Subject: [PATCH 204/414] draft --- .../lca_allgather_2npu_big_data_write.cce | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce index c22f341c..6021bb30 100644 --- a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce +++ b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce @@ -40,14 +40,14 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather2npuBigDataWr ctrlFlagsUB, ctrlFlagsGMX, magic); return; } - CM2GMPingPong(dataSizeRemain, inputUB, output + allLen * rank + processedNum, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum); + GM2GMPingPong(dataSizeRemain, inputUB, output + allLen * rank + processedNum, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum); *ctrlFlagsUB = 0; *ctrlFlagsUB1 = 0; - int64_t alldataSizeNeed2Add = dataSizeRemain; + int64_t allDataSizeNeed2Add = dataSizeRemain; AscendC::PipeBarrier(); while (true) { - if (*ctrlFlagsUB >= CeilDiv(alldataSizeNeed2Add, DMA_SIZE_PER_FLAG)) { + if (*ctrlFlagsUB >= CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG)) { break; } @@ -63,13 +63,13 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather2npuBigDataWr } dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; - if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > alldataSizeNeed2Add) { - dataSizeRemain = alldataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; } GM2GMPingPong(dataSizeRemain, inputUB, output + allLen * x + processedNum, buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), - (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum), + (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T)); AscendC::PipeBarrier(); @@ -112,11 +112,11 @@ inline __aicore__ void LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T)) AscendC::PipeBarrier(); int64_t processedNum = i * ipcBuffMaxNum; - int64_t remainLen = (len - processedNum < ipcBuffMaxNum) ? len - processedNum : ipcBuffMaxNum; + int64_t remainNum = (len - processedNum < ipcBuffMaxNum) ? 
len - processedNum : ipcBuffMaxNum; PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); LcalAllGather2npuBigDataWriteOrigin( - buff, input + processedNum, output, processedNum, blockNumPerGroup, rank, rankSize, len, remainLen, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, + buff, input + processedNum, output, processedNum, blockNumPerGroup, rank, rankSize, len, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx); } DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); -- Gitee From dc6a01f30f3bacbea0b9d6374bda245523983791 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 16:36:30 +0800 Subject: [PATCH 205/414] draft --- comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce index 6021bb30..073966d2 100644 --- a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce +++ b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce @@ -22,7 +22,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather2npuBigDataWr const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T); const int64_t buffDataDMAOffsetNum = coreSegmentedIdx * singleCoreDataNum; - __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); __gm__ T *sendBuff = input; int64_t dataSizeRemain = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE; if (coreSegmentedIdx == blockNumPerGroup - 1) { @@ -94,7 +94,7 @@ inline __aicore__ void LcalAllGather2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T)) __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; - + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; int64_t corePerRank = blockNumPerGroup; int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; -- Gitee From 952c0ac29aaf915e86c2bf6bb5459561698beb7d Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 16:37:24 +0800 Subject: [PATCH 206/414] draft --- comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce index 073966d2..39945f1e 100644 --- a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce +++ b/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce @@ -22,7 +22,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather2npuBigDataWr const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T); const int64_t buffDataDMAOffsetNum = coreSegmentedIdx * singleCoreDataNum; - __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); __gm__ T *sendBuff = input; int64_t dataSizeRemain = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE; if (coreSegmentedIdx == blockNumPerGroup - 1) { -- Gitee From e144407fd7b98b9cbcf9fd4a127d27dc019d9126 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 16:55:48 +0800 Subject: [PATCH 207/414] draft --- 
comm/lcal/src/kernels/lca_allgather_2npu.cce | 59 ++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 comm/lcal/src/kernels/lca_allgather_2npu.cce diff --git a/comm/lcal/src/kernels/lca_allgather_2npu.cce b/comm/lcal/src/kernels/lca_allgather_2npu.cce new file mode 100644 index 00000000..472f7979 --- /dev/null +++ b/comm/lcal/src/kernels/lca_allgather_2npu.cce @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "collectives.cce" + +template +inline __aicore__ void LcalAllGather2npu(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + const int64_t x = GetBlockIdx() / corePerRank; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + int64_t flagOffset1st = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_BYTE; + int64_t flagOffset2nd = (x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_BYTE; + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(32); + *ctrlFlagsUB22 = 0; + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + __gm__ T *sendBuff = input; + int64_t dataNumRemain = len / corePerRank; + int64_t sendBuffOffsetNum = coreSegmentedIdx * dataNumRemain; + int64_t buffOffsetNum = sendBuffOffsetNum + rank * len; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumRemain = len - dataNumRemain * coreSegmentedIdx; + } + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + GM2GM(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, buffOffsetNum, sendBuff, sendBuffOffsetNum); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[x] + flagOffset1st; + AscendC::PipeBarrier(); + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + AscendC::PipeBarrier(); + + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*) buff[rank] + flagOffset2nd; + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic); + + buffOffsetNum = sendBuffOffsetNum + x * len; + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + GM2GM(dataNumRemain * sizeof(T), inputUB[0], (__gm__ T*)output, buffOffsetNum, sendBuff, buffOffsetNum); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); +} \ No newline at end of file -- Gitee From c537b7fced0ce4d2423b02088a2b7cf3b341da72 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:10:27 +0800 Subject: [PATCH 208/414] draft --- comm/lcal/src/kernels/lca_allgather_2npu.cce | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/comm/lcal/src/kernels/lca_allgather_2npu.cce b/comm/lcal/src/kernels/lca_allgather_2npu.cce index 472f7979..713fe320 100644 --- a/comm/lcal/src/kernels/lca_allgather_2npu.cce +++ b/comm/lcal/src/kernels/lca_allgather_2npu.cce @@ -17,18 +17,18 @@ inline __aicore__ void LcalAllGather2npu(ALLREDUCE_ARGS_FUN_16P(T)) const int64_t corePerRank = GetLcalBlockNum() / rankSize; const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; const int64_t x = GetBlockIdx() / corePerRank; - + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; - int64_t flagOffset1st = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_BYTE; - int64_t flagOffset2nd = (x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_BYTE; + const int64_t flagOffset1st = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset2nd = (x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; __gm__ T* buff[8] = { buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7 }; __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(32); - *ctrlFlagsUB22 = 0; - __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + *ctrlFlagsUB2 = 0; + __ubuf__ T* inputUB[2] = { (__ubuf__ T*)(64), (__ubuf__ T*)(97312) }; __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); __gm__ T *sendBuff = input; @@ -47,7 +47,7 @@ inline __aicore__ void LcalAllGather2npu(ALLREDUCE_ARGS_FUN_16P(T)) SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); AscendC::PipeBarrier(); - __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*) buff[rank] + flagOffset2nd; + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + flagOffset2nd; CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic); buffOffsetNum = sendBuffOffsetNum + x * len; -- Gitee From ffd27b1cbdc4959811ea31f48e68ca0f3f0a52df Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:27:18 +0800 Subject: [PATCH 209/414] draft --- .../lcal/src/kernels/lca_allgather_910B2C.cce | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 comm/lcal/src/kernels/lca_allgather_910B2C.cce diff --git a/comm/lcal/src/kernels/lca_allgather_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_910B2C.cce new file mode 100644 index 00000000..79634536 --- /dev/null +++ b/comm/lcal/src/kernels/lca_allgather_910B2C.cce @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "collectives.cce" + +template +inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + + const int64_t singleNodeRankSize = rankSize >> 1; + if (GetBlockIdx() >= singleNodeRankSize + 2) { + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + return; + } + const int64_t localNodeRankId = rank > singleNodeRankSize ? rank - singleNodeRankSize : rank; + const int64_t nodeId = rank < singleNodeRankSize ? 
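/* [editor's note] The two-NPU allgather above is a push-then-pull handshake:
 * each rank copies its slice into the peer's IPC buffer, publishes a per-core
 * flag carrying this round's magic, then spins on the mirror flag before
 * copying the received slice to output. The same handshake in miniature,
 * assuming std::atomic in place of the SetFlag/CheckFlag GM accesses:
 *
 *   #include <atomic>
 *   #include <cstdint>
 *   inline void Handshake(std::atomic<int64_t> &mine, std::atomic<int64_t> &peer,
 *                         int64_t magic) {
 *       mine.store(magic, std::memory_order_release);             // SetFlag
 *       while (peer.load(std::memory_order_acquire) != magic) {}  // CheckFlag
 *   }
 */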
0 : 1; + + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t corePerRank = 1; + + __gm__ T* buff[16] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + int64_t dataSizeRemain = len * sizeof(T); + + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + + if (GetBlockIdx() >= singleNodeRankSize || GetBlockIdx() >= 8) { + int coreStep0Idx = 0; + if (GetBlockIdx() == 9 || GetBlockIdx() == singleNodeRankSize + 1) { + coreStep0Idx = 1; + } + int64_t sendBuffOffsetNum = 0; + int64_t revBuffOffsetNum = 0; + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (nodeId + singleNodeRankSize) * MEM_DMA_UNIT_INT_NUM; + if ((rank < singleNodeRankSize && coreStep0Idx == 1) || + (rank > singleNodeRankSize && coreStep0Idx == 0)) { + receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); + ctrlFlagsGM = (__gm__ int64_t*)buff[peerRankId] + (nodeId + singleNodeRankSize) * MEM_DMA_UNIT_INT_NUM; + } + if (rank > singleNodeRankSize) { + revBuffOffsetNum = len; + } + + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, input, sendBuffOffsetNum); + + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + return; + } + + int64_t x = rank < singleNodeRankSize ? 
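/* [editor's note] The 910B2C kernels split rankSize evenly across two nodes and
 * pair every rank with its mirror on the other node; note the follow-up patch
 * corrects rank > singleNodeRankSize to rank >= singleNodeRankSize. The
 * mapping, restated as standalone helpers (a sketch, not source API):
 *
 *   #include <cstdint>
 *   inline int64_t NodeId(int64_t rank, int64_t half)    { return rank < half ? 0 : 1; }
 *   inline int64_t LocalRank(int64_t rank, int64_t half) { return rank < half ? rank : rank - half; }
 *   inline int64_t PeerRank(int64_t rank, int64_t half)  { return rank < half ? rank + half : rank - half; }
 *   // half = rankSize >> 1; e.g. rankSize 16: rank 3 pairs with 11, both local id 3.
 */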
GetBlockIdx() : GetBlockIdx() + singleNodeRankSize; + __gm__ T *receiveBuff = output; + __gm__ T *sendBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); + + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM, magic); + int64_t revBuffOffsetNum = GetBlockIdx() * len; + int64_t sendBuffOffsetNum = 0; + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM, magic); + revBuffOffsetNum = (singleNodeRankSize + GetBlockIdx()) * len; + sendBuffOffsetNum = len; + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); +} \ No newline at end of file -- Gitee From bb9e63b06803982ffb26ded6db85f706524404d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:30:15 +0800 Subject: [PATCH 210/414] draft --- .../lcal/src/kernels/lca_allgather_910B2C.cce | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_910B2C.cce index 79634536..c583fa3d 100644 --- a/comm/lcal/src/kernels/lca_allgather_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_910B2C.cce @@ -11,7 +11,7 @@ #include "collectives.cce" template -inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) +__attribute__((always_inline)) inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) { DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); @@ -20,11 +20,11 @@ inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); return; } - const int64_t localNodeRankId = rank > singleNodeRankSize ? rank - singleNodeRankSize : rank; + const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; const int64_t nodeId = rank < singleNodeRankSize ? 0 : 1; - const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; - + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; @@ -54,11 +54,11 @@ inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (nodeId + singleNodeRankSize) * MEM_DMA_UNIT_INT_NUM; if ((rank < singleNodeRankSize && coreStep0Idx == 1) || - (rank > singleNodeRankSize && coreStep0Idx == 0)) { + (rank >= singleNodeRankSize && coreStep0Idx == 0)) { receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); ctrlFlagsGM = (__gm__ int64_t*)buff[peerRankId] + (nodeId + singleNodeRankSize) * MEM_DMA_UNIT_INT_NUM; } - if (rank > singleNodeRankSize) { + if (rank >= singleNodeRankSize) { revBuffOffsetNum = len; } @@ -71,16 +71,16 @@ inline __aicore__ void LcalAllGather910B2C(ALLREDUCE_ARGS_FUN_16P(T)) int64_t x = rank < singleNodeRankSize ? 
GetBlockIdx() : GetBlockIdx() + singleNodeRankSize; __gm__ T *receiveBuff = output; - __gm__ T *sendBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); - CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM, magic); int64_t revBuffOffsetNum = GetBlockIdx() * len; int64_t sendBuffOffsetNum = 0; GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); - CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM, magic); revBuffOffsetNum = (singleNodeRankSize + GetBlockIdx()) * len; sendBuffOffsetNum = len; GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); -} \ No newline at end of file +} -- Gitee From 01d2aee708afba965a594b3e2fb344c6363d8137 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:30:47 +0800 Subject: [PATCH 211/414] draft --- comm/lcal/src/kernels/lca_allgather_910B2C.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lca_allgather_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_910B2C.cce index c583fa3d..ea4b8838 100644 --- a/comm/lcal/src/kernels/lca_allgather_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_910B2C.cce @@ -71,7 +71,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather910B2C(ALLRED int64_t x = rank < singleNodeRankSize ? GetBlockIdx() : GetBlockIdx() + singleNodeRankSize; __gm__ T *receiveBuff = output; - __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM, magic); int64_t revBuffOffsetNum = GetBlockIdx() * len; -- Gitee From 2ee229b950fca91bb0008647bea25dc154b9184f Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:49:29 +0800 Subject: [PATCH 212/414] draft --- .../lca_allgather__big_data_910B2C.cce | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce new file mode 100644 index 00000000..8e719a9b --- /dev/null +++ b/comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void CheckThenDMAGM2GM(__ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, + __gm__ int64_t *ctrlFlagsGMStep1ToCheck, int64_t newMagic, int64_t allDataSizeNeedDMA, + int64_t revBuffOffsetNumOrigin, int64_t processedDataNum, __gm__ T *sendBuff, __gm__ T *revBuff, + int64_t sendBuffOffsetNumOrigin, __ubuf__ T* inputUB[2], int64_t &processedDataGroupCount, int64_t multipleTimes) +{ + PipeBarrier(); + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMStep1ToCheck, sizeof(int64_t)); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + if (*ctrlFlagsUB1 == 0 || ((*ctrlFlagsUB1 >> 10) != (newMagic >> 10))) { + return; + } + + int64_t prepareDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (processedDataGroupCount >= prepareDataGroupCount) { + return; + } + + int64_t curDataSizeRemain = (prepareDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG; + if (prepareDataGroupCount >= multipleTimes) { + curDataSizeRemain = allDataSizeNeedDMA - processedDataGroupCount * DMA_SIZE_PER_FLAG; + } + PipeBarrier(); + GM2GMPingPongNonPipeBarrier(curDataSizeRemain, inputUB, revBuff, + revBuffOffsetNumOrigin + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, sendBuffOffsetNumOrigin + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); + processedDataGroupCount = prepareDataGroupCount; + PipeBarrier(); +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + magic *= 1024; + + const int64_t singleNodeRankSize = rankSize >> 1; + + const int64_t allGatherBuffSizePerParagraph910B2C = IPC_BUFF_MAX_SIZE / 2 / sizeof(T) * sizeof(T); + + const int64_t allGatherBuffNumPerParagraph910B2C = allGatherBuffSizePerParagraph910B2C / sizeof(T); + + if (GetBlockIdx() >= singleNodeRankSize + 2) { + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + return; + } + const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; + const int64_t nodeId = rank < singleNodeRankSize ? 0 : 1; + + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t corePerRank = 1; + + __gm__ T* buff[16] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ int64_t* ctrlFlagsUB3 = (__ubuf__ int64_t*)(96); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(128), (__ubuf__ T*)(98336)}; + int64_t revBuffOffsetNumOrigin = nodeId * allGatherBuffNumPerParagraph910B2C; + int64_t processedDataNum = 0; + int64_t totalDataSizeRemain = len * sizeof(T); + const int64_t totalLoopTimes = CeilDiv(totalDataSizeRemain, allGatherBuffSizePerParagraph910B2C); + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + for (int i = 0; i < totalLoopTimes; i++) { + int64_t newMagic = (magic + i + 1) * 1024; + int64_t dataSizeRemain = (i == totalLoopTimes - 1) ? 
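/* [editor's note] The big-data path streams len elements through the fixed IPC
 * buffer in paragraphs: totalLoopTimes = CeilDiv(total bytes, paragraph bytes),
 * and every pass derives a fresh magic so a stale flag from the previous pass
 * can never satisfy a check. The per-pass sizing, restated (a sketch; names
 * are mine):
 *
 *   #include <cstdint>
 *   inline int64_t CeilDivI64(int64_t a, int64_t b) { return (a + b - 1) / b; }
 *   inline int64_t PassBytes(int64_t pass, int64_t totalBytes, int64_t paraBytes) {
 *       int64_t loops = CeilDivI64(totalBytes, paraBytes);
 *       return pass == loops - 1 ? totalBytes - pass * paraBytes : paraBytes;
 *   }
 */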
+ (totalDataSizeRemain - processedDataNum * sizeof(T)) : allGatherBuffSizePerParagraph910B2C; + int64_t dataNumRemain = dataSizeRemain / sizeof(T); + if (GetBlockIdx() >= singleNodeRankSize) { + + } + } + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); +} -- Gitee From 66c59a72b9867d827330545aa1a71d64b7fbc3e9 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:50:20 +0800 Subject: [PATCH 213/414] draft --- ...her__big_data_910B2C.cce => lca_allgather_big_data_910B2C.cce} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/kernels/{lca_allgather__big_data_910B2C.cce => lca_allgather_big_data_910B2C.cce} (100%) diff --git a/comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather__big_data_910B2C.cce rename to comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce -- Gitee From 563177c9018f340e6bc59fbae000a801048ba6c7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:50:43 +0800 Subject: [PATCH 214/414] draft --- comm/lcal/src/kernels/lca_allgather_big_data.cce | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 comm/lcal/src/kernels/lca_allgather_big_data.cce diff --git a/comm/lcal/src/kernels/lca_allgather_big_data.cce b/comm/lcal/src/kernels/lca_allgather_big_data.cce new file mode 100644 index 00000000..cd9274ad --- /dev/null +++ b/comm/lcal/src/kernels/lca_allgather_big_data.cce @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
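(This file starts out as a skeleton; PATCH 223 through PATCH 227 below grow it into the single-node big-data allgather, which walks the input in IPC-buffer-sized slices via `CeilDiv(dataLen, ipcBuffMaxNum)`. A minimal C++ sketch of that slicing pattern follows; the constants are examples, not the real buffer size.)

#include <cstdint>
#include <cstdio>

inline int64_t CeilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
    const int64_t ipcBuffMaxNum = 4096;   // elements per IPC slice (example)
    const int64_t dataLen = 10000;        // total element count (example)
    for (int64_t i = 0; i < CeilDiv(dataLen, ipcBuffMaxNum); i++) {
        int64_t processedNum = i * ipcBuffMaxNum;
        int64_t remainNum = (dataLen - processedNum < ipcBuffMaxNum)
                                ? dataLen - processedNum : ipcBuffMaxNum;
        printf("slice %lld: offset %lld, count %lld\n",
               (long long)i, (long long)processedNum, (long long)remainNum);
    }
    return 0;
}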
+ */ +#include +#include "collectives.cce" -- Gitee From 34f198216a6e89dbaa7f68d660e38812f77d7055 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:51:05 +0800 Subject: [PATCH 215/414] draft --- comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce | 1 - 1 file changed, 1 deletion(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index 8e719a9b..90e601a7 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -95,5 +95,4 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C } } - DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); } -- Gitee From d85c4cbf1392577a87b6575aa701b23b994fac94 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 17:53:39 +0800 Subject: [PATCH 216/414] draft --- comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index 90e601a7..3f271213 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -91,7 +91,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C (totalDataSizeRemain - processedDataNum * sizeof(T)) : allGatherBuffSizePerParagraph910B2C; int64_t dataNumRemain = dataSizeRemain / sizeof(T); if (GetBlockIdx() >= singleNodeRankSize) { - + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ int64_t *ctrlFlagsGMStep0ToSet =(__gm__ int64_t*)buff[rank] + flagOffset1st; } } -- Gitee From fc33d5468b435bcc2f9027e7b16a2f0be083a85c Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 20:50:58 +0800 Subject: [PATCH 217/414] draft --- .../kernels/lca_allgather_big_data_910B2C.cce | 93 ++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index 3f271213..bd45070f 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -92,7 +92,98 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C int64_t dataNumRemain = dataSizeRemain / sizeof(T); if (GetBlockIdx() >= singleNodeRankSize) { __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - __gm__ int64_t *ctrlFlagsGMStep0ToSet =(__gm__ int64_t*)buff[rank] + flagOffset1st; + __gm__ int64_t *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[rank] + flagOffset1st; + if ((nodeId == 0) && GetBlockIdx() == singleNodeRankSize + 1) { + *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; + } + if ((nodeId == 1) && GetBlockIdx() == singleNodeRankSize) { + *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; + } + if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1) || + (nodeId == 1 && GetBlockIdx() == singleNodeRankSize)) { + receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); + } + + input2BuffRankMagic(dataNumRemain, inputUB[0], receiveBuff, revBuffOffsetNumOrigin, input, + processedDataNum, ctrlFlagsUB, ctrlFlagsGMStep0ToSet, newMagic); + if (i < 
totalLoopTimes - 1) { + if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize)) { + int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; + for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { + CheckFlag(ctrlFlagsUB, + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); + } + } + if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize + 1)) { + int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; + for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { + CheckFlag(ctrlFlagsUB, + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); + } + } + if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1)) { + for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { + CheckFlag(ctrlFlagsUB, + (__gm__ int64_t*)buff[checkLogicRank] + (GetLcalBlockNum() + localNodeRankId) * MEM_DMA_UNIT_INT_NUM, + newMagic); + } + } + if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize)) { + for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { + CheckFlag(ctrlFlagsUB, + (__gm__ int64_t*)buff[checkLogicRank + singleNodeRankSize] + localNodeRankId * MEM_DMA_UNIT_INT_NUM, + newMagic); + } + } + } + } else { + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + *ctrlFlagsUB3 = 0; + __gm__ int64_t *ctrlFlagsGMStep1ToCheck1st = (__gm__ int64_t*)buff[GetBlockIdx()] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagsGMStep1ToSet1st = (__gm__ int64_t*)buff[GetBlockIdx()] + localNodeRankId * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagsGMStep1ToCheck2nd = (__gm__ int64_t*)buff[GetBlockIdx()] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagsGMStep1ToSet2nd = (__gm__ int64_t*)buff[peerRankId] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[GetBlockIdx()] + dataOffsetNum); + if (nodeId == 1) { + ctrlFlagsGMStep1ToCheck1st = (__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; + ctrlFlagsGMStep1ToSet1st = (__gm__ int64_t*)buff[peerRankId] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagsGMStep1ToCheck2nd = (__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagsGMStep1ToSet2nd = (__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + (localNodeRankId + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM; + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + dataOffsetNum); + } + int64_t revBuffOffsetNumOrigin1st = GetBlockIdx() * len + processedDataNum; + int64_t revBuffOffsetNumOrigin2nd = (GetBlockIdx() + singleNodeRankSize) * len + processedDataNum; + int64_t allDataSizeNeedDMA = dataSizeRemain; + int64_t multipleTimes = CeilDiv(allDataSizeNeedDMA, DMA_SIZE_PER_FLAG); + bool step1NeedSetFirst = true; + bool step1NeedSetSecond = true; + + int64_t processedDataGroupCount1st = 0; + int64_t processedDataGroupCount2nd = 0; + PipeBarrier(); + while (true) { + if (processedDataGroupCount1st < multipleTimes) { + CheckThenDMAGM2GM(ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsGMStep1ToCheck1st, newMagic, dataSizeRemain, + revBuffOffsetNumOrigin1st, processedDataNum, sendBuff, output, 0, inputUB, + processedDataGroupCount1st, multipleTimes); + } else if (step1NeedSetFirst) { + if (i < 
totalLoopTimes - 1) { + SetFlag(ctrlFlagsUB1, ctrlFlagsGMStep1ToSet1st, newMagic); + } + step1NeedSetFirst = false; + } + } + + if (processedDataGroupCount2nd < multipleTimes) { + CheckThenDMAGM2GM(ctrlFlagsUB2, ctrlFlagsUB3, ctrlFlagsGMStep1ToCheck2nd, newMagic, dataSizeRemain, + revBuffOffsetNumOrigin2nd, processedDataNum, sendBuff, output, allGatherBuffNumPerParagraph910B2C, inputUB, + processedDataGroupCount2nd, multipleTimes); + } + } } -- Gitee From d8b706f653159de9e3473a0ae1329eb40d205dde Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 20:54:01 +0800 Subject: [PATCH 218/414] draft --- .../kernels/lca_allgather_big_data_910B2C.cce | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index bd45070f..cec733d5 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -18,27 +18,27 @@ __attribute__((always_inline)) inline __aicore__ void CheckThenDMAGM2GM(__ubuf__ { PipeBarrier(); CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMStep1ToCheck, sizeof(int64_t)); - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); if (*ctrlFlagsUB1 == 0 || ((*ctrlFlagsUB1 >> 10) != (newMagic >> 10))) { return; } - int64_t prepareDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); - if (processedDataGroupCount >= prepareDataGroupCount) { + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (processedDataGroupCount >= preparedDataGroupCount) { return; } - int64_t curDataSizeRemain = (prepareDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG; - if (prepareDataGroupCount >= multipleTimes) { + int64_t curDataSizeRemain = (preparedDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount >= multipleTimes) { curDataSizeRemain = allDataSizeNeedDMA - processedDataGroupCount * DMA_SIZE_PER_FLAG; } PipeBarrier(); GM2GMPingPongNonPipeBarrier(curDataSizeRemain, inputUB, revBuff, revBuffOffsetNumOrigin + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), sendBuff, sendBuffOffsetNumOrigin + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); - processedDataGroupCount = prepareDataGroupCount; + processedDataGroupCount = preparedDataGroupCount; PipeBarrier(); } @@ -182,6 +182,15 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C CheckThenDMAGM2GM(ctrlFlagsUB2, ctrlFlagsUB3, ctrlFlagsGMStep1ToCheck2nd, newMagic, dataSizeRemain, revBuffOffsetNumOrigin2nd, processedDataNum, sendBuff, output, allGatherBuffNumPerParagraph910B2C, inputUB, processedDataGroupCount2nd, multipleTimes); + } else if (step1NeedSetSecond) { + if (i < totalLoopTimes - 1) { + SetFlag(ctrlFlagsUB3, ctrlFlagsGMStep1ToSet2nd, newMagic); + } + step1NeedSetSecond = false; + } + + if (!step1NeedSetFirst && !step1NeedSetSecond) { + break; } } -- Gitee From 1cd40381f3cbe6ae97c487062df057d891d05d8e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 20:57:46 +0800 Subject: [PATCH 219/414] draft --- .../kernels/lca_allgather_big_data_910B2C.cce | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index cec733d5..1fb17ae9 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ 
-61,14 +61,14 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; const int64_t nodeId = rank < singleNodeRankSize ? 0 : 1; - const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; const int64_t corePerRank = 1; - __gm__ T* buff[16] = { + __gm__ T *buff[16] = { buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7, buff8, buff9, buff10, buff11, @@ -93,48 +93,48 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C if (GetBlockIdx() >= singleNodeRankSize) { __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); __gm__ int64_t *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[rank] + flagOffset1st; - if ((nodeId == 0) && GetBlockIdx() == singleNodeRankSize + 1) { - *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; + if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1)) { + ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; } - if ((nodeId == 1) && GetBlockIdx() == singleNodeRankSize) { - *ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; + if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize)) { + ctrlFlagsGMStep0ToSet = (__gm__ int64_t*)buff[peerRankId] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; } if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1) || (nodeId == 1 && GetBlockIdx() == singleNodeRankSize)) { receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); } - input2BuffRankMagic(dataNumRemain, inputUB[0], receiveBuff, revBuffOffsetNumOrigin, input, + input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNumOrigin, input, processedDataNum, ctrlFlagsUB, ctrlFlagsGMStep0ToSet, newMagic); if (i < totalLoopTimes - 1) { if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize)) { int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); } } if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize + 1)) { int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); } } - if ((nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1)) { + if (nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1) { for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[checkLogicRank] + 
(GetLcalBlockNum() + localNodeRankId) * MEM_DMA_UNIT_INT_NUM, - newMagic); + newMagic); } } - if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize)) { + if (nodeId == 1 && GetBlockIdx() == singleNodeRankSize) { for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[checkLogicRank + singleNodeRankSize] + localNodeRankId * MEM_DMA_UNIT_INT_NUM, - newMagic); + newMagic); } } } @@ -153,12 +153,12 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C ctrlFlagsGMStep1ToSet1st = (__gm__ int64_t*)buff[peerRankId] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM; ctrlFlagsGMStep1ToCheck2nd = (__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + (singleNodeRankSize + 1) * MEM_DMA_UNIT_INT_NUM; ctrlFlagsGMStep1ToSet2nd = (__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + (localNodeRankId + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM; - __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + dataOffsetNum); + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[GetBlockIdx() + singleNodeRankSize] + dataOffsetNum); } int64_t revBuffOffsetNumOrigin1st = GetBlockIdx() * len + processedDataNum; int64_t revBuffOffsetNumOrigin2nd = (GetBlockIdx() + singleNodeRankSize) * len + processedDataNum; int64_t allDataSizeNeedDMA = dataSizeRemain; - int64_t multipleTimes = CeilDiv(allDataSizeNeedDMA, DMA_SIZE_PER_FLAG); + int64_t multipleTimes = CeilDiv(dataSizeRemain, DMA_SIZE_PER_FLAG); bool step1NeedSetFirst = true; bool step1NeedSetSecond = true; @@ -168,18 +168,17 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C while (true) { if (processedDataGroupCount1st < multipleTimes) { CheckThenDMAGM2GM(ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsGMStep1ToCheck1st, newMagic, dataSizeRemain, - revBuffOffsetNumOrigin1st, processedDataNum, sendBuff, output, 0, inputUB, - processedDataGroupCount1st, multipleTimes); + revBuffOffsetNumOrigin1st, processedDataNum, sendBuff, output, 0, inputUB, + processedDataGroupCount1st, multipleTimes); } else if (step1NeedSetFirst) { if (i < totalLoopTimes - 1) { SetFlag(ctrlFlagsUB1, ctrlFlagsGMStep1ToSet1st, newMagic); } step1NeedSetFirst = false; - } } - if (processedDataGroupCount2nd < multipleTimes) { - CheckThenDMAGM2GM(ctrlFlagsUB2, ctrlFlagsUB3, ctrlFlagsGMStep1ToCheck2nd, newMagic, dataSizeRemain, + if (processedDataGroupCount2nd < multipleTimes) { + CheckThenDMAGM2GM(ctrlFlagsUB2, ctrlFlagsUB3, ctrlFlagsGMStep1ToCheck2nd, newMagic, dataSizeRemain, revBuffOffsetNumOrigin2nd, processedDataNum, sendBuff, output, allGatherBuffNumPerParagraph910B2C, inputUB, processedDataGroupCount2nd, multipleTimes); } else if (step1NeedSetSecond) { -- Gitee From 026fa6494af9f0ae0bbca08f7028502e61da60d5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 21:00:47 +0800 Subject: [PATCH 220/414] draft --- .../kernels/lca_allgather_big_data_910B2C.cce | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index 1fb17ae9..f3923b87 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -111,16 +111,16 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 
0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); } } if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize + 1)) { int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); } } if (nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1) { @@ -175,24 +175,25 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C SetFlag(ctrlFlagsUB1, ctrlFlagsGMStep1ToSet1st, newMagic); } step1NeedSetFirst = false; - } + } if (processedDataGroupCount2nd < multipleTimes) { CheckThenDMAGM2GM(ctrlFlagsUB2, ctrlFlagsUB3, ctrlFlagsGMStep1ToCheck2nd, newMagic, dataSizeRemain, - revBuffOffsetNumOrigin2nd, processedDataNum, sendBuff, output, allGatherBuffNumPerParagraph910B2C, inputUB, - processedDataGroupCount2nd, multipleTimes); - } else if (step1NeedSetSecond) { - if (i < totalLoopTimes - 1) { - SetFlag(ctrlFlagsUB3, ctrlFlagsGMStep1ToSet2nd, newMagic); + revBuffOffsetNumOrigin2nd, processedDataNum, sendBuff, output, allGatherBuffNumPerParagraph910B2C, inputUB, + processedDataGroupCount2nd, multipleTimes); + } else if (step1NeedSetSecond) { + if (i < totalLoopTimes - 1) { + SetFlag(ctrlFlagsUB3, ctrlFlagsGMStep1ToSet2nd, newMagic); + } + step1NeedSetSecond = false; } - step1NeedSetSecond = false; - } - if (!step1NeedSetFirst && !step1NeedSetSecond) { - break; + if (!step1NeedSetFirst && !step1NeedSetSecond) { + break; + } } - } + processedDataNum += allGatherBuffNumPerParagraph910B2C; } - + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); } -- Gitee From 520cd9dda9cbd65535851d5ba8655f6950bc6e79 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 21:01:56 +0800 Subject: [PATCH 221/414] draft --- comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce index f3923b87..ad630473 100644 --- a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce @@ -111,16 +111,16 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData910B2C int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, + newMagic); } } if ((nodeId == 1 && GetBlockIdx() == singleNodeRankSize + 1)) { int64_t checkFlagOffset = nodeId * GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM; for (int checkLogicRank = 0; checkLogicRank < singleNodeRankSize; checkLogicRank++) { CheckFlag(ctrlFlagsUB, - (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * MEM_DMA_UNIT_INT_NUM, - newMagic); + (__gm__ int64_t*)buff[rank] + checkFlagOffset + checkLogicRank * 
MEM_DMA_UNIT_INT_NUM, + newMagic); } } if (nodeId == 0 && GetBlockIdx() == singleNodeRankSize + 1) { -- Gitee From 265c76692dc48345e40c90197b4377c91ef5295c Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 21 Aug 2025 21:08:16 +0800 Subject: [PATCH 222/414] rename --- .../kernels/{lca_allgather_2npu.cce => lcal_allgather_2npu.cce} | 0 ..._big_data_write.cce => lcal_allgather_2npu_big_data_write.cce} | 0 .../{lca_allgather_910B2C.cce => lcal_allgather_910B2C.cce} | 0 .../{lca_allgather_big_data.cce => lcal_allgather_big_data.cce} | 0 ...her_big_data_910B2C.cce => lcal_allgather_big_data_910B2C.cce} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/kernels/{lca_allgather_2npu.cce => lcal_allgather_2npu.cce} (100%) rename comm/lcal/src/kernels/{lca_allgather_2npu_big_data_write.cce => lcal_allgather_2npu_big_data_write.cce} (100%) rename comm/lcal/src/kernels/{lca_allgather_910B2C.cce => lcal_allgather_910B2C.cce} (100%) rename comm/lcal/src/kernels/{lca_allgather_big_data.cce => lcal_allgather_big_data.cce} (100%) rename comm/lcal/src/kernels/{lca_allgather_big_data_910B2C.cce => lcal_allgather_big_data_910B2C.cce} (100%) diff --git a/comm/lcal/src/kernels/lca_allgather_2npu.cce b/comm/lcal/src/kernels/lcal_allgather_2npu.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather_2npu.cce rename to comm/lcal/src/kernels/lcal_allgather_2npu.cce diff --git a/comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce b/comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather_2npu_big_data_write.cce rename to comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce diff --git a/comm/lcal/src/kernels/lca_allgather_910B2C.cce b/comm/lcal/src/kernels/lcal_allgather_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather_910B2C.cce rename to comm/lcal/src/kernels/lcal_allgather_910B2C.cce diff --git a/comm/lcal/src/kernels/lca_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather_big_data.cce rename to comm/lcal/src/kernels/lcal_allgather_big_data.cce diff --git a/comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lca_allgather_big_data_910B2C.cce rename to comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce -- Gitee From 4bea0def4f6c160c5e6c2b3843e18a4cdbce49a5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 11:56:04 +0800 Subject: [PATCH 223/414] draft --- .../src/kernels/lcal_allgather_big_data.cce | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce index cd9274ad..4d2cea0d 100644 --- a/comm/lcal/src/kernels/lcal_allgather_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allgather_big_data.cce @@ -9,3 +9,91 @@ */ #include #include "collectives.cce" + +template <typename T> +__attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin( + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, + uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1[16], + __ubuf__ int64_t* ctrlFlagsUB2[16], __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t
flagOffset2nd, + int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx) +{ + int64_t avgNumDMAPerCore = len / blockNumPerGroup; + int64_t dataNumRemain = avgNumDMAPerCore; + if (GetBlockIdx() == blockNumPerGroup - 1) { + dataNumRemain = len - dataNumRemain * GetBlockIdx(); + } + + __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *sendBuff = input; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st; + if (GetBlockIdx() < blockNumPerGroup) { + int64_t ipcBuffOffsetNum = GetBlockIdx() * avgNumDMAPerCore; + int64_t inputOffsetNum = GetBlockIdx() * avgNumDMAPerCore; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, inputOffsetNum, + sendBuff, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGMTemp, magic); + return; + } + + for (int64_t i = 0; i < blockNumPerGroup; i++) { + *ctrlFlagsUB1[i] = 0; + *ctrlFlagsUB2[i] = 0; + } + + while (true) { + for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx++) { + if (*ctrlFlagsUB1[blockGroup0Idx] == INT64_MAX) { + continue; + } + + int64_t allDataSizeNeedDMA = avgNumDMAPerCore * sizeof(T); + if (blockGroup0Idx == blockNumPerGroup - 1) { + allDataSizeNeedDMA = (len - blockGroup0Idx * avgNumDMAPerCore) * sizeof(T); + } + + if (*ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG >= allDataSizeNeedDMA) { + *ctrlFlagsUB1[blockGroup0Idx] = INT64_MAX; + continue; + } + + ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (blockGroup0Idx) * MEM_DMA_UNIT_INT_NUM; + CpGM2UB(ctrlFlagsUB2[blockGroup0Idx], ctrlFlagsGMX, sizeof(int64_t)); + AscendC::PipeBarrier(); + + if ((*ctrlFlagsUB2[blockGroup0Idx] >> 10) != (magic >> 10)) { + continue; + } + int64_t preparedDataGroupCount = *ctrlFlagsUB2[blockGroup0Idx] - magic; + if (preparedDataGroupCount <= 0 || *ctrlFlagsUB1[blockGroup0Idx] >= preparedDataGroupCount) { + continue; + } + + receiveBuff = (__gm__ T *)output; + sendBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + int64_t revBuffOffsetNum = x * allLen + processedNum + blockGroup0Idx * avgNumDMAPerCore + + *ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG / sizeof(T); + int64_t sendBuffOffsetNum = blockGroup0Idx * avgNumDMAPerCore + + *ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG / sizeof(T); + + int64_t dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB1[blockGroup0Idx]) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeedDMA) { + dataSizeRemain = allDataSizeNeedDMA - *ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG; + } + + AscendC::PipeBarrier(); + GM2GMPingPong(dataSizeRemain, inputUB, receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + AscendC::PipeBarrier(); + } + + bool finished = true; + for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx) { + if (*ctrlFlagsUB1[blockGroup0Idx] != INT64_MAX) { + finished = false; + break; + } + } + if (finished) { + break; + } + } +} \ No newline at end of file -- Gitee From 4f1b45a0abf1f00fc5ca88e1d4a0e6281c21463b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 15:46:24 +0800 Subject: [PATCH 224/414] draft --- .../src/kernels/lcal_allgather_big_data.cce | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce index 4d2cea0d..a954efa3 100644 --- a/comm/lcal/src/kernels/lcal_allgather_big_data.cce +++ 
b/comm/lcal/src/kernels/lcal_allgather_big_data.cce @@ -96,4 +96,55 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin break; } } +} + +template <typename T> +__attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + *magic *= 1024; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_BYTE; + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1[16]; + __ubuf__ int64_t* ctrlFlagsUB2[16]; + for (int64_t i = 0; i < 16; i++) { + ctrlFlagsUB1[i] = (__ubuf__ int64_t*)(32) + i * 8; + ctrlFlagsUB2[i] = (__ubuf__ int64_t*)(544) + i * 8; + } + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(1056), (__ubuf__ T*)(98336)}; + + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t corePerRank = blockNumPerGroup / rankSize; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + int64_t x = GetBlockIdx() / corePerRank; + if (GetBlockIdx() >= blockNumPerGroup) { + x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; + flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; + } + int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st; + + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + + int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + int64_t dataLen = len; + for (int64_t i = 0; i < CeilDiv(dataLen, ipcBuffMaxNum); i++) { + *ctrlFlagsUB = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcBuffMaxNum; + int64_t remainNum = (dataLen - processedNum < ipcBuffMaxNum) ?
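/* the last IPC slice may be short: take the remainder, otherwise a full buffer */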
dataLen - processedNum : ipcBuffMaxNum; + + PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + LcalAllGatherBigDataOrigin( + buff, input + processedNum, output, processedNum, blockNumPerGroup, rank, rankSize, len, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, + ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx); + AscendC::PipeBarrier(); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); } \ No newline at end of file -- Gitee From 1260d35812b3bafd483fb3b82b265e9b344cc2a9 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 15:52:42 +0800 Subject: [PATCH 225/414] draft --- .../src/kernels/lcal_allgather_big_data.cce | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce index a954efa3..b820bb99 100644 --- a/comm/lcal/src/kernels/lcal_allgather_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allgather_big_data.cce @@ -13,7 +13,7 @@ template <typename T> __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin( __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, - uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1[16], + uint32_t rankSize, uint64_t allLen, uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1[16], __ubuf__ int64_t* ctrlFlagsUB2[16], __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx) { @@ -23,16 +23,16 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin dataNumRemain = len - dataNumRemain * GetBlockIdx(); } - __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); __gm__ T *sendBuff = input; __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st; if (GetBlockIdx() < blockNumPerGroup) { int64_t ipcBuffOffsetNum = GetBlockIdx() * avgNumDMAPerCore; int64_t inputOffsetNum = GetBlockIdx() * avgNumDMAPerCore; - input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, inputOffsetNum, - sendBuff, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGMTemp, magic); - return; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, ipcBuffOffsetNum, + sendBuff, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); + return; } for (int64_t i = 0; i < blockNumPerGroup; i++) { @@ -41,7 +41,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin } while (true) { - for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx++) { + for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx++) { if (*ctrlFlagsUB1[blockGroup0Idx] == INT64_MAX) { continue; } @@ -69,11 +69,11 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin } receiveBuff = (__gm__ T *)output; - sendBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + sendBuff = (__gm__ T *)((__gm__ int64_t*)buff[x] + dataOffsetNum); int64_t revBuffOffsetNum = x * allLen + processedNum + blockGroup0Idx * avgNumDMAPerCore +
*ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG / sizeof(T); int64_t sendBuffOffsetNum = blockGroup0Idx * avgNumDMAPerCore + - *ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG / sizeof(T); + *ctrlFlagsUB1[blockGroup0Idx] * DMA_SIZE_PER_FLAG / sizeof(T); int64_t dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB1[blockGroup0Idx]) * DMA_SIZE_PER_FLAG; if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeedDMA) { @@ -83,6 +83,9 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin AscendC::PipeBarrier(); GM2GMPingPong(dataSizeRemain, inputUB, receiveBuff, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); AscendC::PipeBarrier(); + + *ctrlFlagsUB1[blockGroup0Idx] = preparedDataGroupCount; + AscendC::PipeBarrier(); } bool finished = true; -- Gitee From 1ac9565be4eed3a92d79766d49b4241294c31313 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 15:54:33 +0800 Subject: [PATCH 226/414] draft --- comm/lcal/src/kernels/lcal_allgather_big_data.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce index b820bb99..c68f2cec 100644 --- a/comm/lcal/src/kernels/lcal_allgather_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allgather_big_data.cce @@ -89,7 +89,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin } bool finished = true; - for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx) { + for (int64_t blockGroup0Idx = 0; blockGroup0Idx < blockNumPerGroup; blockGroup0Idx++) { if (*ctrlFlagsUB1[blockGroup0Idx] != INT64_MAX) { finished = false; break; @@ -105,8 +105,8 @@ template __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData(ALLREDUCE_ARGS_FUN_16P(T)) { DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); - *magic *= 1024; - const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_BYTE; + magic *= 1024; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); __gm__ T* buff[8] = { buff0, buff1, buff2, buff3, @@ -115,7 +115,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData(ALLRE __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); __ubuf__ int64_t* ctrlFlagsUB1[16]; __ubuf__ int64_t* ctrlFlagsUB2[16]; - for (int64_t i = 0; i < 16; i++) { + for (int64_t i = 0; i * 8 < 128; i++) { ctrlFlagsUB1[i] = (__ubuf__ int64_t*)(32) + i * 8; ctrlFlagsUB2[i] = (__ubuf__ int64_t*)(544) + i * 8; } -- Gitee From 21a36175bd1c3580fcf0281065b0dac65fde25ea Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 15:55:42 +0800 Subject: [PATCH 227/414] draft --- comm/lcal/src/kernels/lcal_allgather_big_data.cce | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/comm/lcal/src/kernels/lcal_allgather_big_data.cce index c68f2cec..f6a59d33 100644 --- a/comm/lcal/src/kernels/lcal_allgather_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allgather_big_data.cce @@ -69,7 +69,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigDataOrigin } receiveBuff = (__gm__ T *)output; - sendBuff = (__gm__ T *)((__gm__ int64_t*)buff[x] + dataOffsetNum); + sendBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); int64_t revBuffOffsetNum = x * allLen + processedNum + blockGroup0Idx * avgNumDMAPerCore + *ctrlFlagsUB1[blockGroup0Idx] * 
DMA_SIZE_PER_FLAG / sizeof(T); int64_t sendBuffOffsetNum = blockGroup0Idx * avgNumDMAPerCore + @@ -115,7 +115,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGatherBigData(ALLRE __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); __ubuf__ int64_t* ctrlFlagsUB1[16]; __ubuf__ int64_t* ctrlFlagsUB2[16]; - for (int64_t i = 0; i * 8 < 128; i++) { + for (int64_t i = 0; i * 8 < 128; i ++) { ctrlFlagsUB1[i] = (__ubuf__ int64_t*)(32) + i * 8; ctrlFlagsUB2[i] = (__ubuf__ int64_t*)(544) + i * 8; } -- Gitee From e801f07a3247047b487762bf09ca943b8b0778d0 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 16:15:06 +0800 Subject: [PATCH 228/414] draft --- comm/lcal/src/kernels/lcal_allgather.cce | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allgather.cce diff --git a/comm/lcal/src/kernels/lcal_allgather.cce b/comm/lcal/src/kernels/lcal_allgather.cce new file mode 100644 index 00000000..59f1b12b --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allgather.cce @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "collectives.cce" + +template <typename T> +__attribute__((always_inline)) inline __aicore__ void LcalAllGather(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + const int64_t x = GetBlockIdx() / corePerRank; + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + int64_t dataNumRemain = len / GetLcalBlockNum(); + int64_t buffOffsetNum = rank * len + GetBlockIdx() * dataNumRemain; + if (GetBlockIdx() == GetLcalBlockNum() - 1) { + dataNumRemain = len - dataNumRemain * GetBlockIdx(); + } + + DumpLcclLogInfo(dumpAddr, LogId::INIT, Op::COPYONLY); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); + + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *sendBuff = input; + int64_t sendBuffOffsetNum = buffOffsetNum - rank * len; + GM2GM(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, buffOffsetNum, sendBuff, sendBuffOffsetNum); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + + for (int64_t i = 0; i < GetLcalBlockNum(); i++) { + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + i * MEM_DMA_UNIT_INT_NUM; + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic); + } + dataNumRemain = len / corePerRank; + buffOffsetNum = x *
len + coreSegmentedIdx * dataNumRemain; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumRemain = len - dataNumRemain * coreSegmentedIdx; + } + + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + GM2GM(dataNumRemain * sizeof(T), inputUB[0], (__gm__ T*)output, buffOffsetNum, sendBuff, buffOffsetNum); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, Op::COPYONLY); +} \ No newline at end of file -- Gitee From 70d42c45d5793bbc344920e360e1686a795f82e7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 16:16:07 +0800 Subject: [PATCH 229/414] draft --- comm/lcal/src/kernels/lcal_allgather.cce | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allgather.cce b/comm/lcal/src/kernels/lcal_allgather.cce index 59f1b12b..f9be19a1 100644 --- a/comm/lcal/src/kernels/lcal_allgather.cce +++ b/comm/lcal/src/kernels/lcal_allgather.cce @@ -41,11 +41,11 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather(ALLREDUCE_AR int64_t sendBuffOffsetNum = buffOffsetNum - rank * len; GM2GM(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, buffOffsetNum, sendBuff, sendBuffOffsetNum); - __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[rank] + flagOffset1st; SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); for (int64_t i = 0; i < GetLcalBlockNum(); i++) { - __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + i * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[x] + i * MEM_DMA_UNIT_INT_NUM; CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic); } dataNumRemain = len / corePerRank; @@ -54,7 +54,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllGather(ALLREDUCE_AR dataNumRemain = len - dataNumRemain * coreSegmentedIdx; } - sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); GM2GM(dataNumRemain * sizeof(T), inputUB[0], (__gm__ T*)output, buffOffsetNum, sendBuff, buffOffsetNum); -- Gitee From 2f6a60c37fd0027bdd1527269a33b1f644f36d03 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 17:04:06 +0800 Subject: [PATCH 230/414] draft --- .../src/kernels/lcal_allreduce_two_shot.cce | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allreduce_two_shot.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce new file mode 100644 index 00000000..9a3253e3 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
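(The two-shot allreduce added below splits the vector into one DMA-aligned slice per rank: each rank first reduces its own slice from every peer, then all ranks gather the reduced slices back. A small C++ sketch of the slice arithmetic, with example values; `MEM_DMA_UNIT_BYTE` and the element size stand in for the kernel's real constants.)

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MEM_DMA_UNIT_BYTE = 32;  // example DMA granularity
    const int64_t elemSize = 2;            // e.g. fp16
    const int64_t rankSize = 8, len = 100003;
    const int64_t unitNum = MEM_DMA_UNIT_BYTE / elemSize;
    // Each rank owns an aligned 1/rankSize slice; the last rank absorbs the tail.
    const int64_t perRank = len / unitNum / rankSize * unitNum;
    for (int64_t r = 0; r < rankSize; r++) {
        int64_t n = (r == rankSize - 1) ? len - r * perRank : perRank;
        printf("rank %lld reduces [%lld, %lld)\n",
               (long long)r, (long long)(r * perRank), (long long)(r * perRank + n));
    }
    return 0;
}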
+ */ +#include "collectives.cce" + +template <typename T> +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + const int64_t x = GetBlockIdx() / corePerRank; + __gm__ T* buff[16] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + const int64_t memDmaUnitNum = MEM_DMA_UNIT_BYTE / sizeof(T); + const int64_t singleNPUProcessDataBlockNum = len / memDmaUnitNum / rankSize; + const int64_t singleNPUProcessDataNum = singleNPUProcessDataBlockNum * memDmaUnitNum; + int64_t thisNPUProcessDataNum = singleNPUProcessDataNum; + if (rank == rankSize - 1) { + thisNPUProcessDataNum = len - rank * singleNPUProcessDataNum; + } + + int64_t xNPUProcessDataNum = singleNPUProcessDataNum; + if (x == rankSize - 1) { + xNPUProcessDataNum = len - x * singleNPUProcessDataNum; + } + + const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum; + const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum; + + int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + int64_t buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[rank] + flagOffset1st; + if (input != (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) { + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum); + AscendC::PipeBarrier(); + + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t*)((__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum) + len) + MEM_DMA_UNIT_INT_NUM, magic); + AscendC::PipeBarrier(); + + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + } + + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + + if (x == rank) { + goto label0; + } + buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + dataSizeRemain = thisNPUProcessDataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (thisNPUProcessDataNum - coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + ctrlFlagsGM = ((__gm__ int64_t*)buff[x]) + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + if (input != (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) { + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic); + } + ProcessData(dataSizeRemain,
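/* ProcessData (defined in collectives.cce) presumably applies `op` to the peer slice staged in buff[x] and accumulates the result into processOutput */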
inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, processOutput, buffOffsetNum, op); + +lable0: + ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset2nd; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic); + + for (int i = 0; i < GetLcalBlockNum(); i++) { + if (i / corePerRank == x) { + continue; + } + __gm__ int64_t* ctrlFlagsGMTemp = ((__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + i) * MEM_DMA_UNIT_INT_NUM); + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic); + } + + buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], output, buffOffsetNum, sendBuff, buffOffsetNum); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); +} \ No newline at end of file -- Gitee From 5df9c85611ede58c7f43efa5be8608aa426e3790 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 17:06:30 +0800 Subject: [PATCH 231/414] draft --- comm/lcal/src/kernels/lcal_allreduce_two_shot.cce | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce index 9a3253e3..0df5a896 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce @@ -50,29 +50,29 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot(ALLRE dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); } - int64_t buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); - __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[rank] + flagOffset1st; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; if (input != (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) { __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum); AscendC::PipeBarrier(); - SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t*)((__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum) + len) + MEM_DMA_UNIT_INT_NUM, magic); + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + len) + MEM_DMA_UNIT_INT_NUM, magic); AscendC::PipeBarrier(); SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); } - __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T*)(((__gm__ int64_t*)buff[rank]) + dataOffsetNum); if (x == rank) { goto label0; } buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; - dataSizeRemain = thisNPUProcessDataNum * sizeof(T); + dataSizeRemain = thisNPUCoreGroupAvgDMADataNum * sizeof(T); if (coreSegmentedIdx == corePerRank - 1) { dataSizeRemain = (thisNPUProcessDataNum - coreSegmentedIdx * 
thisNPUCoreGroupAvgDMADataNum) * sizeof(T); } @@ -85,7 +85,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot(ALLRE } ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, processOutput, buffOffsetNum, op); -lable0: +label0: ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset2nd; SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic); @@ -104,7 +104,7 @@ lable0: } __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); - GM2GM(dataSizeRemain, inputUB[0], output, buffOffsetNum, sendBuff, buffOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], output, buffOffsetNum, sendBuff, buffOffsetNum); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); -- Gitee From db232dfb1afc3f60727aafce6eb57d64521f31d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 17:07:19 +0800 Subject: [PATCH 232/414] draft --- comm/lcal/src/kernels/lcal_allreduce_two_shot.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce index 0df5a896..4daf8298 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce @@ -66,7 +66,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot(ALLRE SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); } - __gm__ T *processOutput = (__gm__ T*)(((__gm__ int64_t*)buff[rank]) + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T *)(((__gm__ int64_t*)buff[rank]) + dataOffsetNum); if (x == rank) { goto label0; -- Gitee From 95e983b26ecc0a50212a6c0d02c9585fdb35c285 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 17:07:45 +0800 Subject: [PATCH 233/414] draft --- comm/lcal/src/kernels/lcal_allreduce_two_shot.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce index 4daf8298..0c9b4a65 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce @@ -66,7 +66,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot(ALLRE SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); } - __gm__ T *processOutput = (__gm__ T *)(((__gm__ int64_t*)buff[rank]) + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T *)(((__gm__ int64_t *)buff[rank]) + dataOffsetNum); if (x == rank) { goto label0; -- Gitee From 4db6ca29b8c2a52e7708c60f30dc4396c239ace5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Fri, 22 Aug 2025 17:48:09 +0800 Subject: [PATCH 234/414] draft --- .../src/kernels/lcal_allreduce_big_data.cce | 174 ++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allreduce_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce new file mode 100644 index 00000000..3853b1a3 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include
+#include "collectives.cce"
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigDataOrigin
+    (__gm__ T* buff[16], __gm__ T *input, __gm__ T *output, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize,
+    uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1,
+    __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st,
+    int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op)
+{
+    const int64_t memDmaUnitNum = MEM_DMA_UNIT_BYTE / sizeof(T);
+    const int64_t singleNPUProcessDataBlockNum = len / memDmaUnitNum / rankSize;
+    const int64_t singleNPUProcessDataNum = singleNPUProcessDataBlockNum * memDmaUnitNum;
+    int64_t thisNPUProcessDataNum = singleNPUProcessDataNum;
+    if (rank == rankSize - 1) {
+        thisNPUProcessDataNum = len - rank * singleNPUProcessDataNum;
+    }
+
+    int64_t xNPUProcessDataNum = singleNPUProcessDataNum;
+    if (x == rankSize - 1) {
+        xNPUProcessDataNum = len - x * singleNPUProcessDataNum;
+    }
+
+    const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum;
+    const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum;
+
+    int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T);
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T);
+    }
+
+    int64_t buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum;
+
+    __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+    __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st;
+    __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st;
+    if (GetBlockIdx() < blockNumPerGroup) {
+        input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic);
+    } else {
+        if (x == rank) {
+            goto label0;
+        }
+        *ctrlFlagsUB = 0;
+        *ctrlFlagsUB1 = 0;
+        *ctrlFlagsUB2 = 0;
+        ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM;
+        ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM;
+        __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+
+        int64_t allDataSizeNeed2Add = thisNPUCoreGroupAvgDMADataNum * sizeof(T);
+        if (coreSegmentedIdx == corePerRank - 1) {
+            allDataSizeNeed2Add = (thisNPUProcessDataNum - coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum) * sizeof(T);
+        }
+        AscendC::PipeBarrier();
+        while (true) {
+            if (*ctrlFlagsUB >= CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG)) {
+                break;
+            }
+
+            CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t));
+            CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMX, sizeof(int64_t));
+            AscendC::PipeBarrier();
+
+            if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0 ||
+                ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((ctrlFlagsUB2 >> 10) != (magic >> 10))) {
+                continue;
+            }
+
+            int64_t preparedDataGroupCount = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ?
+                (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF);
+            if (*ctrlFlagsUB >= preparedDataGroupCount) {
+                continue;
+            }
+
+            buffOffsetNum = rank * singleNPUProcessDataBlockNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum;
+            dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG;
+            if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) {
+                dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG;
+            }
+            ProcessDataNew(dataSizeRemain, inputUB, buff[x], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T),
+                processOutput, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op);
+            AscendC::PipeBarrier();
+
+            *ctrlFlagsUB = preparedDataGroupCount;
+            AscendC::PipeBarrier();
+        }
+    }
+label0:
+    if (GetBlockIdx() >= blockNumPerGroup) {
+        SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset2nd, (int64_t)magic);
+        return;
+    }
+    AscendC::PipeBarrier();
+
+
+    for (int i = 0; i < blockNumPerGroup; i++) {
+        if (i / corePerRank == x) {
+            continue;
+        }
+        __gm__ int64_t* ctrlFlagsGMTemp = ((__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + i) * MEM_DMA_UNIT_INT_NUM);
+        CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMTemp, (int64_t)magic);
+    }
+
+    buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum;
+    dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T);
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T);
+    }
+
+    __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum);
+    GM2GMPingPong(dataSizeRemain, inputUB, output, buffOffsetNum, sendBuff, buffOffsetNum);
+
+}
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData(ALLREDUCE_ARGS_FUN_16P(T))
+{
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+    magic *= 1024;
+    const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
+    int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx();
+    __gm__ T* buff[16] = {
+        buff0, buff1, buff2, buff3,
+        buff4, buff5, buff6, buff7,
+        buff8, buff9, buff10, buff11,
+        buff12, buff13, buff14, buff15
+    };
+    __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
+    __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32);
+    __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64);
+    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)};
+
+    int64_t blockNumPerGroup = GetLcalBlockNum() >> 1;
+    int64_t corePerRank = blockNumPerGroup / rankSize;
+    int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
+    int64_t x = GetBlockIdx() / corePerRank;
+    if (GetBlockIdx() >= blockNumPerGroup) {
+        x = (GetBlockIdx() - blockNumPerGroup) / corePerRank;
+        flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM;
+    }
+    int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st;
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+
+    int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T);
+    for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNum); i++) {
+        *ctrlFlagsUB = 0;
+        *ctrlFlagsUB1 = 0;
+        *ctrlFlagsUB2 = 0;
+        AscendC::PipeBarrier();
+
+        int64_t processedNum = i * ipcBuffMaxNum;
+        int64_t remainNum = (len - processedNum < ipcBuffMaxNum) ? len - processedNum : ipcBuffMaxNum;
+
+        PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i);
+        LcalAllReduceBigDataOrigin(
+            buff, input + processedNum, output + processedNum, blockNumPerGroup, rank, rankSize, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1,
+            ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx, op);
+        AscendC::PipeBarrier();
+    }
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+}
\ No newline at end of file
-- 
Gitee

From ad9f772334f09d5154a631f474f68690266613de Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 17:51:07 +0800
Subject: [PATCH 235/414] draft

---
 .../src/kernels/lcal_allreduce_big_data.cce | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
index 3853b1a3..4d1ed67a 100644
--- a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
+++ b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
@@ -7,18 +7,17 @@
  * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
  * See LICENSE in the root of the software repository for the full text of the License.
  */
-#include
 #include "collectives.cce"
 
 template
 __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigDataOrigin
     (__gm__ T* buff[16], __gm__ T *input, __gm__ T *output, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize,
-    uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1,
-    __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st,
-    int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op)
+    uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1,
+    __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st,
+    int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op)
 {
     const int64_t memDmaUnitNum = MEM_DMA_UNIT_BYTE / sizeof(T);
-    const int64_t singleNPUProcessDataBlockNum = len / memDmaUnitNum / rankSize;
+    const int64_t singleNPUProcessDataBlockNum = (len / memDmaUnitNum) / rankSize;
     const int64_t singleNPUProcessDataNum = singleNPUProcessDataBlockNum * memDmaUnitNum;
     int64_t thisNPUProcessDataNum = singleNPUProcessDataNum;
     if (rank == rankSize - 1) {
@@ -30,8 +29,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigDataOrigin
         xNPUProcessDataNum = len - x * singleNPUProcessDataNum;
     }
 
-    const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum;
-    const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank / memDmaUnitNum * memDmaUnitNum;
+    const int64_t xNPUCoreGroupAvgDMADataNum = (xNPUProcessDataNum / corePerRank / memDmaUnitNum) * memDmaUnitNum;
+    const int64_t thisNPUCoreGroupAvgDMADataNum = (thisNPUProcessDataNum / corePerRank / memDmaUnitNum) * memDmaUnitNum;
 
     int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T);
     if (coreSegmentedIdx == corePerRank - 1) {
@@ -54,7 +53,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigDataOrigin
         *ctrlFlagsUB2 = 0;
         ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM;
         ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM;
-        __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+        __gm__ T *processOutput = (__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum);
 
         int64_t allDataSizeNeed2Add = thisNPUCoreGroupAvgDMADataNum * sizeof(T);
         if (coreSegmentedIdx == corePerRank - 1) {
@@ -71,23 +70,23 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigDataOrigin
             AscendC::PipeBarrier();
 
             if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0 ||
-                ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((ctrlFlagsUB2 >> 10) != (magic >> 10))) {
+                ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) {
                 continue;
             }
 
             int64_t preparedDataGroupCount = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ?
                 (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF);
-            if (*ctrlFlagsUB >= preparedDataGroupCount) {
+            if (*ctrlFlagsUB >= preparedDataGroupCount) {
                 continue;
             }
 
-            buffOffsetNum = rank * singleNPUProcessDataBlockNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum;
+            buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum;
             dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG;
             if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) {
                 dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG;
             }
             ProcessDataNew(dataSizeRemain, inputUB, buff[x], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T),
-                processOutput, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op);
+                processOutput, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op);
             AscendC::PipeBarrier();
 
             *ctrlFlagsUB = preparedDataGroupCount;
@@ -102,7 +101,7 @@ label0:
     AscendC::PipeBarrier();
 
 
-    for (int i = 0; i < blockNumPerGroup; i++) {
+    for (int64_t i = 0; i < blockNumPerGroup; i++) {
         if (i / corePerRank == x) {
             continue;
         }
-- 
Gitee

From 3fb0ebae524f307bc4ee7021b077c05219a9413e Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 17:51:51 +0800
Subject: [PATCH 236/414] draft

---
 comm/lcal/src/kernels/lcal_allreduce_big_data.cce | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
index 4d1ed67a..d16ef7ed 100644
--- a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
+++ b/comm/lcal/src/kernels/lcal_allreduce_big_data.cce
@@ -100,7 +100,6 @@ label0:
     }
     AscendC::PipeBarrier();
 
-
     for (int64_t i = 0; i < blockNumPerGroup; i++) {
         if (i / corePerRank == x) {
             continue;
@@ -117,7 +116,6 @@ label0:
 
     __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum);
     GM2GMPingPong(dataSizeRemain, inputUB, output, buffOffsetNum, sendBuff, buffOffsetNum);
-
 }
 
 template
@@ -151,7 +149,6 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData(ALLRE
     DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
 
     DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
-
     int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T);
     for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNum); i++) {
         *ctrlFlagsUB = 0;
-- 
Gitee

From 42004ff7d5b5bc8da2f502967a0158c76ad940e5 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 18:59:51 +0800
Subject: [PATCH 237/414] draft

---
 .../src/kernels/lcal_allreduce_2npu_read.cce | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce
new file mode 100644
index 00000000..048a4128
--- /dev/null
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include "collectives.cce"
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuRead(ALLREDUCE_ARGS_FUN(T))
+{
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+    const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
+    __gm__ T* buff[8] = {
+        buff0, buff1, buff2, buff3,
+        buff4, buff5, buff6, buff7
+    };
+    __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
+    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)};
+
+    const int64_t corePerRank = GetLcalBlockNum() / rankSize;
+    const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
+    const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * coreSegmentedIdx;
+    const int64_t x = GetBlockIdx() / corePerRank;
+
+    const int64_t dataBlockAllNum = len * sizeof(T) / MEM_DMA_UNIT_BYTE;
+    const int64_t singleCoreDataBlockNum = dataBlockAllNum / corePerRank;
+    const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T);
+    const int64_t buffOffsetNum = coreSegmentedIdx * singleCoreDataNum;
+
+    int64_t dataSizeRemain = singleCoreDataNum * sizeof(T);
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataSizeRemain = (len - coreSegmentedIdx * coreSegmentedIdx) * sizeof(T);
+    }
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    if (x == rank) {
+        __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+        GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum);
+        SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset1st, magic);
+        DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+        DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+        return;
+    } else {
+        GM2GM(dataSizeRemain, inputUB[0], output, buffOffsetNum, input, buffOffsetNum);
+    }
+
+    CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[x]) + flagOffset1st), magic);
+    CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[rank]) + flagOffset1st), magic);
+
+    ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, output, buffOffsetNum, op);
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+}
\ No newline at end of file
-- 
Gitee

From 82b3aa05cc908cd10b8724d9b98e3b492217a0e3 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 19:02:45 +0800
Subject: [PATCH 238/414] draft

---
 comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce
index 048a4128..bbf2ec42 100644
--- a/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce
@@ -20,7 +20,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuRead(ALLR
         buff4, buff5, buff6, buff7
     };
     __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
-    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)};
+    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(32), (__ubuf__ T*)(97312)};
 
     const int64_t corePerRank = GetLcalBlockNum() / rankSize;
     const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
@@ -34,7 +34,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuRead(ALLR
 
     int64_t dataSizeRemain = singleCoreDataNum * sizeof(T);
     if (coreSegmentedIdx == corePerRank - 1) {
-        dataSizeRemain = (len - coreSegmentedIdx * coreSegmentedIdx) * sizeof(T);
+        dataSizeRemain = (len - singleCoreDataNum * coreSegmentedIdx) * sizeof(T);
     }
     DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
 
@@ -53,7 +53,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuRead(ALLR
     CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[x]) + flagOffset1st), magic);
     CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[rank]) + flagOffset1st), magic);
 
-    ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, output, buffOffsetNum, op);
+    ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, output, buffOffsetNum, op);
     DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
     DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
 }
\ No newline at end of file
-- 
Gitee

From c466fbb6d2bbca877879a0b29abe84a6972caaea Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 19:06:27 +0800
Subject: [PATCH 239/414] draft

---
 .../src/kernels/lcal_allreduce_2npu_write.cce | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce
new file mode 100644
index 00000000..4f1de047
--- /dev/null
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include "collectives.cce"
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuWrite(ALLREDUCE_ARGS_FUN(T))
+{
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+    const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
+    __gm__ T* buff[8] = {
+        buff0, buff1, buff2, buff3,
+        buff4, buff5, buff6, buff7
+    };
+    __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
+    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(32), (__ubuf__ T*)(97312)};
+
+    const int64_t corePerRank = GetLcalBlockNum() / rankSize;
+    const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
+    const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * coreSegmentedIdx;
+    const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * (GetLcalBlockNum() + coreSegmentedIdx);
+    const int64_t x = GetBlockIdx() / corePerRank;
+
+    const int64_t dataBlockAllNum = len * sizeof(T) / MEM_DMA_UNIT_BYTE;
+    const int64_t singleCoreDataBlockNum = dataBlockAllNum / corePerRank;
+    const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T);
+    const int64_t buffOffsetNum = coreSegmentedIdx * singleCoreDataNum;
+
+    int64_t dataSizeRemain = singleCoreDataNum * sizeof(T);
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataSizeRemain = (len - singleCoreDataNum * coreSegmentedIdx) * sizeof(T);
+    }
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    if (x != rank) {
+        __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+        GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum);
+        SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + flagOffset1st, magic);
+        SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset2nd, magic);
+        DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+        DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+        return;
+    } else {
+        GM2GM(dataSizeRemain, inputUB[0], output, buffOffsetNum, input, buffOffsetNum);
+    }
+
+    CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[rank]) + flagOffset1st), magic);
+    CheckFlag(ctrlFlagsUB, (((__gm__ int64_t*)buff[rank]) + flagOffset2nd), magic);
+
+    ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, output, buffOffsetNum, op);
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+}
\ No newline at end of file
-- 
Gitee

From 327ba58ec146e948b910c01bcb3820d8beae0025 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 19:07:29 +0800
Subject: [PATCH 240/414] draft

---
 comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce
index 4f1de047..a4f05816 100644
--- a/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce
@@ -10,7 +10,7 @@
 #include "collectives.cce"
 
 template
-__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuWrite(ALLREDUCE_ARGS_FUN(T))
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuWrite(ALLREDUCE_ARGS_FUN_16P(T))
 {
     DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
     DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
@@ -41,7 +41,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuWrite(ALL
     DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
     if (x != rank) {
-        __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+        __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum);
         GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum);
         SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + flagOffset1st, magic);
         SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset2nd, magic);
-- 
Gitee

From 92ceca149098f42fd213c270179a48ded0173fc9 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 19:19:30 +0800
Subject: [PATCH 241/414] draft

---
 .../kernels/lcal_allreduce_2npu_big_write.cce | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce
new file mode 100644
index 00000000..c2c52b53
--- /dev/null
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include "collectives.cce"
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuBigDataWriteOrigin
+    (__gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize,
+    uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1,
+    __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st,
+    int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op)
+{
+    const int64_t dataBlockAllNum = CeilDiv(len * sizeof(T), MEM_DMA_UNIT_BYTE);
+    const int64_t singleCoreDataBlockNum = dataBlockAllNum / corePerRank;
+    const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T);
+    const int64_t buffDataDMAOffsetNum = coreSegmentedIdx * singleCoreDataNum;
+
+    int64_t dataSizeRemain = singleCoreDataNum * sizeof(T);
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataSizeRemain = (len - singleCoreDataNum * coreSegmentedIdx) * sizeof(T);
+    }
+
+    __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+    __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st;
+    __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st;
+    if (GetBlockIdx() < blockNumPerGroup) {
+        input2BuffRankMagic(
+            dataSizeRemain, inputUB[0], receiveBuff, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum,
+            ctrlFlagsUB, ctrlFlagsGM, magic);
+        return;
+    }
+    GM2GMPingPong(dataSizeRemain, inputUB, output, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum);
+
+    *ctrlFlagsUB = 0;
+    *ctrlFlagsUB1 = 0;
+    int64_t allDataSizeNeed2Add = dataSizeRemain;
+    AscendC::PipeBarrier();
+    while (true) {
+        if (*ctrlFlagsUB >= CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG)) {
+            break;
+        }
+
+        CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t));
+        AscendC::PipeBarrier();
+
+        if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) {
+            continue;
+        }
+        int64_t preparedDataGroupCount = (*ctrlFlagsUB1 - magic);
+        if (preparedDataGroupCount <= 0 || *ctrlFlagsUB >= preparedDataGroupCount) {
+            continue;
+        }
+
+        dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG;
+        if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) {
+            dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG;
+        }
+        ProcessDataNew(dataSizeRemain, inputUB, buff[rank], dataOffsetNum,
+            buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T),
+            output, buffDataDMAOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op);
+        AscendC::PipeBarrier();
+
+        *ctrlFlagsUB = preparedDataGroupCount;
+        AscendC::PipeBarrier();
+    }
+}
+
+template
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuBigDataWrite(ALLREDUCE_ARGS_FUN_16P(T))
+{
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+    magic *= 1024;
+    const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM;
+    int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx();
+    __gm__ T* buff[8] = {
+        buff0, buff1, buff2, buff3,
+        buff4, buff5, buff6, buff7
+    };
+    __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
+    __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32);
+    __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64);
+    __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)};
+
+    int64_t blockNumPerGroup = GetLcalBlockNum() >> 1;
+    int64_t corePerRank = blockNumPerGroup;
+    int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
+    int64_t x = (rank == 0 ? 1 : 0);
+    if (GetBlockIdx() >= blockNumPerGroup) {
+        flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM;
+    }
+    int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st;
+
+    DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op));
+
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T);
+    for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNum); i++) {
+        *ctrlFlagsUB = 0;
+        *ctrlFlagsUB1 = 0;
+        AscendC::PipeBarrier();
+
+        int64_t processedNum = i * ipcBuffMaxNum;
+        int64_t remainNum = (len - processedNum < ipcBuffMaxNum) ? len - processedNum : ipcBuffMaxNum;
+
+        PostSyncBigDataWriteAcrossCard(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i);
+        LcalAllReduce2npuBigDataWriteOrigin(
+            buff, input + processedNum, output + processedNum, blockNumPerGroup, rank, rankSize, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1,
+            ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx, op);
+        AscendC::PipeBarrier();
+    }
+    DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op));
+    DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op));
+}
-- 
Gitee

From 8e27065a54a23a410c303973ffa33d3b900d52fa Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 19:22:06 +0800
Subject: [PATCH 242/414] draft

---
 .../src/kernels/lcal_allreduce_2npu_big_write.cce | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce b/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce
index c2c52b53..d770dc58 100644
--- a/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce
+++ b/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce
@@ -17,23 +17,23 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduce2npuBigDataWr
     int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op)
 {
     const int64_t dataBlockAllNum = CeilDiv(len * sizeof(T), MEM_DMA_UNIT_BYTE);
-    const int64_t singleCoreDataBlockNum = dataBlockAllNum / corePerRank;
+    const int64_t singleCoreDataBlockNum = dataBlockAllNum / blockNumPerGroup;
     const int64_t singleCoreDataNum = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE / sizeof(T);
     const int64_t buffDataDMAOffsetNum = coreSegmentedIdx * singleCoreDataNum;
 
-    int64_t dataSizeRemain = singleCoreDataNum * sizeof(T);
-    if (coreSegmentedIdx == corePerRank - 1) {
+    int64_t dataSizeRemain = singleCoreDataBlockNum * MEM_DMA_UNIT_BYTE;
+    if (coreSegmentedIdx == blockNumPerGroup - 1) {
         dataSizeRemain = (len - singleCoreDataNum * coreSegmentedIdx) * sizeof(T);
     }
 
-    __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum);
+    __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum);
     __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st;
     __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + flagOffset1st;
     if (GetBlockIdx() < blockNumPerGroup) {
         input2BuffRankMagic(
             dataSizeRemain, inputUB[0], receiveBuff, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum,
-            ctrlFlagsUB, ctrlFlagsGM, magic);
-        return;
+            ctrlFlagsUB, ctrlFlagsGMX, magic);
+        return;
     }
     GM2GMPingPong(dataSizeRemain, inputUB, output, buffDataDMAOffsetNum, input, buffDataDMAOffsetNum);
-- 
Gitee

From 670bfafc21a3094a18939fd62435720342c6325d Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:08:48 +0800
Subject: [PATCH 243/414] draft

---
 .../src/tools/socket/lcal_sock_exchange.cpp | 1 +
 .../src/tools/socket/lcal_sock_exchange.h | 179 ++++++++++++++++++
 2 files changed, 180 insertions(+)
 create mode 100644 comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
 create mode 100644 comm/lcal/src/tools/socket/lcal_sock_exchange.h

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -0,0 +1 @@
+
diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.h b/comm/lcal/src/tools/socket/lcal_sock_exchange.h
new file mode 100644
index 00000000..a5732ab2
--- /dev/null
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LCCL_SOCK_EXCHANGE_H
+#define LCCL_SOCK_EXCHANGE_H
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "mki/utils/log/log.h"
+
+#include "lcal_types.h"
+#include "lcal_api.h"
+
+namespace Lcal {
+
+union LcalSocketAddress {
+    struct sockaddr sa;
+    struct sockaddr_in sin;
+    struct sockaddr_in6 sin;
+};
+
+constexpr uint64_t LCAL_MAGIC = 0xdddd0000dddd0000;
+
+struct LcalBootstrapHandle {
+    uint64_t magic;
+    union LcalSocketAddress addr;
+};
+
+union LcalBootstrap {
+    LcalBootstrapHandle handle;
+    LcalUniqueId uid;
+};
+
+int BootstrapGetUniqueId(LcalBootstrapHandle &handle, int commDomain);
+
+class LcalSockExchange {
+public:
+    LcalSockExchange(int rank, int rankSize, std::vector &rankList, int commDomain);
+    LcalSockExchange(int rank, int rankSize, LcalUniqueId);
+    ~LcalSockExchange();
+
+    template int AllGather(const T *sendBuf, size_t sendCount, T *recvBuf)
+    {
+        if (!isInit_ && Prepare() != LCAL_SUCCESS) {
+            return LCAL_ERROR_INTERNAL;
+        }
+        isInit_ = true;
+
+        if (!IsServer()) {
+            return ClientSendRecv(sendBuf, sendCount, recvBuf);
+        } else {
+            return ServerRecvSend(sendBuf, sendCount, recvBuf);
+        }
+    }
+
+    int GetNodeNum();
+
+    static bool CheckValid(LcalUniqueId lcalCommId)
+    {
+        LcalBootstrap id {};
+        id.uid = lcalCommId;
+        return id.handle.magic == LCAL_MAGIC;
+    }
+
+private:
+    void GetIpAndPort();
+    int Prepare();
+    int Listen();
+    int Accept();
+    int StartSecureTunnel();
+    void Close(int &fd) const;
+    int Connect();
+    int AcceptConnection(int fd, sockaddr_in &clientAddr, socklen_t *sinSize) const;
+    void Cleanup();
+    bool IsServer() const;
+    static bool CheckErrno(int ioErrno)
+    {
+        return ((ioErrno == EAGAIN) || (ioErrno == EWOULDBLOCK) || (ioErrno == EINTR));
+    }
+
+    template int Send(int fd, const T *sendBuf, size_t sendSize, int flag) const
+    {
+        do {
+            auto ret = send(fd, sendBuf, sendSize, flag);
+            if (ret < 0) {
+                if (CheckErrno(errno)) {
+                    MKI_LOG(ERROR) << "send failed: " << strerror(errno);
+                    continue;
+                }
+                MKI_LOG(DEBUG) << "Send failed: " << strerror(errno);
+            }
+            return ret;
+        } while (true);
+    }
+
+    template int Recv(int fd, T *sendBuf, size_t sendSize, int flag) const
+    {
+        do {
+            auto ret = recv(fd, sendBuf, sendSize, flag);
+            if (ret < 0) {
+                if (CheckErrno(errno)) {
+                    MKI_LOG(ERROR) << "recv failed: " << strerror(errno);
+                    continue;
+                }
+                MKI_LOG(DEBUG) << "recv failed: " << strerror(errno);
+            }
+            return ret;
+        } while (true);
+    }
+
+    template int ClientSendRecv(const T *sendBuf, size_t sendSize, T *recvBuf)
+    {
+        if (Send(fd_, sendBuf, sendSize * sizeof(T), 0) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " send buffer failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        if (Recv(fd_, sendBuf, sendSize * rankSize_ * sizeof(T), MSG_WAITALL) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        return LCAL_SUCCESS;
+    }
+
+    template int ServerRecvSend(const T *sendBuf, size_t sendSize, T *recvBuf)
+    {
+        auto ret = memcpy_s(recvBuf, sendSize * sizeof (T), sendBuf, sendSize * sizeof (T));
+        if (ret != EOK) {
+            MKI_LOG(ERROR) << "Failed to copy sendBuf to recvBuf.";
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        for (int i = 1; i < rankSize_; ++i) {
+            if (Recv(clientFds_[i], recvBuf + i * sendSize, sendSize * sizeof(T), MSG_WAITALL) <= 0) {
+                MKI_LOG(ERROR) << "Server side recv rank " << i << " buffer failed";
+                return LCAL_ERROR_INTERNAL;
+            }
+        }
+
+        for (int i = 1; i < rankSize_; ++i) {
+            if (Send(clientFds_[i], recvBuf, sendSize * rankSize_* sizeof(T), 0) <= 0) {
+                MKI_LOG(ERROR) << "Server side send rank " << i << " buffer failed";
+                return LCAL_ERROR_INTERNAL;
+            }
+        }
+
+        return LCAL_SUCCESS;
+    }
+
+    pid_t pid_ = 0;
+    int rank_ = 0;
+    int rankSize_ = 0;
+    int fd_ = -1;
+    std::vector clientFds_ = {};
+    bool isInit_ = false;
+    std::vector rankList_ = {};
+    int commDomain_ = -1;
+    std::string ip_ = "";
+    uint16_t port_ = 0;
+    LcalBootstrap lcalCommId_ = {};
+};
+}
+
+#endif
\ No newline at end of file
-- 
Gitee

From 2633512500c0db3b8840dbb51b985a6fc2c5b955 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:11:05 +0800
Subject: [PATCH 244/414] draft

---
 comm/lcal/src/tools/socket/lcal_sock_exchange.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.h b/comm/lcal/src/tools/socket/lcal_sock_exchange.h
index a5732ab2..90f997ac 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.h
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.h
@@ -30,7 +30,7 @@ namespace Lcal {
 union LcalSocketAddress {
     struct sockaddr sa;
     struct sockaddr_in sin;
-    struct sockaddr_in6 sin;
+    struct sockaddr_in6 sin6;
 };
 
 constexpr uint64_t LCAL_MAGIC = 0xdddd0000dddd0000;
@@ -50,7 +50,7 @@ int BootstrapGetUniqueId(LcalBootstrapHandle &handle, int commDomain);
 class LcalSockExchange {
 public:
     LcalSockExchange(int rank, int rankSize, std::vector &rankList, int commDomain);
-    LcalSockExchange(int rank, int rankSize, LcalUniqueId);
+    LcalSockExchange(int rank, int rankSize, LcalUniqueId lcalCommId);
     ~LcalSockExchange();
 
     template int AllGather(const T *sendBuf, size_t sendCount, T *recvBuf)
@@ -107,10 +107,10 @@ private:
         } while (true);
     }
 
-    template int Recv(int fd, T *sendBuf, size_t sendSize, int flag) const
+    template int Recv(int fd, T *recvBuf, size_t recvSize, int flag) const
     {
         do {
-            auto ret = recv(fd, sendBuf, sendSize, flag);
+            auto ret = recv(fd, recvBuf, recvSize, flag);
             if (ret < 0) {
                 if (CheckErrno(errno)) {
                     MKI_LOG(ERROR) << "recv failed: " << strerror(errno);
@@ -129,8 +129,8 @@ private:
             return LCAL_ERROR_INTERNAL;
         }
 
-        if (Recv(fd_, sendBuf, sendSize * rankSize_ * sizeof(T), MSG_WAITALL) <= 0) {
-            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed";
+        if (Recv(fd_, recvBuf, sendSize * rankSize_ * sizeof(T), MSG_WAITALL) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed ";
             return LCAL_ERROR_INTERNAL;
         }
 
@@ -153,7 +153,7 @@ private:
         }
 
         for (int i = 1; i < rankSize_; ++i) {
-            if (Send(clientFds_[i], recvBuf, sendSize * rankSize_* sizeof(T), 0) <= 0) {
+            if (Send(clientFds_[i], recvBuf, sendSize * rankSize_ * sizeof(T), 0) <= 0) {
                 MKI_LOG(ERROR) << "Server side send rank " << i << " buffer failed";
                 return LCAL_ERROR_INTERNAL;
             }
-- 
Gitee

From 5ebd306485751b81bc12b806ceac47a55c0c3fdf Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:38:48 +0800
Subject: [PATCH 245/414] draft

---
 .../src/tools/socket/lcal_sock_exchange.cpp | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index 8b137891..d33dc801 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -1 +1,130 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include "lcal_sock_exchange.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+using namespace std;
+namespace Lcal {
+const string LCAL_LOCAL_SOCK_IP = "127.0.0.1";
+constexpr uint16_t LCAL_DEFAULT_SOCK_PORT = 10067;
+constexpr uint32_t LCAL_MAX_BACK_LOG = 65536;
+
+int ParseIpAndPort(const char* input, string &ip, uint16_t &port)
+{
+    if (input == nullptr) {
+        return LCAL_INVALID_VALUE;
+    }
+    string inputStr(input);
+    size_t colonPos = inputStr.find(":");
+    if (colonPos == strings::npos) {
+        MKI_LOG(ERROR) << "Input string does not contain a colon separating IP and port.";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    ip = inputStr.substr(0, colonPos);
+    std::string portStr = inputStr.substr(colonPos + 1);
+
+    std::istringstream portStream(portStr);
+    portStream >> port;
+    if (portStream.fail() || portStream.bad()) {
+        MKI_LOG(ERROR) << "Invalid port number.";
+        return LCAL_ERROR_INTERNAL;
+    }
+    return LCAL_SUCCESS;
+}
+
+LcalSockExchange::~LcalSockExchange()
+{
+    Cleanup();
+}
+
+LcalSockExchange::LcalSockExchange(int rank, int rankSize, std::vector &rankList, int commDomain)
+    : rank_(rank), rankSize_(rankSize), rankList_(rankList), commDomain_(commDomain)
+{
+}
+
+LcalSockExchange::LcalSockExchange(int rank, int rankSize, LcalUniqueId lcalCommId)
+    : rank_(rank), rankSize_(rankSize)
+{
+    lcalCommId_.uid = lcalCommId;
+}
+
+int LcalSockExchange::GetNodeNum()
+{
+    if (!isInit_ && Prepare() != LCAL_SUCCESS) {
+        return LCAL_ERROR_INTERNAL;
+    }
+    isInit_ = true;
+    const string filePath = "/proc/sys/kernel/rankdom/boot_id";
+    ifstream fileStream(filePath);
+    stringstream buffer;
+    if (fileStream) {
+        buffer << fileStream.rdbuf();
+        fileStream.close();
+    }
+    const std::string uuid = buffer.str();
+    MKI_LOG(DEBUG) << "rank:" << rank_ << " UUID " << uuid;
+
+    set uuidSet {};
+    uuidSet.insert(uuid);
+    int nodeNum = -1;
+    if (IsServer()) {
+        for (int i = 1; i < rankSize_; ++i) {
+            if (Recv(clientFds_[i], const_cast<__caddr_t>(uuid.data()), uuid.size(), 0) <= 0) {
+                MKI_LOG(ERROR) << "Server side recv rank " << i << " buffer failed";
+                return LCAL_ERROR_INTERNAL;
+            }
+            uuidSet.insert(uuid);
+        }
+        nodeNum = static_cast(uuidSet.size());
+        for (int i = 1; i < rankSize_; ++i) {
+            if (Send(clientFds_[i], &nodeNum, sizeof(int), 0) <= 0) {
+                MKI_LOG(ERROR) << "Server side send rank " << i << " buffer failed";
+                return LCAL_ERROR_INTERNAL;
+            }
+        }
+    } else {
+        if (Send(fd_, uuid.data(), uuid.size(), 0) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " send buffer failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+        if (Send(fd_, &nodeNum, sizeof(int), 0) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+    }
+    return nodeNum;
+}
+
+void LcalSockExchange::GetIpAndPort()
+{
+
+}
+
+}
\ No newline at end of file
-- 
Gitee

From 61b6f132cc20a0e7fd232a74763ed46b6f4279fd Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:40:14 +0800
Subject: [PATCH 246/414] draft

---
 comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index d33dc801..eec8d62e 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -33,7 +33,7 @@ using namespace std;
 namespace Lcal {
 const string LCAL_LOCAL_SOCK_IP = "127.0.0.1";
 constexpr uint16_t LCAL_DEFAULT_SOCK_PORT = 10067;
-constexpr uint32_t LCAL_MAX_BACK_LOG = 65536;
+constexpr uint32_t LCAL_MAX_BACK_LOG = 65535;
 
 int ParseIpAndPort(const char* input, string &ip, uint16_t &port)
 {
@@ -41,8 +41,8 @@ int ParseIpAndPort(const char* input, string &ip, uint16_t &port)
         return LCAL_INVALID_VALUE;
     }
     string inputStr(input);
-    size_t colonPos = inputStr.find(":");
-    if (colonPos == strings::npos) {
+    size_t colonPos = inputStr.find(':');
+    if (colonPos == string::npos) {
         MKI_LOG(ERROR) << "Input string does not contain a colon separating IP and port.";
         return LCAL_ERROR_INTERNAL;
     }
@@ -81,7 +81,7 @@ int LcalSockExchange::GetNodeNum()
         return LCAL_ERROR_INTERNAL;
     }
     isInit_ = true;
-    const string filePath = "/proc/sys/kernel/rankdom/boot_id";
+    const string filePath = "/proc/sys/kernel/random/boot_id";
     ifstream fileStream(filePath);
     stringstream buffer;
     if (fileStream) {
@@ -114,8 +114,8 @@ int LcalSockExchange::GetNodeNum()
             MKI_LOG(ERROR) << "Client side " << rank_ << " send buffer failed";
             return LCAL_ERROR_INTERNAL;
         }
-        if (Send(fd_, &nodeNum, sizeof(int), 0) <= 0) {
-            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed";
+        if (Recv(fd_, &nodeNum, sizeof(int), 0) <= 0) {
+            MKI_LOG(ERROR) << "Client side " << rank_ << " recv buffer failed ";
             return LCAL_ERROR_INTERNAL;
         }
 
@@ -124,7 +124,7 @@ int LcalSockExchange::GetNodeNum()
 
 void LcalSockExchange::GetIpAndPort()
 {
-    
+
 }
 
 }
\ No newline at end of file
-- 
Gitee

From 034b1fd2633a7ab3aea2c77291297b40ed8a9d98 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:48:46 +0800
Subject: [PATCH 247/414] draft

---
 comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index eec8d62e..47a88289 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -124,7 +124,22 @@ int LcalSockExchange::GetNodeNum()
 
 void LcalSockExchange::GetIpAndPort()
 {
+    const char* env = Mki::GetEnv("LCAL_COMM_ID");
+    if (env == nullptr or ParseIpAndPort(env, ip_, port_) != LCAL_SUCCESS) {
+        ip_ = LCAL_LOCAL_SOCK_IP;
+        port_ = LCAL_DEFAULT_SOCK_PORT;
+    }
+    port_ += commDomain_;
+    lcalCommId_.handle.addr.sin.sin_family = AF_INET;
+    lcalCommId_.handle.addr.sin.sin_addr.s_addr = inet_addr(LCAL_LOCAL_SOCK_IP.c_str());
+    lcalCommId_.handle.addr.sin.sin_port = htons(port_);
+    MKI_LOG(DEBUG) << "curRank: " << rank_ << " commDomain: " << commDomain_ << " ip: " << ip_ << " port: " << prot_;
 }
 
+int LcalSockExchange::Prepare()
+{
+    if (lcalCommId_.handle.magic != LCAL_MAGIC) {
+        GetIpAndPort();
+    }
 }
\ No newline at end of file
-- 
Gitee

From 9ab818219786d8162b64165f56840891bba7e173 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Fri, 22 Aug 2025 20:49:22 +0800
Subject: [PATCH 248/414] draft

---
 comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index 47a88289..db9bf8c3 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -134,7 +134,7 @@ void LcalSockExchange::GetIpAndPort()
     lcalCommId_.handle.addr.sin.sin_family = AF_INET;
     lcalCommId_.handle.addr.sin.sin_addr.s_addr = inet_addr(LCAL_LOCAL_SOCK_IP.c_str());
     lcalCommId_.handle.addr.sin.sin_port = htons(port_);
-    MKI_LOG(DEBUG) << "curRank: " << rank_ << " commDomain: " << commDomain_ << " ip: " << ip_ << " port: " << prot_;
+    MKI_LOG(DEBUG) << "curRank: " << rank_ << " commDomain: " << commDomain_ << " ip: " << ip_ << " port: " << port_;
 }
 
 int LcalSockExchange::Prepare()
-- 
Gitee

From 4e4659b346f7e9e508d0d856f9f80182f542825a Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Mon, 25 Aug 2025 10:24:51 +0800
Subject: [PATCH 249/414] draft

---
 .../src/tools/socket/lcal_sock_exchange.cpp | 274 ++++++++++++++++++
 1 file changed, 274 insertions(+)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index db9bf8c3..1d8846ca 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -142,4 +142,278 @@ int LcalSockExchange::Prepare()
     if (lcalCommId_.handle.magic != LCAL_MAGIC) {
         GetIpAndPort();
     }
+    if (!IsServer()) {
+        if (ip_ != LCAL_LOCAL_SOCK_IP) {
+            MKI_LOG(ERROR) << "Multi-machine is not supported at the moment";
+            return LCAL_ERROR_INTERNAL;
+        }
+        return Connect();
+    }
+
+    clientFds_.resize(rankSize_, -1);
+    if (Listen() != LCAL_SUCCESS) {
+        MKI_LOG(ERROR) << "Listen Failed!";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    if (Accept() != LCAL_SUCCESS) {
+        MKI_LOG(ERROR) << "Accept Failed!";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    return LCAL_SUCCESS;
+}
+
+int LcalSockExchange::Listen()
+{
+    fd_ = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd_ < 0) {
+        MKI_LOG(ERROR) << "Server side create socket failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    int reuse = 1;
+    if (setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(int)) < 0) {
+        MKI_LOG(ERROR) << "Server side set reuseaddr failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    struct sockaddr *addrPtr = &lcalCommId_.handle.addr.sa;
+    if (bind(fd_, addrPtr, sizeof(struct sockaddr)) < 0) {
+        MKI_LOG(ERROR) << "Server side bind" << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    if (listen(fd_, LCAL_MAX_BACK_LOG) < 0) {
+        MKI_LOG(ERROR) << "Server side listen" << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+    MKI_LOG(ERROR) << "The server is listening! ip: " << inet_ntoa(lcalCommId_.handle.addr.sin.sin_addr)
+        << " port: " << ntohs(lcalCommId_.handle.addr.sin.sin_port);
+
+    return LCAL_ERROR_INTERNAL;
+}
+
+int LcalSockExchange::AcceptConnection(int fd, sockaddr_in& clientAddr, socklen_t *sinSize) const
+{
+    int clientFd;
+    LcalSocketAddress clientAddrPtr;
+    clientAddrPtr.sin = clientAddr;
+
+    do {
+        clientFd = accept(fd, &clientAddrPtr.sa, sinSize);
+        if (clientFd < 0) {
+            if (!CheckErrno(errno)) {
+                MKI_LOG(ERROR) << "Server side accept failed " << strerror(errno);
+                return -1;
+            }
+            MKI_LOG(DEBUG) << "accept failed " << strerror(errno);
+            continue;
+        }
+        break;
+    } while (true);
+
+    return clientFd;
+}
+
+int LcalSockExchange::Accept()
+{
+    struct sockaddr_in clientAddr;
+    socklen_t sinSize = sizeof(struct sockaddr_in);
+
+    for (int i = 1; i < rankSize_; ++i) {
+        int fd = AcceptConnection(fd, clientAddr, &sinSize);
+        if (fd < 0) {
+            MKI_LOG(ERROR) << "AcceptConnection failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        int rank = 0;
+        if (Recv(fd, &rank, sizeof(rank), 0) <= 0) {
+            MKI_LOG(ERROR) << "Server side recv rank id failed";
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        if (rank > rankSize_ || rank <= 0 || clientFds_[rank] >= 0) {
+            MKI_LOG(ERROR) << "Server side recv invalid rank id " << rank;
+            return LCAL_ERROR_INTERNAL;
+        }
+
+        MKI_LOG(DEBUG) << "Server side recv rank id " << rank;
+        clientFds_[rank] = fd;
+    }
+
+    return LCAL_SUCCESS;
+}
+
+int LcalSockExchange::Close(int &fd) const
+{
+    if (fd == -1) {
+        return;
+    }
+
+    if (close(fd) < 0) {
+        MKI_LOG(WARN) << "failed to close fd:" << fd;
+        return;
+    }
+
+    fd = -1;
+}
+
+int LcalSockExchange::Connect()
+{
+    MKI_LOG(DEBUG) << "Client side " << rank_ << " begin to connect";
+
+    fd_ = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd_ < 0) {
+        MKI_LOG(ERROR) << "Client side " << rank_ << " create socket failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    int sleepTimeS = 1;
+    int maxRetryCount = 180;
+    int retryCount = 0;
+    bool success = false;
+    struct sockaddr *addrPtr = &lcalCommId_.handle.addr.sa;
+    while (retryCount < maxRetryCount) {
+        if (connect(fd_, addrPtr, sizeof(struct sockaddr)) < 0) {
+            if (error == ECONNREFUSED) {
+                MKI_LOG(DEBUG) << "Client side " << rank_ << " try connect " << (retryCount + 1) << " times refused";
+                retryCount++;
+                sleep(sleepTimeS);
+                continue;
+            }
+            if (errno != EINTR) {
+                MKI_LOG(ERROR) << "Client side " << rank_ << " connect failed: " << strerror(errno);
+                break;
+            }
+            MKI_LOG(DEBUG) << "Client side " << rank_ << " try connect failed: " << strerror(errno);
+            continue;
+        }
+        success = true;
+        break;
+    }
+
+    if (!success) {
+        MKI_LOG(ERROR) << "Client side " << rank_ << " connect failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    if (Send(fd_, &rank_, sizeof(rank_), 0) <= 0) {
+        MKI_LOG(ERROR) << "Client side " << rank_ << " send rank failed";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    return LCAL_SUCCESS;
+}
+
+int LcalSockExchange::IsServer() const
+{
+    return rank_ == 0;
+}
+
+int LcalSockExchange::Cleanup()
+{
+    if (fd_ >= 0) {
+        close(fd_);
+    }
+
+    if (clientFds_.empty()) {
+        return;
+    }
+
+    for (int i = 1; i < rankSize_; ++i) {
+        if (clientFds_[i] >= 0) {
+            Close(clientFds_[i]);
+        }
+    }
+    if (pid_ > 0) {
+        kill(pid_, SIGINT);
+        int status;
+        waitpid(pid_, &status, 0);
+        MKI_LOG(DEBUG) << "child process resources cleaned up";
+    }
+}
+
+int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair)
+{
+    std::string ip;
+    uint16_t port;
+    int ret = ParseIpAndPort(ipPortPair, ip, port);
+    if (ret != LCAL_SUCCESS) {
+        MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!";
+        return LCAL_ERROR_INTERNAL
+    }
+    ua->sin.sin_family = AF_INET;
+    ua->sin.sin_addr.s_addr = inet_addr(ip.c_str());
+    ua->sin.sin_port = htons(port);
+    return LCAL_SUCCESS;
+}
+
+int BootstrapGetServerIp(LcalSocketAddress& handle)
+{
+    char hostname[256];
+
+    if (gethostname(hostname, sizeof(hostname)) < 0) {
+        MKI_LOG(ERROR) << "ERROR: Failed to get hostname.";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    struct hostent *hostEntry = gethostbyname(hostname);
+    if (hostEntry == nullptr) {
+        MKI_LOG(ERROR) << "ERROR: Failed to get host entry.";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    const char* ip = inet_ntoa(*reinterpret_cast(hostEntry->h_addr_list[0]));
+    if (ip == nullptr) {
+        MKI_LOG(ERROR) << "ERROR: Failed to convert IP address.";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    auto ret = memset_s(&handle, sizeof(handle), 0, sizeof(handle));
+    if (ret != EOK) {
+        MKI_LOG(ERROR) << "Failed to memset_s handle in BootstrapGetServerIp";
+        return LCAL_ERROR_INTERNAL;
+    }
+    handle.sin.sin_family = AF_INET;
+    handle.sin.sin_addr.s_addr = inet_addr(ip);
+    handle.sin.sin_port = 0;
+
+    return LCAL_SUCCESS;
+}
+
+int BootstrapGetUniqueId(struct LcalBootstrapHandle& handle, int commDomain)
+{
+    auto ret = memset_s(&handle, sizeof(LcalBootstrapHandle), 0, sizeof(LcalBootstrapHandle));
+    if (ret != EOK) {
+        MKI_LOG(ERROR) << "Failed to memset_s handle in BootstrapGetUniqueId.";
+        return LCAL_ERROR_INTERNAL;
+    }
+
+    const char* env = Mki::GetEnv("LCAL_COMM_ID");
+    if (env) {
+        MKI_LOG(INFO) << "LCAL_COMM_ID set by environment to " << env;
+        if (GetAddrFromString(&handle.addr, env) != LCAL_SUCCESS) {
+            MKI_LOG(WARN) << ("Invalid LCAL_COMM_ID, please use format: :");
+            return LCAL_ERROR_INTERNAL;
+        }
+    } else {
+        int bootRet = BootstrapGetServerIp(handle.addr);
+        if (bootRet != LCAL_SUCCESS) {
+            MKI_LOG(ERROR) << "lcal BootstrapGetServerIp failed!";
+            return LCAL_ERROR_INTERNAL;
+        }
+    }
+    int dev;
+    int aclRet = aclrtGetDevice(&dev);
+    if (aclRet != ACL_SUCCESS) {
+        MKI_LOG(ERROR) << "ERROR: GetDevice.";
+        return LCAL_ERROR_INTERNAL;
+    }
+    handle.addr.sin.sin_port = htons(LCAL_DEFAULT_SOCK_PORT + dev + commDomain);
+    handle.magic = LCAL_MAGIC;
+
+    return LCAL_SUCCESS;
+}
 }
\ No newline at end of file
-- 
Gitee

From b8288c6115a86e45c9a1d7038aae6c7313ec7102 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Mon, 25 Aug 2025 10:32:17 +0800
Subject: [PATCH 250/414] draft

---
 .../src/tools/socket/lcal_sock_exchange.cpp | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index 1d8846ca..758a0bbe 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -180,18 +180,18 @@ int LcalSockExchange::Listen()
 
     struct sockaddr *addrPtr = &lcalCommId_.handle.addr.sa;
     if (bind(fd_, addrPtr, sizeof(struct sockaddr)) < 0) {
-        MKI_LOG(ERROR) << "Server side bind" << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
+        MKI_LOG(ERROR) << "Server side bind " << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
         return LCAL_ERROR_INTERNAL;
     }
 
     if (listen(fd_, LCAL_MAX_BACK_LOG) < 0) {
-        MKI_LOG(ERROR) << "Server side listen" << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
+        MKI_LOG(ERROR) << "Server side listen " << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
         return LCAL_ERROR_INTERNAL;
     }
-    MKI_LOG(ERROR) << "The server is listening! ip: " << inet_ntoa(lcalCommId_.handle.addr.sin.sin_addr)
-        << " port: " << ntohs(lcalCommId_.handle.addr.sin.sin_port);
+    MKI_LOG(INFO) << "The server is listening! ip: " << inet_ntoa(lcalCommId_.handle.addr.sin.sin_addr)
+        << " port: " << ntohs(lcalCommId_.handle.addr.sin.sin_port);
 
-    return LCAL_ERROR_INTERNAL;
+    return LCAL_SUCCESS;
 }
 
@@ -204,10 +204,10 @@ int LcalSockExchange::AcceptConnection(int fd, sockaddr_in& clientAddr, socklen_
         clientFd = accept(fd, &clientAddrPtr.sa, sinSize);
         if (clientFd < 0) {
             if (!CheckErrno(errno)) {
-                MKI_LOG(ERROR) << "Server side accept failed " << strerror(errno);
+                MKI_LOG(ERROR) << "Server side accept failed" << strerror(errno);
                 return -1;
             }
-            MKI_LOG(DEBUG) << "accept failed " << strerror(errno);
+            MKI_LOG(DEBUG) << "accept failed: " << strerror(errno);
             continue;
         }
         break;
@@ -222,7 +222,7 @@ int LcalSockExchange::Accept()
     socklen_t sinSize = sizeof(struct sockaddr_in);
 
     for (int i = 1; i < rankSize_; ++i) {
-        int fd = AcceptConnection(fd, clientAddr, &sinSize);
+        int fd = AcceptConnection(fd_, clientAddr, &sinSize);
         if (fd < 0) {
             MKI_LOG(ERROR) << "AcceptConnection failed";
             return LCAL_ERROR_INTERNAL;
@@ -234,7 +234,7 @@ int LcalSockExchange::Accept()
             return LCAL_ERROR_INTERNAL;
         }
 
-        if (rank > rankSize_ || rank <= 0 || clientFds_[rank] >= 0) {
+        if (rank >= rankSize_ || rank <= 0 || clientFds_[rank] >= 0) {
             MKI_LOG(ERROR) << "Server side recv invalid rank id " << rank;
             return LCAL_ERROR_INTERNAL;
         }
@@ -246,7 +246,7 @@ int LcalSockExchange::Accept()
     return LCAL_SUCCESS;
 }
 
-int LcalSockExchange::Close(int &fd) const
+void LcalSockExchange::Close(int &fd) const
 {
     if (fd == -1) {
         return;
@@ -277,7 +277,7 @@ int LcalSockExchange::Connect()
     struct sockaddr *addrPtr = &lcalCommId_.handle.addr.sa;
     while (retryCount < maxRetryCount) {
         if (connect(fd_, addrPtr, sizeof(struct sockaddr)) < 0) {
-            if (error == ECONNREFUSED) {
+            if (errno == ECONNREFUSED) {
                 MKI_LOG(DEBUG) << "Client side " << rank_ << " try connect " << (retryCount + 1) << " times refused";
                 retryCount++;
                 sleep(sleepTimeS);
@@ -307,15 +307,15 @@ int LcalSockExchange::Connect()
     return LCAL_SUCCESS;
 }
 
-int LcalSockExchange::IsServer() const
+bool LcalSockExchange::IsServer() const
 {
     return rank_ == 0;
 }
 
-int LcalSockExchange::Cleanup()
+void LcalSockExchange::Cleanup()
 {
     if (fd_ >= 0) {
-        close(fd_);
+        Close(fd_);
     }
 
     if (clientFds_.empty()) {
@@ -342,7 +342,7 @@ int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair)
     int ret = ParseIpAndPort(ipPortPair, ip, port);
     if (ret != LCAL_SUCCESS) {
         MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!";
-        return LCAL_ERROR_INTERNAL
+        return LCAL_ERROR_INTERNAL;
     }
     ua->sin.sin_family = AF_INET;
     ua->sin.sin_addr.s_addr = inet_addr(ip.c_str());
@@ -361,7 +361,7 @@ int BootstrapGetServerIp(LcalSocketAddress& handle)
 
     struct hostent *hostEntry = gethostbyname(hostname);
     if (hostEntry == nullptr) {
-        MKI_LOG(ERROR) << "ERROR: Failed to get host entry.";
+        MKI_LOG(ERROR) << "ERROR: Failed to get host entry.";
         return LCAL_ERROR_INTERNAL;
     }
 
@@ -373,7 +373,7 @@ int BootstrapGetServerIp(LcalSocketAddress& handle)
 
     auto ret = memset_s(&handle, sizeof(handle), 0, sizeof(handle));
     if (ret != EOK) {
-        MKI_LOG(ERROR) << "Failed to memset_s handle in BootstrapGetServerIp";
+        MKI_LOG(ERROR) << "Failed to memset_s handle in BootstrapGetServerIp.";
         return LCAL_ERROR_INTERNAL;
     }
     handle.sin.sin_family = AF_INET;
@@ -396,12 +396,12 @@ int BootstrapGetUniqueId(struct LcalBootstrapHandle& handle, int commDomain)
         MKI_LOG(INFO) << "LCAL_COMM_ID set by environment to " << env;
         if (GetAddrFromString(&handle.addr, env) != LCAL_SUCCESS) {
             MKI_LOG(WARN) << ("Invalid LCAL_COMM_ID, please use format: :");
-            return LCAL_ERROR_INTERNAL;
+            return LCAL_INVALID_VALUE;
         }
     } else {
         int bootRet = BootstrapGetServerIp(handle.addr);
         if (bootRet != LCAL_SUCCESS) {
-            MKI_LOG(ERROR) << "lcal BootstrapGetServerIp failed!";
+            MKI_LOG(ERROR) << "lcal BootstrapGetIpPort failed!";
             return LCAL_ERROR_INTERNAL;
         }
     }
-- 
Gitee

From 8113ec765be22511dda99f658266597bd810168a Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Mon, 25 Aug 2025 10:32:58 +0800
Subject: [PATCH 251/414] draft

---
 comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
index 758a0bbe..ba586718 100644
--- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
+++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp
@@ -188,7 +188,7 @@ int LcalSockExchange::Listen()
         MKI_LOG(ERROR) << "Server side listen " << ntohs(lcalCommId_.handle.addr.sin.sin_port) << " failed";
         return LCAL_ERROR_INTERNAL;
     }
-    MKI_LOG(INFO) << "The server is listening! ip: " << inet_ntoa(lcalCommId_.handle.addr.sin.sin_addr)
+    MKI_LOG(INFO) << "The server is listening! ip: "<< inet_ntoa(lcalCommId_.handle.addr.sin.sin_addr)
         << " port: " << ntohs(lcalCommId_.handle.addr.sin.sin_port);
 
     return LCAL_SUCCESS;
-- 
Gitee

From 55690f0ffcff9b5d0d8f4174fa2a82182792a1a4 Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Mon, 25 Aug 2025 12:00:44 +0800
Subject: [PATCH 252/414] draft

---
 .../kernels/lcal_allreduce_deterministic.cce | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 comm/lcal/src/kernels/lcal_allreduce_deterministic.cce

diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce
new file mode 100644
index 00000000..61aedbc1
--- /dev/null
+++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void Utils(__ubuf__ T * ub, __gm__ T * gm, T value) +{ + AscendC::PipeBarrier(); + *ub = value; + AscendC::PipeBarrier(); + CpUB2GM(gm, ub, sizeof(T)); + AscendC::PipeBarrier(); +} + +template +__attribute__((always_inline)) inline __aicore__ void SumByPairs( + __ubuf__ int64_t *ctrlFlagsUB, __gm__ T* buff[8], int64_t x, int64_t blockNumPerGroup, int64_t corePerRank, + int64_t coreSegmentedIdx, int64_t magic, int64_t deterministicOffNum, int64_t thisNPUProcessDataNum, + int64_t thisNPUCoreGroupAvgDMADataNum, int64_t dataOffsetNum, int64_t dataSizeRemain, __ubuf__ T *inputUB[2], + int op, int rank, int rankSize) { + int64_t target = 0; + __gm__ int64_t *ctrlFlagsGM; + __gm__ int64_t *ctrlFlagsGMTemp; + __gm__ int64_t *ctrlFlagsGMTemp1; + int64_t buffOffsetNum; + + if (x == 0) { + return; + } + + int64_t multiple = GetDeterministicRankOffset(x); + if ((x & 1) == 1) { + target = x - multiple; + ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + target * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic); + + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + int64_t outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, + processOutput, outputOffsetNum, op); + ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic + multiple); + } else { + target = x - multiple; + ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (target + multiple / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + multiple / 2); + + int64_t multipleTmp = multiple; + while (x + multipleTmp / 2 >= rankSize) { + multipleTmp /= 2; + } + if (multipleTmp > 1) { + ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (x + multipleTmp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + multipleTmp / 2); + } + + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + int64_t outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, + processOutput, outputOffsetNum, op); + ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic + multiple); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + magic <<= 10; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + constexpr int32_t maxBuffSize = 16; + __gm__ T* 
buff[maxBuffSize] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; + + int64_t blockNumPerGroup = GetLcalBlockNum >> 1; + int64_t corePerRank = blockNumPerGroup / rankSize; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + + const int64_t x = GetBlockIdx() / corePerRank; + if (GetBlockIdx() >= blockNumPerGroup) { + x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; + flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; + } + + const int64_t singleNPUProcessDataNum = len / rankSize; + int64_t thisNPUProcessDataNum = singleNPUProcessDataNum; + if (rank == rankSize - 1) { + thisNPUProcessDataNum = len - rank * singleNPUProcessDataNum; + } + + int64_t xNPUProcessDataNum = singleNPUProcessDataNum; + if (x == rankSize - 1) { + xNPUProcessDataNum = len - x * singleNPUProcessDataNum; + } + + const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank; + const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank; + + int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + int64_t buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t deterministicOffNum = len; + + if (GetBlockIdx() < blockNumPerGroup) { + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum); + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + } else { + buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + dataSizeRemain = thisNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (thisNPUProcessDataNum - coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + int64_t revBuffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, magic); + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, revBuffOffsetNum); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; + + if (rankSize >= 4) { + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + SumByPairs(ctrlFlagsUB, buff, x, blockNumPerGroup, corePerRank, coreSegmentedIdx, magic, deterministicOffNum, thisNPUProcessDataNum, + thisNPUCoreGroupAvgDMADataNum, dataOffsetNum, dataSizeRemain, inputUB, op, rank, rankSize); + } else { + SetFlag(ctrlFlagsUB, ctrlFlagsGM, ((x == 0) ? 
(magic + 1) : magic)); + if (x != 0) { + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[x] + (blockNumPerGroup + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + 1); + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + int64_t outputOffsetNum = deterministicOffNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, + processOutput, outputOffsetNum, op); + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic + 1); + } + } + } + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + IPC_BUFF_MAX_SIZE / sizeof(T)) + MEM_DMA_UNIT_INT_NUM, magic); + + if (GetBlockIdx() > blockNumPerGroup) { + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + return; + } + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic); + + __gm__ int64_t* ctrlFlagsGMX = ((__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic); + + buffOffsetNum = coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + int64_t revBuffOffsetNum = x * singleNPUProcessDataNum + buffOffsetNum; + int64_t sendBuffOffsetNum = deterministicOffNum + buffOffsetNum; + GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); +} \ No newline at end of file -- Gitee From 6de4b911b418c3472230eb26aa2dfd8e875f2dc7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 12:04:33 +0800 Subject: [PATCH 253/414] draft --- .../kernels/lcal_allreduce_deterministic.cce | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce index 61aedbc1..32d947f9 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce @@ -42,7 +42,7 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairs( CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic); buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; - __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); int64_t outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, processOutput, outputOffsetNum, op); @@ -53,17 +53,17 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairs( 
ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (target + multiple / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + multiple / 2); - int64_t multipleTmp = multiple; - while (x + multipleTmp / 2 >= rankSize) { - multipleTmp /= 2; + int64_t multipleTemp = multiple; + while (x + multipleTemp / 2 >= rankSize) { + multipleTemp /= 2; } - if (multipleTmp > 1) { - ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (x + multipleTmp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; - CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + multipleTmp / 2); + if (multipleTemp > 1) { + ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (x + multipleTemp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp1, magic + multipleTemp / 2); } buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; - __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); int64_t outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, processOutput, outputOffsetNum, op); @@ -79,7 +79,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); magic <<= 10; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; - const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); constexpr int32_t maxBuffSize = 16; __gm__ T* buff[maxBuffSize] = { buff0, buff1, buff2, buff3, @@ -92,11 +92,11 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; - int64_t blockNumPerGroup = GetLcalBlockNum >> 1; + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; int64_t corePerRank = blockNumPerGroup / rankSize; int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; - const int64_t x = GetBlockIdx() / corePerRank; + int64_t x = GetBlockIdx() / corePerRank; if (GetBlockIdx() >= blockNumPerGroup) { x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; @@ -141,13 +141,13 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; - __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); int64_t revBuffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, magic); - GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, revBuffOffsetNum); + GM2GM(dataSizeRemain, 
inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, buffOffsetNum); - __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; if (rankSize >= 4) { SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); @@ -156,12 +156,12 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic } else { SetFlag(ctrlFlagsUB, ctrlFlagsGM, ((x == 0) ? (magic + 1) : magic)); if (x != 0) { - __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[x] + (blockNumPerGroup + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + 1); buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; - __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); int64_t outputOffsetNum = deterministicOffNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; - ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, + ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, processOutput, outputOffsetNum, op); SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic + 1); } @@ -169,7 +169,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic } SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + IPC_BUFF_MAX_SIZE / sizeof(T)) + MEM_DMA_UNIT_INT_NUM, magic); - if (GetBlockIdx() > blockNumPerGroup) { + if (GetBlockIdx() >= blockNumPerGroup) { DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); return; @@ -178,7 +178,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic); - __gm__ int64_t* ctrlFlagsGMX = ((__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); + __gm__ int64_t* ctrlFlagsGMX= ((__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic); buffOffsetNum = coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; -- Gitee From cfc836b1fc87beccc5453dc8c9eb1f54281de7df Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 12:06:23 +0800 Subject: [PATCH 254/414] draft --- comm/lcal/src/kernels/lcal_allreduce_deterministic.cce | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce index 32d947f9..4b61c7f7 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce @@ -101,6 +101,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; } + int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st; 
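+    // Annotation (derived from the arithmetic above, not a comment in the
+    // original source): the second core group's flags live one
+    // GetLcalBlockNum() stride of MEM_DMA_UNIT_INT_NUM int64 slots past the
+    // first group's, so flagOffset1st/flagOffset2nd index the same core in
+    // the two flag regions.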
const int64_t singleNPUProcessDataNum = len / rankSize; int64_t thisNPUProcessDataNum = singleNPUProcessDataNum; @@ -142,7 +143,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); - __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t *)buff[x] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); int64_t revBuffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, magic); GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, sendBuff, buffOffsetNum); @@ -156,7 +157,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic } else { SetFlag(ctrlFlagsUB, ctrlFlagsGM, ((x == 0) ? (magic + 1) : magic)); if (x != 0) { - __gm__ int64_t* ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp, magic + 1); buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); -- Gitee From 690b72a1eafbbed5a04267bf2a465588aa88bad1 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 12:06:46 +0800 Subject: [PATCH 255/414] draft --- comm/lcal/src/kernels/lcal_allreduce_deterministic.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce index 4b61c7f7..595f1920 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce @@ -61,7 +61,7 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairs( ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + blockNumPerGroup + (x + multipleTemp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; CheckFlag(ctrlFlagsUB, ctrlFlagsGMTemp1, magic + multipleTemp / 2); } - + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); int64_t outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; -- Gitee From 44ae457160aeaa695c7c8dca00c2c80c5977e2d6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 14:56:45 +0800 Subject: [PATCH 256/414] draft --- .../lcal_allreduce_deterministic_big_data.cce | 326 ++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce new file mode 100644 index 00000000..33a9c5ad --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void SumByPairsBigData( + __ubuf__ int64_t *ctrlFlagsUB, __ubuf__ int64_t *ctrlFlagsUB1, __ubuf__ int64_t *ctrlFlagsUB2, __gm__ T* buff[8], int64_t x, + int64_t blockNumPerGroup, int64_t corePerRank, int64_t coreSegmentedIdx, int64_t magic, int64_t deterministicOffNum, + int64_t thisNPUProcessDataNum, int64_t thisNPUCoreGroupAvgDMADataNum, int64_t dataOffsetNum, int64_t dataSizeRemain, __ubuf__ T *inputUB[2], + int op, int rank, int rankSize, int64_t allTimes, int64_t allDataSizeNeed2Add) { + int64_t target = 0; + __gm__ int64_t *ctrlFlagsGM; + __gm__ int64_t *ctrlFlagsGMTemp; + __gm__ int64_t *ctrlFlagsGMTemp1; + int64_t buffOffsetNum; + __gm__ T *processOutput; + int64_t outputOffsetNum; + + if (x == 0) { + return; + } + + int64_t multiple = GetDeterministicRankOffset(x); + if (x % 2 == 1) { + target = x - multiple; + ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + target * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); + outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + } else { + target = x - multiple; + ctrlFlagsGMTemp = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup * 2 + (target + multiple / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + + int64_t multipleTemp = multiple; + while (x + multipleTemp / 2 >= rankSize) { + multipleTemp /= 2; + } + if (multipleTemp > 0) { + if ((x + multipleTemp / 2) != x) { + ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + + (blockNumPerGroup * 2 + (x + multipleTemp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + } else { + ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + + (blockNumPerGroup * 2 + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + } + + } + + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); + outputOffsetNum = deterministicOffNum + target * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + } + AscendC::PipeBarrier(); + + while (true) { + if (*ctrlFlagsUB >= allTimes) { + break; + } + + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMTemp, sizeof(int64_t)); + CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMTemp1, sizeof(int64_t)); + AscendC::PipeBarrier(); + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10)) { + continue; + } + + *ctrlFlagsUB1 = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ? 
*ctrlFlagsUB1 : *ctrlFlagsUB2; + AscendC::PipeBarrier(); + + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (*ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + ProcessDataNew(dataSizeRemain, inputUB, buff[rank], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), + processOutput, outputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); + AscendC::PipeBarrier(); + *ctrlFlagsUB = preparedDataGroupCount; + CpUB2GM((__gm__ int64_t*) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + AscendC::PipeBarrier(); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministicBigDataOrigin( + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize, + uint64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, + int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op) +{ + const int64_t singleNPUProcessDataNum = len / rankSize; + int64_t thisNPUProcessDataNum = singleNPUProcessDataNum; + if (rank == rankSize - 1) { + thisNPUProcessDataNum = len - rank * singleNPUProcessDataNum; + } + + int64_t xNPUProcessDataNum = singleNPUProcessDataNum; + if (x == rankSize - 1) { + xNPUProcessDataNum = len - x * singleNPUProcessDataNum; + } + + const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank; + const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank; + + int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + int64_t buffOffsetNum = x * singleNPUProcessDataNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t deterministicOffNum = len; + + if (GetBlockIdx() < blockNumPerGroup) { + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, input, buffOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); + return; + } + + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + int64_t allDataSizeNeed2Add = thisNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + allDataSizeNeed2Add = (thisNPUProcessDataNum - coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + int64_t allTimes = CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG); + + if (GetBlockIdx() < blockNumPerGroup * 2) { + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM; + + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); + __gm__ T *sendBuff = (__gm__ T *)((__gm__ int64_t *)buff[x] + dataOffsetNum); + AscendC::PipeBarrier(); + while (true) { + if (*ctrlFlagsUB >= allTimes) { + break; + } + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, 
sizeof(int64_t)); + AscendC::PipeBarrier(); + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) { + continue; + } + + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (*ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + int64_t revBuffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + + GM2GMPingPong(dataSizeRemain, inputUB, receiveBuff, revBuffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T)); + AscendC::PipeBarrier(); + *ctrlFlagsUB = preparedDataGroupCount; + if (x == 0) { + CpUB2GM((__gm__ int64_t*) buff[rank] + (GetBlockIdx() + blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + } + CpUB2GM(ctrlFlagsGM, ctrlFlagsUB1, sizeof(int64_t)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + } + } + + if (GetBlockIdx() >= blockNumPerGroup * 2) { + if (x == 0) { + return; + } + + if (rankSize >= 4) { + AscendC::PipeBarrier(); + SumByPairsBigData(ctrlFlagsUB, buff, x, blockNumPerGroup, corePerRank, coreSegmentedIdx, magic, deterministicOffNum, thisNPUProcessDataNum, + thisNPUCoreGroupAvgDMADataNum, dataOffsetNum, dataSizeRemain, inputUB, op, rank, rankSize, allTimes, allDataSizeNeed2Add); + } else { + __gm__ int64_t* ctrlFlagsGMPre = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup * 2 + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); + int64_t outputOffsetNum = deterministicOffNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + AscendC::PipeBarrier(); + while (true) { + if (*ctrlFlagsUB >= allTimes) { + break; + } + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMPre, sizeof(int64_t)); + CpGM2UB(ctrlFlagsUB2, ctrlFlagsGM, sizeof(int64_t)); + AscendC::PipeBarrier(); + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10) || if ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) { + continue; + } + + *ctrlFlagsUB1 = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ? 
*ctrlFlagsUB1 : *ctrlFlagsUB2; + AscendC::PipeBarrier(); + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (*ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + ProcessDataNew(dataSizeRemain, inputUB, buff[rank], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), + processOutput, outputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); + AscendC::PipeBarrier(); + *ctrlFlagsUB = preparedDataGroupCount; + CpUB2GM((__gm__ int64_t*) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + AscendC::PipeBarrier(); + } + } + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM, magic); + return; + } + + __gm__ int64_t* ctrlFlagsGM; + if (rankSize >= 4) { + ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + + (GetLcalBlockNum() + 2 * blockNumPerGroup + + (rankSize > 4 ? 4 : 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + } else { + ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + + (GetLcalBlockNum() + 2 * blockNumPerGroup + + (rankSize - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + } + + constexpr int32_t lastFlagPos = 8; + constexpr int32_t sumPairGroup = 2; + if (rankSize > lastFlagPos) { + ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + MEM_DMA_UNIT_INT_NUM * + (GetLcalBlockNum() + sumPairGroup * blockNumPerGroup + lastFlagPos * corePerRank + coreSegmentedIdx); + } + + dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + buffOffsetNum = coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic); + + __gm__ T *sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + int64_t revBuffOffsetNum = x * singleNPUProcessDataNum + buffOffsetNum; + int64_t sendBuffOffsetNum = deterministicOffNum + buffOffsetNum; + + GM2GMPingPong(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + return; +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministicBigData(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + magic <<= 10; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + constexpr int32_t maxBuffSize = 16; + __gm__ T* buff[maxBuffSize] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; + + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t corePerRank = blockNumPerGroup / rankSize; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + + int64_t x = GetBlockIdx() / corePerRank; + if (GetBlockIdx() >= blockNumPerGroup && GetBlockIdx() < 2 * blockNumPerGroup) { + x = (GetBlockIdx() - 
blockNumPerGroup) / corePerRank; + flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; + } else if (GetBlockIdx() >= 2 * blockNumPerGroup) { + x = (GetBlockIdx() - blockNumPerGroup * 2) / corePerRank; + flagOffset1st = (GetBlockIdx() - blockNumPerGroup * 2) * MEM_DMA_UNIT_INT_NUM; + } + int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st; + + int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + int64_t ipcDeterministicBuffMaxNum = ipcBuffMaxNum / corePerRank; + int64_t loopTimes = CeilDiv(len, ipcDeterministicBuffMaxNum); + for (int64_t i = 0; i < loopTimes; i++) { + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcDeterministicBuffMaxNum; + int64_t remainNum = (len - processedNum < ipcDeterministicBuffMaxNum) ? len - processedNum : ipcDeterministicBuffMaxNum; + + PostSyncBigData(ctrlFlagsUB, buff, x, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + LcalAllReduceDeterministicBigDataOrigin( + buff, input + processedNum, output + processedNum, blockNumPerGroup, rank, rankSize, remainNum, (maigc + i) << 10, ctrlFlagsUB, ctrlFlagsUB1, + ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx, op); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); +} \ No newline at end of file -- Gitee From 747122a8c3720ce07fb6a7bb23c73535e9bb1554 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:01:38 +0800 Subject: [PATCH 257/414] draft --- .../lcal_allreduce_deterministic_big_data.cce | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce index 33a9c5ad..8356bd35 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce @@ -49,7 +49,7 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairsBigData( (blockNumPerGroup * 2 + (x + multipleTemp / 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; } else { ctrlFlagsGMTemp1 = (__gm__ int64_t*)buff[rank] + - (blockNumPerGroup * 2 + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + (blockNumPerGroup + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; } } @@ -69,7 +69,7 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairsBigData( CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMTemp, sizeof(int64_t)); CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMTemp1, sizeof(int64_t)); AscendC::PipeBarrier(); - if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10)) { + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10) || (*ctrlFlagsUB2 >> 10) != (magic >> 10)) { continue; } @@ -89,7 +89,7 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairsBigData( processOutput, outputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); AscendC::PipeBarrier(); *ctrlFlagsUB = preparedDataGroupCount; - CpUB2GM((__gm__ int64_t*) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + CpUB2GM((__gm__ int64_t *) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); AscendC::PipeBarrier(); } } @@ -151,17 +151,17 @@ 
__attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic break; } - CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t)); + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMX, sizeof(int64_t)); AscendC::PipeBarrier(); if ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) { continue; } - int64_t preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + int64_t preparedDataGroupCount = *ctrlFlagsUB1 & 0x3FF; if (*ctrlFlagsUB >= preparedDataGroupCount) { continue; } - + buffOffsetNum = rank * singleNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { @@ -174,7 +174,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic AscendC::PipeBarrier(); *ctrlFlagsUB = preparedDataGroupCount; if (x == 0) { - CpUB2GM((__gm__ int64_t*) buff[rank] + (GetBlockIdx() + blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + CpUB2GM((__gm__ int64_t *) buff[rank] + (GetBlockIdx() + blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); } CpUB2GM(ctrlFlagsGM, ctrlFlagsUB1, sizeof(int64_t)); AscendC::SetFlag(EVENT_ID0); @@ -189,10 +189,10 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic if (rankSize >= 4) { AscendC::PipeBarrier(); - SumByPairsBigData(ctrlFlagsUB, buff, x, blockNumPerGroup, corePerRank, coreSegmentedIdx, magic, deterministicOffNum, thisNPUProcessDataNum, + SumByPairsBigData(ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsUB2, buff, x, blockNumPerGroup, corePerRank, coreSegmentedIdx, magic, deterministicOffNum, thisNPUProcessDataNum, thisNPUCoreGroupAvgDMADataNum, dataOffsetNum, dataSizeRemain, inputUB, op, rank, rankSize, allTimes, allDataSizeNeed2Add); } else { - __gm__ int64_t* ctrlFlagsGMPre = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup * 2 + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMPre = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup * 2 + (x - 1) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (blockNumPerGroup + x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; buffOffsetNum = deterministicOffNum + x * thisNPUProcessDataNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; __gm__ T *processOutput = (__gm__ T*)((__gm__ int64_t *)buff[rank] + dataOffsetNum); @@ -206,7 +206,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMPre, sizeof(int64_t)); CpGM2UB(ctrlFlagsUB2, ctrlFlagsGM, sizeof(int64_t)); AscendC::PipeBarrier(); - if ((*ctrlFlagsUB1 >> 10) != (magic >> 10) || if ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) { + if ((*ctrlFlagsUB1 >> 10) != (magic >> 10) || (*ctrlFlagsUB2 >> 10) != (magic >> 10)) { continue; } @@ -225,19 +225,19 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic processOutput, outputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); AscendC::PipeBarrier(); *ctrlFlagsUB = preparedDataGroupCount; - CpUB2GM((__gm__ int64_t*) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); + CpUB2GM((__gm__ int64_t *) buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, ctrlFlagsUB1, sizeof(int64_t)); AscendC::PipeBarrier(); } } - SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM, magic); + 
SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM, magic); return; } - __gm__ int64_t* ctrlFlagsGM; + __gm__ int64_t* ctrlFlagsGMX; if (rankSize >= 4) { ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + 2 * blockNumPerGroup + - (rankSize > 4 ? 4 : 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + (rankSize > 4 ? 4 : 2) * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; } else { ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (GetLcalBlockNum() + 2 * blockNumPerGroup + @@ -250,7 +250,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + MEM_DMA_UNIT_INT_NUM * (GetLcalBlockNum() + sumPairGroup * blockNumPerGroup + lastFlagPos * corePerRank + coreSegmentedIdx); } - + dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); if (coreSegmentedIdx == corePerRank - 1) { dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); @@ -263,7 +263,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic int64_t revBuffOffsetNum = x * singleNPUProcessDataNum + buffOffsetNum; int64_t sendBuffOffsetNum = deterministicOffNum + buffOffsetNum; - GM2GMPingPong(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); + GM2GMPingPong(dataSizeRemain, inputUB, output, revBuffOffsetNum, sendBuff, sendBuffOffsetNum); return; } @@ -287,7 +287,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; - int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t blockNumPerGroup = GetLcalBlockNum() / 3; int64_t corePerRank = blockNumPerGroup / rankSize; int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; @@ -305,20 +305,20 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); - int64_t ipcDeterministicBuffMaxNum = ipcBuffMaxNum / corePerRank; - int64_t loopTimes = CeilDiv(len, ipcDeterministicBuffMaxNum); + int64_t ipcBuffDeterministicMaxNum = ipcBuffMaxNum / corePerRank; + int64_t loopTimes = CeilDiv(len, ipcBuffDeterministicMaxNum); for (int64_t i = 0; i < loopTimes; i++) { *ctrlFlagsUB = 0; *ctrlFlagsUB1 = 0; *ctrlFlagsUB2 = 0; AscendC::PipeBarrier(); - int64_t processedNum = i * ipcDeterministicBuffMaxNum; - int64_t remainNum = (len - processedNum < ipcDeterministicBuffMaxNum) ? len - processedNum : ipcDeterministicBuffMaxNum; + int64_t processedNum = i * ipcBuffDeterministicMaxNum; + int64_t remainNum = (len - processedNum < ipcBuffDeterministicMaxNum) ? 
len - processedNum : ipcBuffDeterministicMaxNum; - PostSyncBigData(ctrlFlagsUB, buff, x, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); LcalAllReduceDeterministicBigDataOrigin( - buff, input + processedNum, output + processedNum, blockNumPerGroup, rank, rankSize, remainNum, (maigc + i) << 10, ctrlFlagsUB, ctrlFlagsUB1, + buff, input + processedNum, output + processedNum, blockNumPerGroup, rank, rankSize, remainNum, (magic + i) << 10, ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsUB2, inputUB, dataOffsetNum, flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx, op); } DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); -- Gitee From e727492b4adbc2df9e1c304236d33695c76bb2a8 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:02:33 +0800 Subject: [PATCH 258/414] draft --- comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce index 8356bd35..c5e09465 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce @@ -305,7 +305,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceDeterministic DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); - int64_t ipcBuffDeterministicMaxNum = ipcBuffMaxNum / corePerRank; + int64_t ipcBuffDeterministicMaxNum = DETERMINISTIC_BUFF_SIZE / sizeof(T); int64_t loopTimes = CeilDiv(len, ipcBuffDeterministicMaxNum); for (int64_t i = 0; i < loopTimes; i++) { *ctrlFlagsUB = 0; -- Gitee From 53de4bccfe185591ed94cc54bc45df8c91a5d041 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:02:59 +0800 Subject: [PATCH 259/414] draft --- comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce | 1 - 1 file changed, 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce index c5e09465..c9454d6a 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce @@ -65,7 +65,6 @@ __attribute__((always_inline)) inline __aicore__ void SumByPairsBigData( break; } - CpGM2UB(ctrlFlagsUB1, ctrlFlagsGMTemp, sizeof(int64_t)); CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMTemp1, sizeof(int64_t)); AscendC::PipeBarrier(); -- Gitee From a87844819afb119e1e539deacd51b7a03d83d41e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:29:17 +0800 Subject: [PATCH 260/414] draft --- .../lcal_allreduce_two_shot_910B2C.cce | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce new file mode 100644 index 00000000..3d8da319 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + const int64_t singleNodeRankSize = rankSize >> 1; + const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + + const int64_t x = GetBlockIdx() / corePerRank; + const int64_t xLocalRankId = x % singleNodeRankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + __gm__ T* buff[16] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + const int64_t oneNpuProcessDataAvgNum = len / singleNodeRankSize; + int64_t thisNPUProcessDataNum = oneNpuProcessDataAvgNum; + if (localNodeRankId == singleNodeRankSize - 1) { + thisNPUProcessDataNum = len - localNodeRankId * oneNpuProcessDataAvgNum; + } + + int64_t xNPUProcessDataNum = oneNpuProcessDataAvgNum; + if (xLocalRankId == singleNodeRankSize - 1) { + xNPUProcessDataNum = len - xLocalRankId * oneNpuProcessDataAvgNum; + } + + const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank; + const int64_t thisNPUCoreGroupAvgDMADataNum = thisNPUProcessDataNum / corePerRank; + + int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + if ((rank < singleNodeRankSize && x < singleNodeRankSize) || + (rank >= singleNodeRankSize && x >= singleNodeRankSize)) { + __gm__ int64_t* ctrlFlagsGMSet = (__gm__ int64_t*)buff[rank] + (xLocalRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + + int64_t sendBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t revBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, input, sendBuffOffsetNum); + SetFlag(ctrlFlagsUB, ctrlFlagsGMSet, magic); + + dataSizeRemain = thisNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (thisNPUProcessDataNum - coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + if (rank != x) { + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (localNodeRankId + coreSegmentedIdx) * 
MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (localNodeRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + sendBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, sendBuffOffsetNum, receiveBuff, revBuffOffsetNum, op); + } + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + if (rank == x) { + for (int i = 0; i < singleNodeRankSize; i++) { + if ((xLocalRankId + singleNodeRankSize + coreSegmentedIdx) == + (i * corePerRank + singleNodeRankSize + coreSegmentedIdx)) { + continue; + } + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (i * corePerRank + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + } + + receiveBuff = ((__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) + len; + sendBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); + + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[peerRankId] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + + revBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, len + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum, + (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), revBuffOffsetNum, op); + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic) + } + + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + + dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); + } + + sendBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); +} \ No newline at end of file -- Gitee From e0e969e3d3a85edc4e07e8c86459f09cee9de6ec Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:30:46 +0800 Subject: [PATCH 261/414] draft --- comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce index 3d8da319..a98c37a1 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce @@ -18,6 +18,8 
@@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C const int64_t singleNodeRankSize = rankSize >> 1; const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; -- Gitee From 96b1a9307a053d00ad31b86e6b6daac97c5ad96e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:31:41 +0800 Subject: [PATCH 262/414] draft --- .../lcal_allreduce_two_shot_910B2C.cce | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce index a98c37a1..108ede78 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce @@ -18,7 +18,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C const int64_t singleNodeRankSize = rankSize >> 1; const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; - const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; + const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); @@ -37,15 +37,15 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; - const int64_t oneNpuProcessDataAvgNum = len / singleNodeRankSize; - int64_t thisNPUProcessDataNum = oneNpuProcessDataAvgNum; + const int64_t oneNPUProcessDataAvgNum = len / singleNodeRankSize; + int64_t thisNPUProcessDataNum = oneNPUProcessDataAvgNum; if (localNodeRankId == singleNodeRankSize - 1) { - thisNPUProcessDataNum = len - localNodeRankId * oneNpuProcessDataAvgNum; + thisNPUProcessDataNum = len - localNodeRankId * oneNPUProcessDataAvgNum; } - int64_t xNPUProcessDataNum = oneNpuProcessDataAvgNum; + int64_t xNPUProcessDataNum = oneNPUProcessDataAvgNum; if (xLocalRankId == singleNodeRankSize - 1) { - xNPUProcessDataNum = len - xLocalRankId * oneNpuProcessDataAvgNum; + xNPUProcessDataNum = len - xLocalRankId * oneNPUProcessDataAvgNum; } const int64_t xNPUCoreGroupAvgDMADataNum = xNPUProcessDataNum / corePerRank; @@ -64,8 +64,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C __gm__ int64_t* ctrlFlagsGMSet = (__gm__ int64_t*)buff[rank] + (xLocalRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - int64_t sendBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - int64_t revBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t sendBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + int64_t revBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + 
coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, input, sendBuffOffsetNum); SetFlag(ctrlFlagsUB, ctrlFlagsGMSet, magic); @@ -76,8 +76,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C if (rank != x) { CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (localNodeRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (localNodeRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); - sendBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - revBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + sendBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, sendBuffOffsetNum, receiveBuff, revBuffOffsetNum, op); } SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); @@ -91,7 +91,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C } receiveBuff = ((__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) + len; - sendBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + sendBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; revBuffOffsetNum = coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); @@ -99,7 +99,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); - revBuffOffsetNum = localNodeRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, len + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), revBuffOffsetNum, op); SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic) @@ -112,8 +112,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); } - sendBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - revBuffOffsetNum = xLocalRankId * oneNpuProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + sendBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); } DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); -- Gitee From fb58ec4bde678fceef4843303f5af3da80ddd8ec Mon Sep 17 
00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:34:07 +0800 Subject: [PATCH 263/414] draft --- .../lcal_allreduce_two_shot_910B2C.cce | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce index 108ede78..c941d4e3 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce @@ -76,11 +76,11 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C if (rank != x) { CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (localNodeRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (localNodeRankId + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); - sendBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + sendBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; + revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, sendBuffOffsetNum, receiveBuff, revBuffOffsetNum, op); } - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); if (rank == x) { for (int i = 0; i < singleNodeRankSize; i++) { if ((xLocalRankId + singleNodeRankSize + coreSegmentedIdx) == @@ -90,31 +90,31 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (i * corePerRank + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); } - receiveBuff = ((__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum)) + len; + receiveBuff = ((__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum)) + len; sendBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; revBuffOffsetNum = coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; GM2GM(dataSizeRemain, inputUB[0], receiveBuff, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[peerRankId] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[peerRankId] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); - CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (rankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, len + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), revBuffOffsetNum, op); - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic) + SetFlag(ctrlFlagsUB, (__gm__ int64_t 
*)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); } - CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); - dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); + int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); if (coreSegmentedIdx == corePerRank - 1) { dataSizeRemain = (xNPUProcessDataNum - coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum) * sizeof(T); } - sendBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; + sendBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum;; revBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), sendBuffOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum), sendBuffOffsetNum); } DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); -- Gitee From b5132e84b70c4ffe6c99e1144bf4fed524b2d3a1 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 15:35:11 +0800 Subject: [PATCH 264/414] draft --- comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce index c941d4e3..fe0f3ebd 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce @@ -80,7 +80,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, sendBuffOffsetNum, receiveBuff, revBuffOffsetNum, op); } - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (xLocalRankId + singleNodeRankSize + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); if (rank == x) { for (int i = 0; i < singleNodeRankSize; i++) { if ((xLocalRankId + singleNodeRankSize + coreSegmentedIdx) == @@ -102,10 +102,10 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C revBuffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum; ProcessData(dataSizeRemain, inputUB[0], buff[rank], dataOffsetNum, len + coreSegmentedIdx * thisNPUCoreGroupAvgDMADataNum, (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum), revBuffOffsetNum, op); - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); } - CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + (rankSize + corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (rankSize + corePerRank 
+ coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM, magic); int64_t dataSizeRemain = xNPUCoreGroupAvgDMADataNum * sizeof(T); if (coreSegmentedIdx == corePerRank - 1) { @@ -114,7 +114,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceTwoShot910B2C sendBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum;; revBuffOffsetNum = xLocalRankId * oneNPUProcessDataAvgNum + coreSegmentedIdx * xNPUCoreGroupAvgDMADataNum; - GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum), sendBuffOffsetNum); + GM2GM(dataSizeRemain, inputUB[0], output, revBuffOffsetNum, (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum), sendBuffOffsetNum); } DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); -- Gitee From bdc5c67ac7eed55dbb3ad2ed74eb9e552095406e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 16:59:37 +0800 Subject: [PATCH 265/414] draft --- .../lcal_allreduce_big_data_910B2C.cce | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce new file mode 100644 index 00000000..5d3a3ec2 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include "collectives.cce"
+
+template <typename T>
+__attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2COrigin(
+    ALLREDUCE_ARGS_FUN_16P_Origin(T), const int64_t singleNodeRankSize, const int64_t localNodeRankId,
+    const int64_t coreGroupIdx, const int64_t peerRankId, const int64_t dataOffsetNum, __ubuf__ int64_t* ctrlFlagsUB,
+    __ubuf__ int64_t* ctrlFlagsUB1, __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ int64_t* ctrlFlagsUB3,
+    __ubuf__ T* inputUB[2], const int64_t x, const int64_t xLocalNodeRankId, __gm__ int64_t *ctrlFlagGMSet,
+    __gm__ int64_t *ctrlFlagGMCheck, __gm__ T *sendBuff, __gm__ T *receiveBuff
+)
+{
+    const int64_t oneNPUProcessDataAvgNum = len / singleNodeRankSize;
+    int64_t thisNPUProcessDataNum = oneNPUProcessDataAvgNum;
+    if (localNodeRankId == singleNodeRankSize - 1) {
+        thisNPUProcessDataNum = len - localNodeRankId * oneNPUProcessDataAvgNum;
+    }
+
+    int64_t xNPUProcessDataNum = oneNPUProcessDataAvgNum;
+    if (xLocalNodeRankId == singleNodeRankSize - 1) {
+        xNPUProcessDataNum = len - xLocalNodeRankId * oneNPUProcessDataAvgNum;
+    }
+
+    int64_t dataSizeRemain = xNPUProcessDataNum * sizeof(T);
+    *ctrlFlagsUB = 0;
+    if (coreGroupIdx == 0) {
+        const int64_t buffOffsetNum = xLocalNodeRankId * oneNPUProcessDataAvgNum;
+        AscendC::PipeBarrier<PIPE_ALL>();
+        input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, sendBuff, buffOffsetNum, ctrlFlagsUB, ctrlFlagsGMSet, magic);
+    } else if (coreGroupIdx == 1) {
+        *ctrlFlagsUB1 = 0;
+        *ctrlFlagsUB2 = 0;
+        __gm__ int64_t *ctrlFlagGMCheckLocal = (__gm__ int64_t*)buff[rank] + localNodeRankId * MEM_DMA_UNIT_INT_NUM;
+
+        const int64_t buffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum;
+        const int64_t allDataSizeNeed2Add = thisNPUProcessDataNum * sizeof(T);
+        const int64_t multipleTimes = CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG);
+        if (x == rank || multipleTimes == 0) {
+            SetFlag(ctrlFlagsUB3, ctrlFlagGMSet, ((magic & 0xfffffffffffffc00) | multipleTimes));
+            return;
+        }
+        AscendC::PipeBarrier<PIPE_ALL>();
+        while (true) {
+            if (*ctrlFlagsUB >= multipleTimes) {
+                break;
+            }
+
+            CpGM2UB(ctrlFlagsUB1, ctrlFlagGMCheckLocal, sizeof(int64_t));
+            CpGM2UB(ctrlFlagsUB2, ctrlFlagGMCheck, sizeof(int64_t));
+            AscendC::PipeBarrier<PIPE_ALL>();
+
+            if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0 ||
+                ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) {
+                continue;
+            }
+
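+            // Each flag is assumed to carry (epoch << 10) | blocksReady, matching the
+            // 0x3FF / 0xfffffffffffffc00 masks used here. Only the minimum progress
+            // published by both the local producer flag and the remote peer flag is
+            // safe to reduce; the span between *ctrlFlagsUB and that minimum is the
+            // new delta handled on this pass.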
+            int64_t preparedDataGroupCount = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ?
+                (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF);
+            if (*ctrlFlagsUB >= preparedDataGroupCount) {
+                continue;
+            }
+
+            dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG;
+            if (preparedDataGroupCount >= multipleTimes) {
+                dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG;
+            }
+            ProcessDataNewNonBarrier(dataSizeRemain, inputUB, buff[x], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T),
+                receiveBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op);
+            SetFlag(ctrlFlagsUB3, ctrlFlagGMSet, ((*ctrlFlagsUB1 & 0xfffffffffffffc00) | preparedDataGroupCount));
+
+            *ctrlFlagsUB = preparedDataGroupCount;
+            AscendC::PipeBarrier<PIPE_ALL>();
+        }
+    } else if (coreGroupIdx == 3) {
+        if (GetBlockIdx() == singleNodeRankSize * 3) {
+            __gm__ int64_t *ctrlFlagGMSetLocal = (__gm__ int64_t *)buff[rank] + (rankSize + 1) * MEM_DMA_UNIT_INT_NUM;
+            *ctrlFlagsUB2 = 0;
+            const int64_t buffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum;
+            const int64_t allDataSizeNeed2Add = thisNPUProcessDataNum * sizeof(T);
+            const int64_t multipleTimes = CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG);
+            int64_t processedDataGroupCount = 0;
+            int64_t preparedDataGroupCount = 0;
+            AscendC::PipeBarrier<PIPE_ALL>();
+            while (true) {
+                *ctrlFlagsUB1 = INT64_MAX;
+                if (processedDataGroupCount >= multipleTimes) {
+                    break;
+                }
+
+                for (int i = 0; i < singleNodeRankSize; i++) {
+                    if (i == localNodeRankId) {
+                        continue;
+                    }
+                    *ctrlFlagsUB2 = 0;
+
+                    do {
+                        SetFlag(EVENT_ID0);
+                        WaitFlag(EVENT_ID0);
+                        CpGM2UB(ctrlFlagsUB2, ctrlFlagGMCheck + i * MEM_DMA_UNIT_INT_NUM, sizeof(int64_t));
+                        SetFlag(EVENT_ID0);
+                        WaitFlag(EVENT_ID0);
+                    } while ((ctrlFlagsUB2 >> 10) != (magic >> 10));
+
+                    if (*ctrlFlagsUB1 > *ctrlFlagsUB2) {
+                        *ctrlFlagsUB1 = *ctrlFlagsUB2;
+                    }
+                }
+
+                preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF);
+                if (processedDataGroupCount >= preparedDataGroupCount) {
+                    continue;
+                }
+
+                dataSizeRemain = (preparedDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG;
+                if (preparedDataGroupCount >= multipleTimes) {
+                    dataSizeRemain = allDataSizeNeed2Add - processedDataGroupCount * DMA_SIZE_PER_FLAG;
+                }
+
+                AscendC::PipeBarrier<PIPE_ALL>();
+                GM2GMPingPongNonPipeBarrier(dataSizeRemain, inputUB, receiveBuff,
+                    len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T),
+                    sendBuff,
+                    buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T));
+                SetFlagNonPipeBarrier(ctrlFlagsUB3, ctrlFlagGMSet, ctrlFlagGMSetLocal,
+                    ((*ctrlFlagsUB1 & 0xfffffffffffffc00) | preparedDataGroupCount));
+
+                processedDataGroupCount = preparedDataGroupCount;
+            }
+        } else {
+            *ctrlFlagsUB1 = 0;
+            *ctrlFlagsUB2 = 0;
+            __gm__ int64_t *ctrlFlagGMCheckLocal = (__gm__ int64_t*)buff[rank] + (rankSize + 1) * MEM_DMA_UNIT_INT_NUM;
+
+            const int64_t buffOffsetNum = localNodeRankId * oneNPUProcessDataAvgNum;
+            const int64_t allDataSizeNeed2Add = thisNPUProcessDataNum * sizeof(T);
+            const int64_t multipleTimes = CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG);
+            int64_t processedDataGroupCount = 0;
+            int64_t preparedDataGroupCount = 0;
+            while (true) {
+                AscendC::PipeBarrier<PIPE_ALL>();
+                if (processedDataGroupCount >= multipleTimes) {
+                    break;
+                }
+
+                CpGM2UB(ctrlFlagsUB1, ctrlFlagGMCheckLocal, sizeof(int64_t));
+                CpGM2UB(ctrlFlagsUB2, ctrlFlagGMCheck, sizeof(int64_t));
+                SetFlag(EVENT_ID0);
+                WaitFlag(EVENT_ID0);
+
+                if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0 ||
+                    ((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) {
+                    continue;
+                }
+
+
preparedDataGroupCount = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ? + (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF); + if (processedDataGroupCount >= preparedDataGroupCount) { + continue; + } + + dataSizeRemain = (preparedDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount >= multipleTimes) { + dataSizeRemain = allDataSizeNeed2Add - processedDataGroupCount * DMA_SIZE_PER_FLAG; + } + AscendC::PipeBarrier(); + ProcessDataNewNonBarrier(dataSizeRemain, inputUB, sendBuff, 0, buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + receiveBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), op); + SetFlagNonPipeBarrier(ctrlFlagsUB3, ctrlFlagGMSet, ((magic & 0xfffffffffffffc00) | preparedDataGroupCount)); + + processedDataGroupCount = preparedDataGroupCount; + } + } + } else if (coreGroupIdx == 2) { + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + const int64_t buffOffsetNum = xLocalNodeRankId * oneNPUProcessDataAvgNum; + const int64_t allDataSizeNeed2Add = xNPUProcessDataNum * sizeof(T); + const int64_t multipleTimes = CeilDiv(allDataSizeNeed2Add, DMA_SIZE_PER_FLAG); + int64_t processedDataGroupCount = 0; + int64_t preparedDataGroupCount = 0; + + if (thisNPUProcessDataNum != 0) { + CheckFlag(ctrlFlagsUB, + (__gm__ int64_t*)buff[rank] + (singleNodeRankSize + xLocalNodeRankId) * MEM_DMA_UNIT_INT_NUM, + CeilDiv(thisNPUProcessDataNum * sizeof(T), DMA_SIZE_PER_FLAG) + magic); + } + + while (true) { + if (processedDataGroupCount >= multipleTimes) { + break; + } + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + CpGM2UB(ctrlFlagsUB1, ctrlFlagGMCheck, sizeof(int64_t)); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + if (*ctrlFlagsUB1 == 0 || ((*ctrlFlagsUB1 >> 10) != (magic >> 10))) { + continue; + } + + preparedDataGroupCount = (*ctrlFlagsUB1 & 0x3FF); + if (processedDataGroupCount >= preparedDataGroupCount) { + continue; + } + + dataSizeRemain = (preparedDataGroupCount - processedDataGroupCount) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount >= multipleTimes) { + dataSizeRemain = allDataSizeNeed2Add - processedDataGroupCount * DMA_SIZE_PER_FLAG; + } + AscendC::PipeBarrier(); + GM2GMPingPongNonPipeBarrier(dataSizeRemain, inputUB, receiveBuff, + buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); + processedDataGroupCount = preparedDataGroupCount; + } + } +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C(ALLREDUCE_ARGS_FUN_16P(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + magic *= 1024; + __gm__ T* buff[16] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7, + buff8, buff9, buff10, buff11, + buff12, buff13, buff14, buff15 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ int64_t* ctrlFlagsUB3 = (__ubuf__ int64_t*)(96); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(128), (__ubuf__ T*)(98336)}; + + const int64_t singleNodeRankSize = rankSize >> 1; + const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; + + const int64_t coreGroupIdx = GetBlockIdx() % singleNodeRankSize; + + const int64_t peerRankId = rank < singleNodeRankSize ? 
rank + singleNodeRankSize : rank - singleNodeRankSize; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + + const int64_t x = (rank < singleNodeRankSize) ? (GetBlockIdx() % singleNodeRankSize) : + ((GetBlockIdx() % singleNodeRankSize) + singleNodeRankSize); + const int64_t xLocalNodeRankId = x % singleNodeRankSize; + + __gm__ T *sendBuff = input; + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ int64_t *ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + + + __gm__ int64_t *ctrlFlagGMCheck = (__gm__ int64_t*)buff[x] + (localNodeRankId) * MEM_DMA_UNIT_INT_NUM; + switch (coreGroupIdx) { + case 0: + break; + case 1: + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + break; + case 2: + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + receiveBuff = output; + ctrlFlagGMCheck = (__gm__ int64_t*)buff[x] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM; + break; + case 3: + { + if (GetBlockIdx() == singleNodeRankSize * 3) { + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); + ctrlFlagGMCheck = (__gm__ int64_t*)buff[rank] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMSet = (__gm__ int64_t*)buff[peerRankId] + rankSize * MEM_DMA_UNIT_INT_NUM; + } else { + sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + ctrlFlagGMCheck = (__gm__ int64_t*)buff[rank] + rankSize * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM; + } + } + default: + ; + } + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + + const int64_t allreduceBuffSizePerParagraph910B2C = + IPC_BUFF_MAX_SIZE / (singleNodeRankSize + 1) / sizeof(T) * sizeof(T); + + const int64_t ipcBuffMaxSizePerLoop = allreduceBuffSizePerParagraph910B2C * singleNodeRankSize; + const int64_t ipcBuffMaxNumPerLoop = ipcBuffMaxSizePerLoop / sizeof(T); + const int64_t loopTimes = CeilDiv(len, ipcBuffMaxNumPerLoop); + const int64_t ipcMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + for (int64_t i = 0; i < loopTimes; i++) { + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + *ctrlFlagsUB3 = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcBuffMaxNumPerLoop; + int64_t remainNum = (len - processedNum < ipcBuffMaxNumPerLoop) ? 
len - processedNum : ipcBuffMaxNumPerLoop; + + switch (coreGroupIdx) { + case 0: + sendBuff = input + processedNum; + break; + case 2: + receiveBuff = output + processedNum; + break; + default: + ; + } + + PostSyncBigData910B2C(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcMaxNum, magic, i, peerRankId, + singleNodeRankSize); + LcalAllReduceBigData910B2COrigin( + MODIFIABLE_MAGIC_PROCESSED_NUM_ALLREDUCE_ARGS_CALL_16P_Origin(processedNum, remainNum, ((magic + i) * 1024)), + singleNodeRankSize, localNodeRankId, coreGroupIdx, peerRankId, dataOffsetNum, ctrlFlagsUB, ctrlFlagsUB1, + ctrlFlagsUB2, ctrlFlagsUB3, inputUB, x, xLocalNodeRankId, ctrlFlagGMSet, ctrlFlagGMCheck, sendBuff, + receiveBuff + ); + AscendC::PipeBarrier(); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::OVERALL, static_cast(op)); +} \ No newline at end of file -- Gitee From debd36cb8ef4554d4618e114fdd442fce08a4538 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 17:05:07 +0800 Subject: [PATCH 266/414] draft --- .../lcal_allreduce_big_data_910B2C.cce | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce index 5d3a3ec2..4b9e44a8 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce @@ -7,6 +7,7 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ +#include #include "collectives.cce" template @@ -14,7 +15,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C ALLREDUCE_ARGS_FUN_16P_Origin(T), const int64_t singleNodeRankSize, const int64_t localNodeRankId, const int64_t coreGroupIdx, const int64_t peerRankId, const int64_t dataOffsetNum, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ int64_t* ctrlFlagsUB3, - __ubuf__ T* inputUB[2], const int64_t x, const int64_t xLocalNodeRankId, __gm__ int64_t *ctrlFlagGMSet, + __ubuf__ T* inputUB[2], const int64_t x, const int64_t xLocalNodeRankId, __gm__ int64_t *ctrlFlagGMSet, __gm__ int64_t *ctrlFlagGMCheck, __gm__ T *sendBuff, __gm__ T *receiveBuff ) { @@ -28,13 +29,13 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C if (xLocalNodeRankId == singleNodeRankSize - 1) { xNPUProcessDataNum = len - xLocalNodeRankId * oneNPUProcessDataAvgNum; } - + int64_t dataSizeRemain = xNPUProcessDataNum * sizeof(T); *ctrlFlagsUB = 0; if (coreGroupIdx == 0) { const int64_t buffOffsetNum = xLocalNodeRankId * oneNPUProcessDataAvgNum; AscendC::PipeBarrier(); - input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, sendBuff, buffOffsetNum, ctrlFlagsUB, ctrlFlagsGMSet, magic); + input2BuffRankMagic(dataSizeRemain, inputUB[0], receiveBuff, buffOffsetNum, sendBuff, buffOffsetNum, ctrlFlagsUB, ctrlFlagGMSet, magic); } else if (coreGroupIdx == 1) { *ctrlFlagsUB1 = 0; *ctrlFlagsUB2 = 0; @@ -72,8 +73,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C if (preparedDataGroupCount >= multipleTimes) { dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; } - ProcessDataNewNonBarrier(dataSizeRemain, inputUB, buff[x], dataOffsetNum, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / 
sizeof(T), - receiveBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); + ProcessDataNewNonBarrier(dataSizeRemain, inputUB, sendBuff, 0, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), + receiveBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); SetFlag(ctrlFlagsUB3, ctrlFlagGMSet, ((*ctrlFlagsUB1 & 0xfffffffffffffc00) | preparedDataGroupCount)); *ctrlFlagsUB = preparedDataGroupCount; @@ -107,7 +108,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C CpGM2UB(ctrlFlagsUB2, ctrlFlagGMCheck + i * MEM_DMA_UNIT_INT_NUM, sizeof(int64_t)); SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); - } while ((ctrlFlagsUB2 >> 10) != (magic >> 10)); + } while ((*ctrlFlagsUB2 >> 10) != (magic >> 10)); if (*ctrlFlagsUB1 > *ctrlFlagsUB2) { *ctrlFlagsUB1 = *ctrlFlagsUB2; @@ -126,9 +127,9 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C AscendC::PipeBarrier(); GM2GMPingPongNonPipeBarrier(dataSizeRemain, inputUB, receiveBuff, - len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), - sendBuff, - buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); + len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, + buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); SetFlagNonPipeBarrier(ctrlFlagsUB3, ctrlFlagGMSet, ctrlFlagGMSetLocal, ((*ctrlFlagsUB1 & 0xfffffffffffffc00) | preparedDataGroupCount)); @@ -161,7 +162,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C } preparedDataGroupCount = ((*ctrlFlagsUB1 & 0x3FF) <= (*ctrlFlagsUB2 & 0x3FF)) ? - (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF); + (*ctrlFlagsUB1 & 0x3FF) : (*ctrlFlagsUB2 & 0x3FF); if (processedDataGroupCount >= preparedDataGroupCount) { continue; } @@ -219,8 +220,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C } AscendC::PipeBarrier(); GM2GMPingPongNonPipeBarrier(dataSizeRemain, inputUB, receiveBuff, - buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), - sendBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); + buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); processedDataGroupCount = preparedDataGroupCount; } } @@ -247,47 +248,47 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C const int64_t singleNodeRankSize = rankSize >> 1; const int64_t localNodeRankId = rank >= singleNodeRankSize ? rank - singleNodeRankSize : rank; - const int64_t coreGroupIdx = GetBlockIdx() % singleNodeRankSize; + const int64_t coreGroupIdx = GetBlockIdx() / singleNodeRankSize; const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; - const int64_t x = (rank < singleNodeRankSize) ? (GetBlockIdx() % singleNodeRankSize) : - ((GetBlockIdx() % singleNodeRankSize) + singleNodeRankSize); + const int64_t x = (rank < singleNodeRankSize) ? 
(GetBlockIdx() % singleNodeRankSize) : + ((GetBlockIdx() % singleNodeRankSize) + singleNodeRankSize); const int64_t xLocalNodeRankId = x % singleNodeRankSize; __gm__ T *sendBuff = input; __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - __gm__ int64_t *ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagGMSet = ((__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); - __gm__ int64_t *ctrlFlagGMCheck = (__gm__ int64_t*)buff[x] + (localNodeRankId) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t *ctrlFlagGMCheck = ((__gm__ int64_t*)buff[x] + (localNodeRankId) * MEM_DMA_UNIT_INT_NUM); switch (coreGroupIdx) { case 0: break; case 1: sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMSet = ((__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); break; case 2: sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); receiveBuff = output; - ctrlFlagGMCheck = (__gm__ int64_t*)buff[x] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMCheck = ((__gm__ int64_t*)buff[x] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM); break; case 3: { if (GetBlockIdx() == singleNodeRankSize * 3) { sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[peerRankId] + dataOffsetNum); - ctrlFlagGMCheck = (__gm__ int64_t*)buff[rank] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM; - ctrlFlagGMSet = (__gm__ int64_t*)buff[peerRankId] + rankSize * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMCheck = ((__gm__ int64_t*)buff[rank] + singleNodeRankSize * MEM_DMA_UNIT_INT_NUM); + ctrlFlagGMSet = ((__gm__ int64_t*)buff[peerRankId] + rankSize * MEM_DMA_UNIT_INT_NUM); } else { sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - ctrlFlagGMCheck = (__gm__ int64_t*)buff[rank] + rankSize * MEM_DMA_UNIT_INT_NUM; - ctrlFlagGMSet = (__gm__ int64_t*)buff[rank] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM; + ctrlFlagGMCheck = ((__gm__ int64_t*)buff[rank] + rankSize * MEM_DMA_UNIT_INT_NUM); + ctrlFlagGMSet = ((__gm__ int64_t*)buff[rank] + (rankSize + 2) * MEM_DMA_UNIT_INT_NUM); } } default: -- Gitee From d3b9abbed71c402f2cb0149db81b8539fdf5589e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 17:06:13 +0800 Subject: [PATCH 267/414] draft --- comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce index 4b9e44a8..f926b986 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce @@ -220,8 +220,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C } AscendC::PipeBarrier(); GM2GMPingPongNonPipeBarrier(dataSizeRemain, inputUB, receiveBuff, - buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), - sendBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); + buffOffsetNum + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T), + sendBuff, len + processedDataGroupCount * DMA_SIZE_PER_FLAG / sizeof(T)); processedDataGroupCount = preparedDataGroupCount; } } @@ -270,7 +270,7 
@@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C case 1: sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); - ctrlFlagGMSet = ((__gm__ int64_t*)buff[rank] + (GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM); + ctrlFlagGMSet = ((__gm__ int64_t*)buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM); break; case 2: sendBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); -- Gitee From 743fd1ddca0ea4f68d39ffa08f6a27bee5a4f6a3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 17:08:07 +0800 Subject: [PATCH 268/414] draft --- comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce index f926b986..2a3755cd 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce @@ -74,7 +74,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; } ProcessDataNewNonBarrier(dataSizeRemain, inputUB, sendBuff, 0, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), - receiveBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); + receiveBuff, buffOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T), op); SetFlag(ctrlFlagsUB3, ctrlFlagGMSet, ((*ctrlFlagsUB1 & 0xfffffffffffffc00) | preparedDataGroupCount)); *ctrlFlagsUB = preparedDataGroupCount; -- Gitee From 06798fc69ea47c8105934fd0c5ceb2c4b68a49c4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 17:08:31 +0800 Subject: [PATCH 269/414] draft --- comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce index 2a3755cd..4f68ed44 100644 --- a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce +++ b/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce @@ -251,7 +251,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAllReduceBigData910B2C const int64_t coreGroupIdx = GetBlockIdx() / singleNodeRankSize; const int64_t peerRankId = rank < singleNodeRankSize ? rank + singleNodeRankSize : rank - singleNodeRankSize; - + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t x = (rank < singleNodeRankSize) ? (GetBlockIdx() % singleNodeRankSize) : -- Gitee From f56733f7b8fa9815427fd58822c9dcfbd55cf2c8 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:28:30 +0800 Subject: [PATCH 270/414] draft --- .../lcal/src/kernels/lcal_broadcast_write.cce | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_broadcast_write.cce diff --git a/comm/lcal/src/kernels/lcal_broadcast_write.cce b/comm/lcal/src/kernels/lcal_broadcast_write.cce new file mode 100644 index 00000000..d1454e43 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_broadcast_write.cce @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include
+#include "collectives.cce"
+
+template <typename T>
+inline __aicore__ void GM2GM8(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, int64_t revBuffOffsetNum,
+    __gm__ T *sendBuff, int64_t sendBuffOffsetNum)
+{
+    int64_t times = 0;
+    AscendC::PipeBarrier<PIPE_ALL>();
+    while (dataSizeRemain >= UB_SINGLE_DMA_SIZE_MAX) {
+        AscendC::PipeBarrier<PIPE_ALL>();
+        CpGM2UBAlignB16(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times,
+            UB_SINGLE_DMA_SIZE_MAX);
+        AscendC::SetFlag(EVENT_ID0);
+        AscendC::WaitFlag(EVENT_ID0);
+        AscendC::PipeBarrier<PIPE_ALL>();
+        CpUB2GMAlignB16(
+            (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times,
+            inputUB, UB_SINGLE_DMA_SIZE_MAX);
+        AscendC::SetFlag(EVENT_ID0);
+        AscendC::WaitFlag(EVENT_ID0);
+        AscendC::PipeBarrier<PIPE_ALL>();
+        times += 1;
+        dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX;
+    }
+    if (dataSizeRemain <= 0) {
+        return;
+    }
+    AscendC::PipeBarrier<PIPE_ALL>();
+    CpGM2UBAlignB16(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T),
+        dataSizeRemain);
+    AscendC::SetFlag(EVENT_ID0);
+    AscendC::WaitFlag(EVENT_ID0);
+    AscendC::PipeBarrier<PIPE_ALL>();
+    CpUB2GMAlignB16((__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T),
+        inputUB, dataSizeRemain);
+    AscendC::PipeBarrier<PIPE_ALL>();
+}
+
+
+extern "C" __global__ __aicore__ void LcalBroadcastWrite(ALLREDUCE_ARGS_FUN(char))
+{
+    const int64_t corePerRank = GetLcalBlockNum() / rankSize;
+    const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank;
+    const int64_t x = GetBlockIdx() / corePerRank;
+    if (x >= rankSize) {
+        return;
+    }
+    if (rank != root && x != rank) {
+        return;
+    }
+    if (rank == root && x == root) {
+        return;
+    }
+
+    __gm__ char* buff[8] = {
+        buff0, buff1, buff2, buff3,
+        buff4, buff5, buff6, buff7
+    };
+    __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0);
+    __ubuf__ char* inputUB = (__ubuf__ char*)(64);
+
+    const int64_t singleCoreDataSize = len / corePerRank;
+    int64_t dataNumRemain = singleCoreDataSize;
+    int64_t buffOffsetNum = coreSegmentedIdx * singleCoreDataSize;
+    if (coreSegmentedIdx == corePerRank - 1) {
+        dataNumRemain = len - buffOffsetNum;
+    }
+    if (rank == root) {
+        __gm__ char *receiveBuff = (__gm__ char*)((__gm__ int64_t*)buff[x] + GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM);
+        GM2GM8(dataNumRemain, inputUB, receiveBuff, buffOffsetNum, input, buffOffsetNum);
+        SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, magic);
+        CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[x] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM), magic);
+    } else {
+        CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic);
+        __gm__ char *sendBuff = (__gm__ char*)((__gm__ int64_t*)buff[x] + GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM);
+        __gm__ char *receiveBuff = (__gm__ char*)output;
+        SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[root] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM, magic);
+    }
+}
+
+__attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWriteOrigin(ALLREDUCE_ARGS_FUN(char))
+{
+    uint32_t blockSize = UB_SINGLE_DMA_SIZE_MAX;
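+    // The synchronization below packs an epoch tag and a block counter into one
+    // int64 flag: flag = (magic << SYNC_FLAG_BIT_NUM) + currentCount. Assuming
+    // SYNC_FLAG_BIT_NUM == 10 (consistent with the `<< 10` shifts and 0x3FF masks
+    // in the allreduce kernels above), magic == 3 gives magicInner == 3072, and
+    // the flag reads 3077 once block 5 has been written out. Within one epoch the
+    // value only grows, so CheckFlagGE can wait with a plain >= comparison.
+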
int64_t magicInner = magic << SYNC_FLAG_BIT_NUM; + __gm__ char* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ char* inputUB = (__ubuf__ char*)(64); + len = CeilDiv(len, 2) * 2; + const int64_t groupSize = rankSize - 1; + const int64_t groupDataSize = blockSize * groupSize; + const int64_t groupNum = CeilDiv(len, groupDataSize); + const int64_t blockTotalNum = CeilDiv(len, blockSize); + const int64_t lastBlockNum = CeilDiv(len - (groupNum - 1) * groupDataSize, blockSize); + if (GetBlockIdx() == rank || GetBlockIdx() >= rankSize) { + return; + } + int64_t inputIndex = GetBlockIdx(); + if (GetBlockIdx() == root) { + if (rank > root) { + inputIndex = rank - 1; + } else { + inputIndex = rank; + } + } else if (GetBlockIdx() > root) { + inputIndex = GetBlockIdx() - 1; + } + int64_t blockDataOffset; + int64_t remain; + if (rank == root) { + for (int64_t currentCount = inputIndex; currentCount < blockTotalNum; currentCount += groupSize) { + blockDataOffset = currentCount * blockSize; + remain = (currentCount == blockTotalNum - 1) ? (len - (blockTotalNum - 1) : blockSize) : blockSize; + CopyInput2BuffBroadCast(inputUB, buff[GetBlockIdx()], (__gm__ char*)input, remain, blockDataOffset); + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[GetBlockIdx()] + root * MEM_DMA_UNIT_INT_NUM, magicInner + currentCount); + } + } else { + for (int64_t currentCount = inputIndex; currentCount < blockTotalNum; currentCount += groupSize) { + blockDataOffset = currentCount * blockSize; + remain = (currentCount == blockTotalNum - 1) ? (len - (blockTotalNum - 1) : blockSize) : blockSize; + CheckFlagGE(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + root * MEM_DMA_UNIT_INT_NUM), magicInner + currentCount); + AscendC::PipeBarrier(); + + if (remain > 0) { + CpGM2UB(inputUB, (__gm__ char*)((__gm__ int64_t * )buff[rank] + GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM) + blockDataOffset, remain); + AscendC::PipeBarrier(); + CpUB2GM((__gm__ char*)output + blockDataOffset, inputUB, remain); + } + } + } + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[GetBlockIdx()] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM), magic); +} + +__attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWrite(ALLREDUCE_ARGS_FUN(char)) +{ + magic = magic << SYNC_FLAG_BIT_NUM; + __gm__ char* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t postSyncFlagIdx = MEM_DMA_UNIT_INT_NUM + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + const int64_t loopNum = CeilDiv(len, IPC_BUFF_MAX_SIZE); + + for (int64_t i = 0; i < loopNum; i++) { + int64_t processedNum = i * IPC_BUFF_MAX_SIZE; + int64_t remainNum = (len - processedNum < IPC_BUFF_MAX_SIZE) ? 
len - processedNum : IPC_BUFF_MAX_SIZE; + if (i > 0) { + SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); + + for (int64_t targetNPU = 0; targetNPU < rankSize; targetNPU++) { + if (targetNPU == rank) { + continue; + } + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ char *)buff[targetNPU] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + CheckFlagGE(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + } + } + LcalBroadcast2npuBigDataWriteOrigin( + input + processedNum, output + processedNum, rank, rankSize, remainNum, magic + i, 0, root, + localRankSize, loopTime, sendCountMatrix, dumpAddr, + buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7); + AscendC::PipeBarrier(); + } +} \ No newline at end of file -- Gitee From 5ba8fae664f47b8af58cbcca249607c124896895 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:33:15 +0800 Subject: [PATCH 271/414] draft --- .../lcal/src/kernels/lcal_broadcast_write.cce | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_broadcast_write.cce b/comm/lcal/src/kernels/lcal_broadcast_write.cce index d1454e43..f65bd9e8 100644 --- a/comm/lcal/src/kernels/lcal_broadcast_write.cce +++ b/comm/lcal/src/kernels/lcal_broadcast_write.cce @@ -11,7 +11,7 @@ #include "collectives.cce" template -inline __aicore__ void GM2GM8(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, int64_t revBuffOffsetNum, +inline __aicore__ void GM2GMB8(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, int64_t revBuffOffsetNum, __gm__ T *sendBuff, int64_t sendBuffOffsetNum) { int64_t times = 0; @@ -26,8 +26,8 @@ inline __aicore__ void GM2GM8(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm_ CpUB2GMAlignB16( (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); AscendC::PipeBarrier(); times += 1; dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; @@ -36,13 +36,14 @@ inline __aicore__ void GM2GM8(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm_ return; } AscendC::PipeBarrier(); - CpGM2UBAlignB16(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + CpGM2UBAlignB16(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); AscendC::PipeBarrier(); - CpUB2GMAlignB16((__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T), - inputUB, dataSizeRemain); + CpUB2GMAlignB16( + (__gm__ T*)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + inputUB, dataSizeRemain); AscendC::PipeBarrier(); } @@ -77,14 +78,14 @@ extern "C" __global__ __aicore__ void LcalBroadcastWrite(ALLREDUCE_ARGS_FUN(char } if (rank == root) { __gm__ char *receiveBuff = (__gm__ char*)((__gm__ int64_t*)buff[x] + GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM); - GM2GM8(dataNumRemain, inputUB, receiveBuff, buffOffsetNum, input, buffOffsetNum); - SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, 
magic); - CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[x] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM), magic); + GM2GMB8(dataNumRemain, inputUB, receiveBuff, buffOffsetNum, input, buffOffsetNum); + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM), magic); } else { - CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[x] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); __gm__ char *sendBuff = (__gm__ char*)((__gm__ int64_t*)buff[x] + GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM); __gm__ char *receiveBuff = (__gm__ char*)output; - SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[root] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM, magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[root] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM, magic); } } @@ -96,7 +97,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWr buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7 }; - __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t * )(0); __ubuf__ char* inputUB = (__ubuf__ char*)(64); len = CeilDiv(len, 2) * 2; const int64_t groupSize = rankSize - 1; @@ -122,15 +123,15 @@ __attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWr if (rank == root) { for (int64_t currentCount = inputIndex; currentCount < blockTotalNum; currentCount += groupSize) { blockDataOffset = currentCount * blockSize; - remain = (currentCount == blockTotalNum - 1) ? (len - (blockTotalNum - 1) : blockSize) : blockSize; + remain = (currentCount == blockTotalNum - 1) ? (len - (blockTotalNum - 1) * blockSize) : blockSize; CopyInput2BuffBroadCast(inputUB, buff[GetBlockIdx()], (__gm__ char*)input, remain, blockDataOffset); - SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[GetBlockIdx()] + root * MEM_DMA_UNIT_INT_NUM, magicInner + currentCount); + SetFlag(ctrlFlagsUB, (__gm__ int64_t * )buff[GetBlockIdx()] + root * MEM_DMA_UNIT_INT_NUM, magicInner + currentCount); } } else { for (int64_t currentCount = inputIndex; currentCount < blockTotalNum; currentCount += groupSize) { blockDataOffset = currentCount * blockSize; - remain = (currentCount == blockTotalNum - 1) ? (len - (blockTotalNum - 1) : blockSize) : blockSize; - CheckFlagGE(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + root * MEM_DMA_UNIT_INT_NUM), magicInner + currentCount); + remain = (currentCount == blockTotalNum - 1) ? 
(len - (blockTotalNum - 1) * blockSize) : blockSize; + CheckFlagGE(ctrlFlagsUB, ((__gm__ int64_t * )buff[rank] + root * MEM_DMA_UNIT_INT_NUM), magicInner + currentCount); AscendC::PipeBarrier(); if (remain > 0) { @@ -140,8 +141,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWr } } } - SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[GetBlockIdx()] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM, magic); - CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM), magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t * )buff[GetBlockIdx()] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM, magic); + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t * )buff[rank] + (GetLcalBlockNum() + root) * MEM_DMA_UNIT_INT_NUM), magic); } __attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWrite(ALLREDUCE_ARGS_FUN(char)) @@ -163,15 +164,15 @@ __attribute__((always_inline)) inline __aicore__ void LcalBroadcast2npuBigDataWr if (i > 0) { SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); - __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); for (int64_t targetNPU = 0; targetNPU < rankSize; targetNPU++) { if (targetNPU == rank) { continue; } - __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ char *)buff[targetNPU] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; - CheckFlagGE(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ char *)buff[targetNPU] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); } } LcalBroadcast2npuBigDataWriteOrigin( -- Gitee From 86c3c8f5edd169181e57186de9ef678863c2806c Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:34:07 +0800 Subject: [PATCH 272/414] draft --- comm/lcal/src/kernels/lcal_broadcast_write.cce | 1 + 1 file changed, 1 insertion(+) diff --git a/comm/lcal/src/kernels/lcal_broadcast_write.cce b/comm/lcal/src/kernels/lcal_broadcast_write.cce index f65bd9e8..838372a0 100644 --- a/comm/lcal/src/kernels/lcal_broadcast_write.cce +++ b/comm/lcal/src/kernels/lcal_broadcast_write.cce @@ -85,6 +85,7 @@ extern "C" __global__ __aicore__ void LcalBroadcastWrite(ALLREDUCE_ARGS_FUN(char CheckFlag(ctrlFlagsUB, ((__gm__ int64_t*)buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), magic); __gm__ char *sendBuff = (__gm__ char*)((__gm__ int64_t*)buff[x] + GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM); __gm__ char *receiveBuff = (__gm__ char*)output; + GM2GMB8(dataNumRemain, inputUB, receiveBuff, buffOffsetNum, sendBuff, buffOffsetNum); SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[root] + (GetBlockIdx() + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM, magic); } } -- Gitee From a1a7dd3aa1126681f5037f484f26b342f8c15e44 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:46:39 +0800 Subject: [PATCH 273/414] draft --- .../kernels/lcal_broadcast_write_big_data.cce | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce diff --git 
a/comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce b/comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce new file mode 100644 index 00000000..94dec72c --- /dev/null +++ b/comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "collectives.cce" + +__attribute__((always_inline)) inline __aicore__ void LcalBroadcastOrigin(ALLREDUCE_ARGS_FUN(char)) +{ + uint32_t blockSize = UB_SINGLE_DMA_SIZE_MAX; + int64_t magicInner = magic << SYNC_FLAG_BIT_NUM; + __gm__ char* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t * )(0); + __ubuf__ char* inputUB = (__ubuf__ char*)(64); + len = CeilDiv(len, 2) * 2; + const int64_t groupSize = rankSize - 1; + const int64_t groupDataSize = blockSize * groupSize; + const int64_t groupNum = CeilDiv(len, groupDataSize); + const int64_t blockTotalNum = CeilDiv(len, blockSize); + const int64_t lastBlockNum = CeilDiv(len - (groupNum - 1) * groupDataSize, blockSize); + if (GetBlockIdx() == rank || GetBlockIdx() >= rankSize) { + return; + } + int64_t inputIndex = GetBlockIdx(); + if (GetBlockIdx() == root) { + if (rank > root) { + inputIndex = rank - 1; + } else { + inputIndex = rank; + } + } else if (GetBlockIdx() > root) { + inputIndex = GetBlockIdx() - 1; + } + int64_t blockDataOffset; + int64_t remain; + for (int64_t currentCount = inputIndex; currentCount < blockTotalNum; currentCount += groupSize) { + blockDataOffset = currentCount * blockSize; + remain = blockSize; + if (currentCount == blockTotalNum - 1) { + remain = len - (blockTotalNum - 1) * blockSize; + } + if (rank == root) { + CopyInput2BuffBroadCast(inputUB, buff[rank], (__gm__ char*)input, remain, blockDataOffset); + SetFlag(ctrlFlagsUB, (__gm__ int64_t * )buff[rank] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM, magicInner + currentCount); + } else { + if (GetBlockIdx() == root) { + CheckFlagGE(ctrlFlagsUB, ((__gm__ int64_t * )buff[GetBlockIdx()] + rank * MEM_DMA_UNIT_INT_NUM), + magicInner + currentCount); + } else { + CheckFlagGE(ctrlFlagsUB, ((__gm__ int64_t * )buff[GetBlockIdx()] + GetBlockIdx() * MEM_DMA_UNIT_INT_NUM), + magicInner + currentCount); + } + AscendC::PipeBarrier(); + + if (remain > 0) { + CpGM2UBAlignB16(inputUB, (__gm__ char*)((__gm__ int64_t * )buff[GetBlockIdx()] + GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM) +blockDataOffset, remain); + AscendC::PipeBarrier(); + CpUB2GMAlignB16((__gm__ char*)output + blockDataOffset, inputUB, remain); + if (GetBlockIdx() == root) { + CpUB2GMAlignB16((__gm__ char*)((__gm__ int64_t * )buff[rank] + GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM) + blockDataOffset, inputUB, remain); + SetFlag(ctrlFlagsUB, (__gm__ int64_t * )buff[rank] + rank * MEM_DMA_UNIT_INT_NUM, magicInner + currentCount); + } + } + } + } + if (rank != root) { + SetFlag(ctrlFlagsUB, (__gm__ int64_t * )buff[GetBlockIdx()] + (GetLcalBlockNum() + rank) * 
MEM_DMA_UNIT_INT_NUM, magic); + } + + if (rank == root) { + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t * )buff[rank] + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM), + magic); + } else if (GetBlockIdx() == root) { + for (int64_t i = 0; i < rankSize; i++) { + if (i == rank || i == root) { + continue; + } + CheckFlag(ctrlFlagsUB, ((__gm__ int64_t * )buff[rank] + (GetLcalBlockNum() + i) * MEM_DMA_UNIT_INT_NUM), magic); + } + } +} + +__attribute__((always_inline)) inline __aicore__ void LcalBroadcastBigData(ALLREDUCE_ARGS_FUN(char)) +{ + magic = magic << SYNC_FLAG_BIT_NUM; + __gm__ char* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t postSyncFlagIdx = MEM_DMA_UNIT_INT_NUM + (GetLcalBlockNum() + GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM; + const int64_t loopNum = CeilDiv(len, IPC_BUFF_MAX_SIZE); + + for (int64_t i = 0; i < loopNum; i++) { + int64_t processedNum = i * IPC_BUFF_MAX_SIZE; + int64_t remainNum = (len - processedNum < IPC_BUFF_MAX_SIZE) ? len - processedNum : IPC_BUFF_MAX_SIZE; + if (i > 0) { + SyncWithinNPUNew(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + MEM_DMA_UNIT_INT_NUM, magic + i); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t *)((__gm__ char *)buff[rank] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGM, (int64_t)magic + i); + + for (int64_t targetNPU = 0; targetNPU < rankSize; targetNPU++) { + if (targetNPU == rank) { + continue; + } + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t *)((__gm__ char *)buff[targetNPU] + IPC_BUFF_MAX_SIZE) + dataOffsetNum + postSyncFlagIdx; + CheckFlagNew(ctrlFlagsUB, ctrlFlagsGMX, (int64_t)magic + i); + } + } + LcalBroadcastOrigin( + input + processedNum, output + processedNum, rank, rankSize, remainNum, magic + i, 0, root, localRankSize, + loopTime, sendCountMatrix, dumpAddr, buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7); + AscendC::PipeBarrier(); + } +} \ No newline at end of file -- Gitee From ac9a331a09916b57c098865621c738f7027f295e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:47:30 +0800 Subject: [PATCH 274/414] rename --- ...l_broadcast_write_big_data.cce => lcal_broadcast_big_data.cce} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/kernels/{lcal_broadcast_write_big_data.cce => lcal_broadcast_big_data.cce} (100%) diff --git a/comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce b/comm/lcal/src/kernels/lcal_broadcast_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_broadcast_write_big_data.cce rename to comm/lcal/src/kernels/lcal_broadcast_big_data.cce -- Gitee From 45b16d1d37a57987fc10b86afd861b139b363d3d Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 25 Aug 2025 19:48:49 +0800 Subject: [PATCH 275/414] draft --- comm/lcal/src/kernels/lcal_broadcast_big_data.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_broadcast_big_data.cce b/comm/lcal/src/kernels/lcal_broadcast_big_data.cce index 94dec72c..5debb811 100644 --- a/comm/lcal/src/kernels/lcal_broadcast_big_data.cce +++ b/comm/lcal/src/kernels/lcal_broadcast_big_data.cce @@ -79,7 +79,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalBroadcastOrigin(ALLRED CheckFlag(ctrlFlagsUB, ((__gm__ int64_t * )buff[rank] + (GetLcalBlockNum() + 
GetBlockIdx()) * MEM_DMA_UNIT_INT_NUM), magic); } else if (GetBlockIdx() == root) { - for (int64_t i = 0; i < rankSize; i++) { + for (int64_t i = 0; i < rankSize; ++i) { if (i == rank || i == root) { continue; } -- Gitee From 49f07d0d2155916c6395dfcb68920e02a9a5d6b8 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:13:56 +0800 Subject: [PATCH 276/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter.cce | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/comm/lcal/src/kernels/lcal_reduce_scatter.cce new file mode 100644 index 00000000..bc14bcdb --- /dev/null +++ b/comm/lcal/src/kernels/lcal_reduce_scatter.cce @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void CpInputToBuffAndOutput(__ubuf__ T** inputUB, __gm__ T* buff, __gm__ T* input, __gm__ T* output, + int64_t dataOffsetNum, int64_t dataNumDMARemain, int64_t inputOffset, + int64_t outputOffsetNum, int32_t rank, int64_t corePerRank, + int64_t UB_SINGLE_DMA_NUM_MAX) +{ + int64_t dataProcessingBatchTime = 0; + while (dataNumDMARemain >= UB_SINGLE_DMA_NUM_MAX) { + CpGM2UB(inputUB[0], input + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + UB_SINGLE_DMA_NUM_MAX); + AscendC::PipeBarrier(); + if (GetBlockIdx() >= rank * corePerRank && (GetBlockIdx() < (rank * corePerRank + corePerRank))) { + CpUB2GM((__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + inputUB[0], UB_SINGLE_DMA_NUM_MAX); + } else { + CpUB2GM( + (__gm__ T *)((__gm__ int64_t *)buff + dataOffsetNum) + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + inputUB[0], UB_SINGLE_DMA_NUM_MAX); + } + AscendC::PipeBarrier(); + } + if (dataNumDMARemain <= 0) { + return; + } + CpGM2UB(inputUB[0], input + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + dataNumDMARemain * sizeof(T)); + AscendC::PipeBarrier(); + if (GetBlockIdx() >= rank * corePerRank && (GetBlockIdx() < (rank * corePerRank + corePerRank))) { + CpUB2GM((__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + inputUB[0], dataNumDMARemain * sizeof(T)); + AscendC::PipeBarrier(); + } else { + CpUB2GM( + (__gm__ T *)((__gm__ int64_t *)buff + dataOffsetNum) + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, + inputUB[0], dataNumDMARemain * sizeof(T)); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUCE_ARGS_FUN(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ T* inputUB[2] = 
{(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t UB_SINGLE_DMA_NUM_MAX = UB_SINGLE_DMA_SIZE_MAX / sizeof(T); + + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + const int64_t inputNum = len * rankSize; + const int64_t dataDMAPerCore = CeilDiv(len, corePerRank); + const int64_t inputOffset = GetBlockIdx() / corePerRank * len + coreSegmentedIdx * dataDMAPerCore; + + int64_t dataNumDMARemain = dataDMAPerCore; + int64_t oneNPUProcessNum = len; + int64_t oneCoreProcessNum = CeilDiv(len, corePerRank); + const int64_t outputOffsetNum = oneCoreProcessNum * (GetBlockIdx() % corePerRank); + int64_t dataSizeRemain = oneCoreProcessNum * sizeof(T); + if (coreSegmentedIdx == corePerRank - 1) { + dataNumDMARemain = len - coreSegmentedIdx * dataDMAPerCore; + dataSizeRemain = (len - coreSegmentedIdx * oneCoreProcessNum) * sizeof(T); + } + + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + AscendC::PipeBarrier(); + CpInputToBuffAndOutput(inputUB, buff[rank], input, output, dataOffsetNum, dataNumDMARemain, + inputOffset, outputOffsetNum, rank, corePerRank, UB_SINGLE_DMA_NUM_MAX); + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); + + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset1st, (int64_t)magic); + const int64_t x = GetBlockIdx() / corePerRank; + AscendC::PipeBarrier(); + if (x == rank) { + SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset2nd, (int64_t)magic); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + return; + } + const int64_t buffOffsetNum = rank * oneNPUProcessNum + outputOffsetNum; + + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, + (__gm__ int64_t*)buff[x] + (rank * corePerRank + (GetBlockIdx() % corePerRank)) * MEM_DMA_UNIT_INT_NUM, + (int64_t)magic); + ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, output, outputOffsetNum, op); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); +} \ No newline at end of file -- Gitee From 40fdf61235a1e0c3bacbb17b3d1153a788403578 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:19:56 +0800 Subject: [PATCH 277/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter.cce | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/comm/lcal/src/kernels/lcal_reduce_scatter.cce index bc14bcdb..4f03c33f 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter.cce @@ -18,17 +18,20 @@ __attribute__((always_inline)) inline __aicore__ void CpInputToBuffAndOutput(__u int64_t dataProcessingBatchTime = 0; while (dataNumDMARemain >= UB_SINGLE_DMA_NUM_MAX) { CpGM2UB(inputUB[0], input + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, - UB_SINGLE_DMA_NUM_MAX); + UB_SINGLE_DMA_SIZE_MAX); AscendC::PipeBarrier(); if (GetBlockIdx() >= rank * corePerRank && (GetBlockIdx() < (rank * corePerRank + corePerRank))) { CpUB2GM((__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, - inputUB[0], UB_SINGLE_DMA_NUM_MAX); + inputUB[0], UB_SINGLE_DMA_SIZE_MAX); } else { CpUB2GM( (__gm__ T *)((__gm__ 
int64_t *)buff + dataOffsetNum) + inputOffset + UB_SINGLE_DMA_NUM_MAX * dataProcessingBatchTime, - inputUB[0], UB_SINGLE_DMA_NUM_MAX); + inputUB[0], UB_SINGLE_DMA_SIZE_MAX); } AscendC::PipeBarrier(); + dataNumDMARemain -= UB_SINGLE_DMA_NUM_MAX; + dataProcessingBatchTime += 1; + AscendC::PipeBarrier(); } if (dataNumDMARemain <= 0) { return; @@ -57,7 +60,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7 }; - __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(98304)}; __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; const int64_t UB_SINGLE_DMA_NUM_MAX = UB_SINGLE_DMA_SIZE_MAX / sizeof(T); @@ -83,7 +86,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC AscendC::PipeBarrier(); CpInputToBuffAndOutput(inputUB, buff[rank], input, output, dataOffsetNum, dataNumDMARemain, inputOffset, outputOffsetNum, rank, corePerRank, UB_SINGLE_DMA_NUM_MAX); - SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t*)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset1st, (int64_t)magic); const int64_t x = GetBlockIdx() / corePerRank; @@ -92,9 +95,9 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC SetFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset2nd, (int64_t)magic); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); return; - } + } const int64_t buffOffsetNum = rank * oneNPUProcessNum + outputOffsetNum; - + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (rank * corePerRank + (GetBlockIdx() % corePerRank)) * MEM_DMA_UNIT_INT_NUM, (int64_t)magic); -- Gitee From 5f14258fd6650c89c08916e6db2dda537cadcc4a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:20:47 +0800 Subject: [PATCH 278/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/comm/lcal/src/kernels/lcal_reduce_scatter.cce index 4f03c33f..0e204371 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter.cce @@ -86,7 +86,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC AscendC::PipeBarrier(); CpInputToBuffAndOutput(inputUB, buff[rank], input, output, dataOffsetNum, dataNumDMARemain, inputOffset, outputOffsetNum, rank, corePerRank, UB_SINGLE_DMA_NUM_MAX); - SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t*)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); + SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset1st, (int64_t)magic); const int64_t x = GetBlockIdx() / corePerRank; -- Gitee From b30117175ac3ede7eb4615bcc332b2a82f750222 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:21:21 +0800 Subject: [PATCH 279/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter.cce | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/comm/lcal/src/kernels/lcal_reduce_scatter.cce index 0e204371..7a90b88b 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter.cce @@ -88,7 +88,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC inputOffset, outputOffsetNum, rank, corePerRank, UB_SINGLE_DMA_NUM_MAX); SyncWithinNPU(ctrlFlagsUB, (__gm__ int64_t *)((__gm__ T *)((__gm__ int64_t *)buff[rank] + dataOffsetNum) + inputNum) + MEM_DMA_UNIT_INT_NUM, magic); - SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset1st, (int64_t)magic); + SetFlag(ctrlFlagsUB, (__gm__ int64_t*)buff[rank] + flagOffset1st, (int64_t)magic); const int64_t x = GetBlockIdx() / corePerRank; AscendC::PipeBarrier(); if (x == rank) { -- Gitee From 39e60111833bd7fe19185d21d74957136cdccace Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:21:36 +0800 Subject: [PATCH 280/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter.cce | 1 + 1 file changed, 1 insertion(+) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/comm/lcal/src/kernels/lcal_reduce_scatter.cce index 7a90b88b..eaae5176 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter.cce @@ -101,6 +101,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatter(ALLREDUC CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, (__gm__ int64_t*)buff[x] + (rank * corePerRank + (GetBlockIdx() % corePerRank)) * MEM_DMA_UNIT_INT_NUM, (int64_t)magic); + ProcessData(dataSizeRemain, inputUB[0], buff[x], dataOffsetNum, buffOffsetNum, output, outputOffsetNum, op); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); } \ No newline at end of file -- Gitee From 5c68a455bf805ff1a1a59b6cdf1a259f93d13449 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:41:41 +0800 Subject: [PATCH 281/414] draft --- .../src/kernels/lcal_reduce_scatter_write.cce | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_reduce_scatter_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce new file mode 100644 index 00000000..97d663a0 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "collectives.cce" + +template +inline __aicore__ void GM2GMAndOutput(int64_t dataSizeRemain, __ubuf__ T *inputUB, __gm__ T *receiveBuff, int64_t revBuffOffsetNum, + __gm__ T *sendBuff, int64_t sendBuffOffsetNum, bool needDMA2Output, __gm__ T *output, int64_t outputOffsetNum) +{ + int64_t times = 0; + while (dataSizeRemain >= UB_SINGLE_DMA_SIZE_MAX) { + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + UB_SINGLE_DMA_SIZE_MAX); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T *)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, UB_SINGLE_DMA_SIZE_MAX); + + if (needDMA2Output) { + CpUB2GM( + (__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, UB_SINGLE_DMA_SIZE_MAX); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + times += 1; + dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; + } + if (dataSizeRemain <= 0) { + return; + } + CpGM2UB(inputUB, (__gm__ T *)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + dataSizeRemain); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + CpUB2GM( + (__gm__ T *)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, dataSizeRemain); + if (needDMA2Output) { + CpUB2GM( + (__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + inputUB, dataSizeRemain); + } + AscendC::PipeBarrier(); +} + +template +inline __aicore__ void LcalReduceScatterWrite(ALLREDUCE_ARGS_FUN(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + const int64_t corePerRank = GetLcalBlockNum() / rankSize; + const int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + const int64_t inputNum = len * rankSize; + const int64_t x = GetBlockIdx() / corePerRank; + + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(32), (__ubuf__ T*)(98304)}; + + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + + const int64_t dataDMAPerCore = CeilDiv(len, corePerRank); + int64_t buffDMAOffsetNum = coreSegmentedIdx * dataDMAPerCore; + int64_t dataNumDMARemain = dataDMAPerCore; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumDMARemain = len - coreSegmentedIdx * dataDMAPerCore; + } + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + GM2GMAndOutput(dataNumDMARemain * sizeof(T), inputUB[0], receiveBuff, rank * len + buffDMAOffsetNum, + input, x * len + buffDMAOffsetNum, (x == rank), output, buffDMAOffsetNum) + SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + flagOffset1st, magic); + AscendC::PipeBarrier(); + + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset2nd, magic); + CheckFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[rank] + flagOffset1st, magic); + const int64_t buffOffsetNum = x * len + buffDMAOffsetNum; + if (x == rank) { + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + return; + } + ProcessData(dataNumDMARemain * sizeof(T), inputUB[0], buff[rank], dataOffsetNum, buffOffsetNum, output, buffDMAOffsetNum, op); 
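+    // Assumed behavior (collectives.cce is not part of this series): ProcessData is
+    // expected to read the peer slice staged at buff[rank] + dataOffsetNum + buffOffsetNum,
+    // reduce it into the local data according to `op`, and store the result at
+    // output + buffDMAOffsetNum.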
+ DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); +} \ No newline at end of file -- Gitee From be28c080a37561698a453ab6e5a5c2541b436b6b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:45:47 +0800 Subject: [PATCH 282/414] draft --- .../src/kernels/lcal_reduce_scatter_write.cce | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce index 97d663a0..7e7e8e39 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce @@ -20,32 +20,32 @@ inline __aicore__ void GM2GMAndOutput(int64_t dataSizeRemain, __ubuf__ T *inputU AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM( - (__gm__ T *)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + (__gm__ T*)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); - + if (needDMA2Output) { CpUB2GM( - (__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + (__gm__ T*)output + outputOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, inputUB, UB_SINGLE_DMA_SIZE_MAX); } - AscendC::SetFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); times += 1; dataSizeRemain -= UB_SINGLE_DMA_SIZE_MAX; } if (dataSizeRemain <= 0) { return; } - CpGM2UB(inputUB, (__gm__ T *)sendBuff + sendBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, - dataSizeRemain); + CpGM2UB(inputUB, (__gm__ T *)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM( - (__gm__ T *)receiveBuff + revBuffOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + (__gm__ T *)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), inputUB, dataSizeRemain); if (needDMA2Output) { CpUB2GM( - (__gm__ T *)output + outputOffsetNum + UB_SINGLE_DMA_SIZE_MAX / sizeof(T) * times, + (__gm__ T *)output + outputOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), inputUB, dataSizeRemain); } AscendC::PipeBarrier(); @@ -64,24 +64,24 @@ inline __aicore__ void LcalReduceScatterWrite(ALLREDUCE_ARGS_FUN(T)) buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7 }; - __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); - __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(32), (__ubuf__ T*)(98304)}; + __ubuf__ int64_t *ctrlFlagsUB = (__ubuf__ int64_t *)(0); + __ubuf__ T *inputUB[2] = {(__ubuf__ T *)(32), (__ubuf__ T *)(98304)}; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; - const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); - const int64_t flagOffset2nd = MEM_DMA_UNIT_INT_NUM * GetLcalBlockNum() + flagOffset1st; + const int64_t flagOffset1st = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset2nd = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; const int64_t dataDMAPerCore = CeilDiv(len, corePerRank); int64_t buffDMAOffsetNum = coreSegmentedIdx * dataDMAPerCore; int64_t dataNumDMARemain = dataDMAPerCore; if (coreSegmentedIdx == corePerRank - 1) { - dataNumDMARemain = len - coreSegmentedIdx * dataDMAPerCore; + dataNumDMARemain = len - buffDMAOffsetNum; } DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); __gm__ T 
*receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); GM2GMAndOutput(dataNumDMARemain * sizeof(T), inputUB[0], receiveBuff, rank * len + buffDMAOffsetNum, - input, x * len + buffDMAOffsetNum, (x == rank), output, buffDMAOffsetNum) + input, x * len + buffDMAOffsetNum, (x == rank), output, buffDMAOffsetNum); SetFlag(ctrlFlagsUB, (__gm__ int64_t *)buff[x] + flagOffset1st, magic); AscendC::PipeBarrier(); -- Gitee From 0532fd283b6870178332fc1b3d4513f2c965dc59 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 10:47:13 +0800 Subject: [PATCH 283/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter_write.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce index 7e7e8e39..350e3ab4 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce @@ -36,16 +36,16 @@ inline __aicore__ void GM2GMAndOutput(int64_t dataSizeRemain, __ubuf__ T *inputU if (dataSizeRemain <= 0) { return; } - CpGM2UB(inputUB, (__gm__ T *)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + CpGM2UB(inputUB, (__gm__ T*)sendBuff + sendBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), dataSizeRemain); AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); CpUB2GM( - (__gm__ T *)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + (__gm__ T*)receiveBuff + revBuffOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), inputUB, dataSizeRemain); if (needDMA2Output) { CpUB2GM( - (__gm__ T *)output + outputOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), + (__gm__ T*)output + outputOffsetNum + times * UB_SINGLE_DMA_SIZE_MAX / sizeof(T), inputUB, dataSizeRemain); } AscendC::PipeBarrier(); @@ -69,7 +69,7 @@ inline __aicore__ void LcalReduceScatterWrite(ALLREDUCE_ARGS_FUN(T)) const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; const int64_t flagOffset1st = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; - const int64_t flagOffset2nd = (rank * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset2nd = (x * corePerRank + coreSegmentedIdx) * MEM_DMA_UNIT_INT_NUM; const int64_t dataDMAPerCore = CeilDiv(len, corePerRank); int64_t buffDMAOffsetNum = coreSegmentedIdx * dataDMAPerCore; -- Gitee From 31169d017654788c217f5b976bfef5588ea0af2d Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:12:18 +0800 Subject: [PATCH 284/414] draft --- .../kernels/lcal_reduce_scatter_big_data.cce | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce new file mode 100644 index 00000000..b63eda12 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataOrigin( + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, + uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t flagOffset2nd, + int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op) +{ + const int64_t inputNum = len * rankSize; + const int64_t avgNumDMAPerCore = len / corePerRank; + int64_t dataNumRemain = avgNumDMAPerCore; + + int64_t inputOffsetNum = coreSegmentedIdx * avgNumDMAPerCore; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumRemain = len - inputOffsetNum; + } + if (dataNumRemain <= 0) { + return; + } + + if (GetBlockIdx() < blockNumPerGroup) { + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; + __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + int64_t ipcBuffOffsetNum = x * len + inputOffsetNum; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, ipcBuffOffsetNum, + input, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); + return; + } + + if (x == rank) { + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset2nd; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], output, inputOffsetNum, + input, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); + return; + } + + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + rank * corePerRank + GetLcalBlockNum()) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + int64_t allDataSizeNeed2Add = dataNumRemain * sizeof(T); + AscendC::PipeBarrier(); + while (true) { + if (*ctrlFlagsUB * DMA_SIZE_PER_FLAG >= allDataSizeNeed2Add) { + break; + } + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t)); + CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMX, sizeof(int64_t)); + AscendC::PipeBarrier(); + + if (((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) { + continue; + } + if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0) { + continue; + } + + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 <= *ctrlFlagsUB2) ? 
(*ctrlFlagsUB1 - magic) : (*ctrlFlagsUB2 - magic); + if (preparedDataGroupCount <= 0 || *ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + int64_t dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + int64_t ipcBuffOffsetNum = rank * len + inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); + int64_t outputOffsetNum = inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); + + ProcessDataNew(dataSizeRemain, inputUB, buff[x], dataOffsetNum, ipcBuffOffsetNum, output, outputOffsetNum, op); + AscendC::PipeBarrier(); + + *ctrlFlagsUB = preparedDataGroupCount; + AscendC::PipeBarrier(); + } +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigData(ALLREDUCE_ARGS_FUN(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + magic *= 1024; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; + + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t corePerRank = blockNumPerGroup / rankSize; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + int64_t x = GetBlockIdx() / corePerRank; + if (GetBlockIdx() >= blockNumPerGroup) { + x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; + flagOffset1st = (GetBlockIdx() - blockNumPerGroup) * MEM_DMA_UNIT_INT_NUM; + } + int64_t flagOffset2nd = GetLcalBlockNum() * MEM_DMA_UNIT_INT_NUM + flagOffset1st; + + int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + int64_t ipcBuffMaxNumPerRank = ipcBuffMaxNum / rankSize; + int64_t dataLen = len; + + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNumPerRank); i++) { + *ctrlFlagsUB = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcBuffMaxNumPerRank; + int64_t remainNum = (dataLen - processedNum < ipcBuffMaxNumPerRank) ? 
dataLen - processedNum : ipcBuffMaxNumPerRank; + + PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + LcalReduceScatterBigDataOrigin( + buff, input + len * x + processedNum, output + processedNum, processedNum, blockNumPerGroup, rank, rankSize, + len, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, ctrlFlagsUB2, inputUB, dataOffsetNum, + flagOffset1st, flagOffset2nd, x, corePerRank, coreSegmentedIdx, op); + AscendC::PipeBarrier(); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); +} \ No newline at end of file -- Gitee From 05609cdc6d05821d35c7a7629105445a721c05a7 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:13:59 +0800 Subject: [PATCH 285/414] draft --- .../src/kernels/lcal_reduce_scatter_big_data.cce | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce index b63eda12..414d690e 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce @@ -11,9 +11,9 @@ template __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataOrigin( - __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, int64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, uint64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, - __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T* inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t flagOffset2nd, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T *inputUB[2], int64_t dataOffsetNum, int64_t flagOffset1st, int64_t flagOffset2nd, int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op) { const int64_t inputNum = len * rankSize; @@ -30,7 +30,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataOr if (GetBlockIdx() < blockNumPerGroup) { __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + flagOffset1st; - __gm__ T *receiveBuff = (__gm__ T *)((__gm__ int64_t*)buff[rank] + dataOffsetNum); + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[rank] + dataOffsetNum); int64_t ipcBuffOffsetNum = x * len + inputOffsetNum; input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, ipcBuffOffsetNum, input, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); @@ -79,7 +79,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataOr int64_t ipcBuffOffsetNum = rank * len + inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); int64_t outputOffsetNum = inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); - ProcessDataNew(dataSizeRemain, inputUB, buff[x], dataOffsetNum, ipcBuffOffsetNum, output, outputOffsetNum, op); + ProcessDataNew(dataSizeRemain, inputUB, buff[x], dataOffsetNum, ipcBuffOffsetNum, output, outputOffsetNum, op); AscendC::PipeBarrier(); *ctrlFlagsUB = preparedDataGroupCount; @@ -93,7 +93,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigData(A DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); magic *= 1024; const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; - const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); __gm__ T* 
buff[8] = { buff0, buff1, buff2, buff3, buff4, buff5, buff6, buff7 @@ -119,7 +119,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigData(A DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); - for (int64_t i = 0; i < CeilDiv(len, ipcBuffMaxNumPerRank); i++) { + for (int64_t i = 0; i < CeilDiv(dataLen, ipcBuffMaxNumPerRank); i++) { *ctrlFlagsUB = 0; AscendC::PipeBarrier(); -- Gitee From 21a37a3f7cec1804e278d14ae456904d664ea07d Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:20:38 +0800 Subject: [PATCH 286/414] draft --- .../lcal_reduce_scatter_big_data_write.cce | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce new file mode 100644 index 00000000..3fca3966 --- /dev/null +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataWriteOrigin( + __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, uint64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, + uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T *inputUB[2], int64_t dataOffsetNum, int64_t x, int64_t corePerRank, + int64_t coreSegmentedIdx, int op) +{ + const int64_t inputNum = len * rankSize; + const int64_t avgNumDMAPerCore = len / corePerRank; + int64_t dataNumRemain = avgNumDMAPerCore; + + int64_t inputOffsetNum = coreSegmentedIdx * avgNumDMAPerCore; + if (coreSegmentedIdx == corePerRank - 1) { + dataNumRemain = len - inputOffsetNum; + } + if (dataNumRemain <= 0) { + return; + } + + if (GetBlockIdx() < blockNumPerGroup) { + if (rank == x) { + return; + } + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[x] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + __gm__ T *receiveBuff = (__gm__ T*)((__gm__ int64_t*)buff[x] + dataOffsetNum); + int64_t receiveBuffOffsetNum = rank * len + inputOffsetNum; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], receiveBuff, receiveBuffOffsetNum, + input, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGMX, magic); + return; + } + + if (x == rank) { + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + input2BuffRankMagic(dataNumRemain * sizeof(T), inputUB[0], output, inputOffsetNum, + input, inputOffsetNum, ctrlFlagsUB, ctrlFlagsGM, magic); + return; + } + + *ctrlFlagsUB = 0; + *ctrlFlagsUB1 = 0; + *ctrlFlagsUB2 = 0; + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + 
rank * corePerRank) * MEM_DMA_UNIT_INT_NUM; + __gm__ int64_t* ctrlFlagsGMX = (__gm__ int64_t*)buff[rank] + (coreSegmentedIdx + x * corePerRank) * MEM_DMA_UNIT_INT_NUM; + int64_t allDataSizeNeed2Add = dataNumRemain * sizeof(T); + AscendC::PipeBarrier(); + while (true) { + if (*ctrlFlagsUB * DMA_SIZE_PER_FLAG >= allDataSizeNeed2Add) { + break; + } + + CpGM2UB(ctrlFlagsUB1, ctrlFlagsGM, sizeof(int64_t)); + CpGM2UB(ctrlFlagsUB2, ctrlFlagsGMX, sizeof(int64_t)); + AscendC::PipeBarrier(); + + if (((*ctrlFlagsUB1 >> 10) != (magic >> 10)) || ((*ctrlFlagsUB2 >> 10) != (magic >> 10))) { + continue; + } + if (*ctrlFlagsUB1 == 0 || *ctrlFlagsUB2 == 0) { + continue; + } + + int64_t preparedDataGroupCount = (*ctrlFlagsUB1 <= *ctrlFlagsUB2) ? (*ctrlFlagsUB1 - magic) : (*ctrlFlagsUB2 - magic); + if (preparedDataGroupCount <= 0 || *ctrlFlagsUB >= preparedDataGroupCount) { + continue; + } + + int64_t dataSizeRemain = (preparedDataGroupCount - *ctrlFlagsUB) * DMA_SIZE_PER_FLAG; + if (preparedDataGroupCount * DMA_SIZE_PER_FLAG > allDataSizeNeed2Add) { + dataSizeRemain = allDataSizeNeed2Add - *ctrlFlagsUB * DMA_SIZE_PER_FLAG; + } + int64_t ipcBuffOffsetNum = x * len + inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); + int64_t outputOffsetNum = inputOffsetNum + (*ctrlFlagsUB) * DMA_SIZE_PER_FLAG / sizeof(T); + + ProcessDataNew(dataSizeRemain, inputUB, buff[rank], dataOffsetNum, ipcBuffOffsetNum, output, outputOffsetNum, op); + AscendC::PipeBarrier(); + + *ctrlFlagsUB = preparedDataGroupCount; + AscendC::PipeBarrier(); + } + SetFlag(ctrlFlagsUB, ctrlFlagsGM, 0); + SetFlag(ctrlFlagsUB1, ctrlFlagsGMX, 0); +} + +template +__attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataWrite(ALLREDUCE_ARGS_FUN(T)) +{ + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + magic *= 1024; + const int64_t dataOffsetNum = GetLcalBlockNum() * 2 * MEM_DMA_UNIT_INT_NUM; + int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ int64_t* ctrlFlagsUB1 = (__ubuf__ int64_t*)(32); + __ubuf__ int64_t* ctrlFlagsUB2 = (__ubuf__ int64_t*)(64); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(96), (__ubuf__ T*)(97440)}; + + int64_t blockNumPerGroup = GetLcalBlockNum() >> 1; + int64_t corePerRank = blockNumPerGroup / rankSize; + int64_t coreSegmentedIdx = GetBlockIdx() % corePerRank; + int64_t x = GetBlockIdx() / corePerRank; + if (GetBlockIdx() >= blockNumPerGroup) { + x = (GetBlockIdx() - blockNumPerGroup) / corePerRank; + } + + int64_t ipcBuffMaxNum = IPC_BUFF_MAX_SIZE / sizeof(T); + int64_t ipcBuffMaxNumPerRank = ipcBuffMaxNum / rankSize; + int64_t dataLen = len; + + DumpLcclLogInfo(dumpAddr, LogId::INIT, static_cast(op)); + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); + for (int64_t i = 0; i < CeilDiv(dataLen, ipcBuffMaxNumPerRank); i++) { + *ctrlFlagsUB = 0; + AscendC::PipeBarrier(); + + int64_t processedNum = i * ipcBuffMaxNumPerRank; + int64_t remainNum = (dataLen - processedNum < ipcBuffMaxNumPerRank) ? 
dataLen - processedNum : ipcBuffMaxNumPerRank; + + PostSyncBigData(ctrlFlagsUB, buff, rank, rankSize, dataOffsetNum, ipcBuffMaxNum, magic, i); + LcalReduceScatterBigDataWriteOrigin( + buff, input + len * x + processedNum, output + processedNum, processedNum, blockNumPerGroup, rank, rankSize, len, remainNum, (magic + i) * 1024, ctrlFlagsUB, ctrlFlagsUB1, + ctrlFlagsUB2, inputUB, dataOffsetNum, x, corePerRank, coreSegmentedIdx, op); + AscendC::PipeBarrier(); + } + DumpLcclLogInfo(dumpAddr, LogId::PROCESS, static_cast(op)); +} \ No newline at end of file -- Gitee From f740a7a11a58a1f73e058a63d899768cc07f2055 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:21:21 +0800 Subject: [PATCH 287/414] draft --- comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce index 3fca3966..c2865900 100644 --- a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce +++ b/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce @@ -13,7 +13,7 @@ template __attribute__((always_inline)) inline __aicore__ void LcalReduceScatterBigDataWriteOrigin( __gm__ T* buff[8], __gm__ T *input, __gm__ T *output, uint64_t processedNum, int64_t blockNumPerGroup, uint32_t rank, uint32_t rankSize, int64_t allLen, int64_t len, int64_t magic, __ubuf__ int64_t* ctrlFlagsUB, __ubuf__ int64_t* ctrlFlagsUB1, - __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T *inputUB[2], int64_t dataOffsetNum, int64_t x, int64_t corePerRank, + __ubuf__ int64_t* ctrlFlagsUB2, __ubuf__ T *inputUB[2], int64_t dataOffsetNum,int64_t x, int64_t corePerRank, int64_t coreSegmentedIdx, int op) { const int64_t inputNum = len * rankSize; -- Gitee From 198ec202a6aa50e3974bf5cd4ec0279866660c6a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:39:57 +0800 Subject: [PATCH 288/414] draft --- .../src/kernels/lcal_all2all_transpose.cce | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 comm/lcal/src/kernels/lcal_all2all_transpose.cce diff --git a/comm/lcal/src/kernels/lcal_all2all_transpose.cce b/comm/lcal/src/kernels/lcal_all2all_transpose.cce new file mode 100644 index 00000000..ea34ee5d --- /dev/null +++ b/comm/lcal/src/kernels/lcal_all2all_transpose.cce @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include +#include "collectives.cce" + +template +__attribute__((always_inline)) inline __aicore__ void LcalAll2AllTranspose(ALLREDUCE_ARGS_FUN_16P(T)) +{ + int32_t width = root; + int32_t burstLen = width / rankSize; + const int64_t dataOffsetNum = GetLcalBlockNum() * 32 * MEM_DMA_UNIT_INT_NUM; + const int64_t flagOffset1st = MEM_DMA_UNIT_INT_NUM * GetBlockIdx(); + int numRows = len / width; + __gm__ T* buff[8] = { + buff0, buff1, buff2, buff3, + buff4, buff5, buff6, buff7 + }; + __ubuf__ int64_t* ctrlFlagsUB = (__ubuf__ int64_t*)(0); + __ubuf__ T* inputUB[2] = {(__ubuf__ T*)(64), (__ubuf__ T*)(97312)}; + + int32_t coreIdx = GetBlockIdx(); + int32_t coreNum = GetLcalBlockNum(); + const int64_t corePerRank = coreNum / rankSize; + const int64_t coreIdxInRank = GetBlockIdx() % corePerRank; + const int64_t coreIdxRankId = GetBlockIdx() / corePerRank; + const int64_t rowNumPerCore = CeilDiv(numRows, coreNum); + int64_t rowNumThisCore = rowNumPerCore; + if (coreIdxInRank == corePerRank - 1) { + rowNumThisCore = numRows - rowNumPerCore * (corePerRank - 1); + } + + const int64_t lenPerRank = len / rankSize; + AscendC::PipeBarrier(); + + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int32_t loopId = 0; loopId < rowNumPerCore; ++loopId) { + event_t eventId = (loopId & 1)? EVENT_ID1 : EVENT_ID0; + int32_t rowId = loopId + coreIdxInRank * rowNumPerCore; + if (rowId > numRows) { + break; + } + __gm__ T* srcPtr = (__gm__ T*)input + rowId * width + coreIdxRankId * burstLen; + __ubuf__ T* iub = (loopId & 1) ? inputUB[1] : inputUB[0]; + AscendC::WaitFlag(eventId); + CpGM2UB(iub, srcPtr, burstLen * sizeof(T)); + AscendC::SetFlag(eventId); + AscendC::WaitFlag(eventId); + __gm__ T* dstPtr = (__gm__ T*)buff[rank] + coreIdxRankId * lenPerRank + rowId * burstLen + dataOffsetNum; + if (coreIdxRankId == rank) { + dstPtr = (__gm__ T*) output + coreIdxRankId * lenPerRank + rowId * burstLen; + } + CpUB2GM(dstPtr, iub, burstLen * sizeof(T)); + AscendC::SetFlag(eventId); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + AscendC::PipeBarrier(); + + __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[rank] + flagOffset1st; + SetFlag(ctrlFlagsUB, ctrlFlagsGM, magic); + __gm__ int64_t* ctrlFlagsGMWait = (__gm__ int64_t*)buff[coreIdxRankId] + (rank * corePerRank + coreIdxInRank) * MEM_DMA_UNIT_INT_NUM; + CheckFlag((__ubuf__ int64_t*)ctrlFlagsUB, ctrlFlagsGMWait, (int64_t)magic); + + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + __gm__ T* gm_src = (__gm__ T*)buff[coreIdxRankId] + + rank * lenPerRank + coreIdxInRank * rowNumPerCore * burstLen + dataOffsetNum; + __gm__ T* gm_dst = (__gm__ T*)output + coreIdxRankId * lenPerRank + + coreIdxInRank * rowNumPerCore * burstLen; + if (coreIdxRankId != rank) { + GM2GM(rowNumThisCore * burstLen * sizof(T), inputUB[0], gm_dst, 0, gm_src, 0); + } +} \ No newline at end of file -- Gitee From 404cc3eac4354e1e182d9e96bccc511e1551397a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 26 Aug 2025 11:42:15 +0800 Subject: [PATCH 289/414] draft --- comm/lcal/src/kernels/lcal_all2all_transpose.cce | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/kernels/lcal_all2all_transpose.cce b/comm/lcal/src/kernels/lcal_all2all_transpose.cce index ea34ee5d..68a772d0 100644 --- a/comm/lcal/src/kernels/lcal_all2all_transpose.cce +++ b/comm/lcal/src/kernels/lcal_all2all_transpose.cce @@ -30,7 +30,7 @@ __attribute__((always_inline)) inline __aicore__ void LcalAll2AllTranspose(ALLRE const 
int64_t corePerRank = coreNum / rankSize; const int64_t coreIdxInRank = GetBlockIdx() % corePerRank; const int64_t coreIdxRankId = GetBlockIdx() / corePerRank; - const int64_t rowNumPerCore = CeilDiv(numRows, coreNum); + const int64_t rowNumPerCore = CeilDiv(numRows, corePerRank); int64_t rowNumThisCore = rowNumPerCore; if (coreIdxInRank == corePerRank - 1) { rowNumThisCore = numRows - rowNumPerCore * (corePerRank - 1); @@ -39,12 +39,12 @@ __attribute__((always_inline)) inline __aicore__ void LcalAll2AllTranspose(ALLRE const int64_t lenPerRank = len / rankSize; AscendC::PipeBarrier(); - AscendC::SetFlag(EVENT_ID0); - AscendC::SetFlag(EVENT_ID1); - for (int32_t loopId = 0; loopId < rowNumPerCore; ++loopId) { + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for(int32_t loopId = 0; loopId < rowNumPerCore; ++loopId) { event_t eventId = (loopId & 1)? EVENT_ID1 : EVENT_ID0; int32_t rowId = loopId + coreIdxInRank * rowNumPerCore; - if (rowId > numRows) { + if (rowId >= numRows) { break; } __gm__ T* srcPtr = (__gm__ T*)input + rowId * width + coreIdxRankId * burstLen; @@ -60,8 +60,8 @@ __attribute__((always_inline)) inline __aicore__ void LcalAll2AllTranspose(ALLRE CpUB2GM(dstPtr, iub, burstLen * sizeof(T)); AscendC::SetFlag(eventId); } - AscendC::WaitFlag(EVENT_ID0); - AscendC::WaitFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); AscendC::PipeBarrier(); __gm__ int64_t* ctrlFlagsGM = (__gm__ int64_t*) buff[rank] + flagOffset1st; @@ -76,6 +76,6 @@ __attribute__((always_inline)) inline __aicore__ void LcalAll2AllTranspose(ALLRE __gm__ T* gm_dst = (__gm__ T*)output + coreIdxRankId * lenPerRank + coreIdxInRank * rowNumPerCore * burstLen; if (coreIdxRankId != rank) { - GM2GM(rowNumThisCore * burstLen * sizof(T), inputUB[0], gm_dst, 0, gm_src, 0); + GM2GM(rowNumThisCore * burstLen * sizeof(T), inputUB[0], gm_dst, 0, gm_src, 0); } } \ No newline at end of file -- Gitee From 39e634c9c79d10575dd8fc85cfeac5d2d9e8be28 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 15:24:12 +0800 Subject: [PATCH 290/414] 4 --- comm/lcal/src/ascendc_kernels/lccl_op.h | 4 +- comm/lcal/src/kernels/coc_internal.cpp | 161 ++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 2 deletions(-) create mode 100644 comm/lcal/src/kernels/coc_internal.cpp diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index 5019f4f9..cb53b646 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -128,7 +128,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ constexpr int32_t rankSize910a3 = 16; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ - if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ + if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ if (len * sizeof(type) < SIZE_OF_8M) { \ LcalAllReduce2npuWrite(ALLREDUCE_ARGS_CALL_16P(type)); \ } else { \ @@ -215,7 +215,7 @@ extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ constexpr int32_t a3BigDataSize = 32 * 1024 * 1024; \ constexpr int32_t a3SupportRankSize = 4; \ - constexpr int32_t smallRankSize = 8; \ + constexpr int32_t smallRankSize = 8; \ const bool isDbRing = (rankSize == a3SupportRankSize || rankSize == smallRankSize) && \ (len * sizeof(type) * smallRankSize > cceSmallDataSize && \ len * sizeof(type) * smallRankSize <= a3BigDataSize); \ diff --git a/comm/lcal/src/kernels/coc_internal.cpp 
b/comm/lcal/src/kernels/coc_internal.cpp new file mode 100644 index 00000000..dc27f79e --- /dev/null +++ b/comm/lcal/src/kernels/coc_internal.cpp @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef LCAL_COC_INTERNAL_H +#define LCAL_COC_INTERNAL_H + +#include +#include "kernel_operator.h" +#include "coc_const_args.cce" + +using namespace AscendC + +template + +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) +{ + LocalTensor tensor; + TBuffAddr taddr; + taddr.bufferAddr = reinterpret_cast(addr); + tensor.SetAddr(taddr); + return tensor; +} + +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset) +{ + LocalTensor tensor; + tensor.address_.bufferAddr = buffer_offset; + return tensor; +} + +template +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset, uint32_t logic_pos) +{ + LocalTensor tensor; + tensor.address_.logicPos = logic_pos; + tensor.address_.bufferAddr = buffer_offset; + return tensor; +} + +template +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__gm__ T *addr) +{ + GlobalTensor tensor; + tensor.SetGlobalBuffer(addr); + return tensor; +} + +template +inline __aicore__ void FFTSCrossCoreSync(uint64_t mode, uint64_t flag_id) +{ + uint64_t config = 1 | (mode << 4) | (flag_id << 8); + ffts_cross_core_sync(pipe, config); +} + +template +inline __aicore__ void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, uint8_t sid, uint16_t nBurst, uint16_t lenBurst, + uint16_t srcStride, uint16_t dstStride) +{ + LocalTensor srcTensor = CreateLocalTensor(src); + LocalTensor dstTensor = CreateLocalTensor(dst); + DataCopyParams repeatParams(nBurst, lenBurst, srcStride, dstStride); + DataCopy(dstTensor, srcTensor, repeatParams); +} + +template +inline __aicore__ void Vconv(__ubuf__ Tdst *dst, __ubuf__ Tsrc *src, uint8_t repeat, uint16_t dstBlockStride, + uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride, + const RoundMode &roundMode = RoundMode::CAST_NONE) +{ + LocalTensor srcTensor = CreateLocalTensor(src); + GlobalTensor dstTensor = CreateLocalTensor(dst); + UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride); + Cast(dstTensor, srcTensor, roundMode, -1, repeat, repeatParams); +} + +template +inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint16_t dstBlockStride, + uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride, + uint8_t src0RepeatStride, uint8_t src1RepeatStride) +{ + LocalTensor srcTensor0 = CreateLocalTensor(src0); + LocalTensor srcTensor1 = CreateLocalTensor(src1); + LocalTensor dstTensor = CreateLocalTensor(dst); + BinaryRepeatParams repeatParams(dstBlockStride, src0BlockStride, src1BlockStride, dstRepeatStride, src0RepeatStride, + src1RepeatStride); + Add(dstTensor, srcTensor0, srcTensor1, -1, repeat, repeatParams); +} + +template +inline __aicore__ void Vadds(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, 
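/* The Vadd/Vadds/Vmul/Vmuls wrappers in this family are thin shims that let
 * kernel code keep raw __ubuf__ pointers: each builds LocalTensor views via
 * CreateLocalTensor and forwards to the AscendC Add/Adds/Mul/Muls calls,
 * passing -1 as the mask/count argument so the repeat and stride parameters
 * alone describe the element layout. */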
uint8_t repeat, + uint16_t dstBlockStride, uint8_t srcBlockStride, uint8_t dstRepeatStride, + uint8_t srcRepeatStride) +{ + LocalTensor srcTensor = CreateLocalTensor(src); + LocalTensor dstTensor = CreateLocalTensor(dst); + UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride, + src1RepeatStride); + Adds(dstTensor, srcTensor, srcTensor1, -1, repeat, repeatParams); +} + +template +inline __aicore__ void Vmul(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint16_t dstBlockStride, + uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride, + uint8_t src0RepeatStride, uint8_t src1RepeatStride) +{ + LocalTensor srcTensor0 = CreateLocalTensor(src0); + LocalTensor srcTensor1 = CreateLocalTensor(src1); + LocalTensor dstTensor = CreateLocalTensor(dst); + BinaryRepeatParams repeatParams(dstBlockStride, src0BlockStride, src1BlockStride, dstRepeatStride, src0RepeatStride, + src1RepeatStride); + Mul(dstTensor, srcTensor0, srcTensor1, -1, repeat, repeatParams); +} + +template +inline __aicore__ void Vmuls(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, + uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, + uint8_t srcRepeatStride) +{ + LocalTensor srcTensor = CreateLocalTensor(src); + LocalTensor dstTensor = CreateLocalTensor(dst); + UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride, + src1RepeatStride); + Muls(dstTensor, srcTensor, srcTensor1, -1, repeat, repeatParams); +} + +inline __aicore__ bool IsQuant(const QuantGranularity &granularity) +{ + return (granularity > QuantGranularity::QUANT_GRANULARITY_UNDEFINED) && + (granularity < QuantGranularity::QUANT_GRANULARITY_PER_TENSOR); +} + +#define COC_ARGS_FUN_IIO(T_INPUT1, T_INPUT2, T_OUTPUT) \ + __gm__ T_INPUT1 *gm_a, __gm__ T_INPUT2 *gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_gamma, \ + __gm__ T_OUTPUT *gm_out, __gm__ T_OUTPUT *gm_allgather_out, GM_ADDR gm_workspace, \ + GM_ADDR gm_dequant_scale, GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, \ + GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, \ + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t *num_global_tokens_per_local_expert, \ + __gm__ int32_t *global_token_per_expert_matrix, GM_ADDR para_gm + +#define COC_ARGS_FUN_IO(T_INPUT, T_OUTPUT) COC_ARGS_FUN_IIO(T_INPUT, T_INPUT, T_OUTPUT) + +#define COC_ARGS_FUN(T) COC_ARGS_FUN_IO(T, T) + +#define COC_ARGS_CALL() + +#define COC_ARGS_CALL_INT8() \ + +#define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ + +#define PP_MATMUL_AIC_ARGS_FUN() \ + +#define PP_MATMUL_AIC_ARGS_CALL() + +#define PP_MATMUL_AIV_PADDING_ARGS_FUN() + -- Gitee From efacabd6ab8da481cb23f024a6402c28f2de0c06 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 15:42:10 +0800 Subject: [PATCH 291/414] 8 --- comm/lcal/src/kernels/coc_internal.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cpp b/comm/lcal/src/kernels/coc_internal.cpp index dc27f79e..06b546d3 100644 --- a/comm/lcal/src/kernels/coc_internal.cpp +++ b/comm/lcal/src/kernels/coc_internal.cpp @@ -18,7 +18,7 @@ using namespace AscendC template -FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) { LocalTensor tensor; TBuffAddr taddr; @@ -27,7 +27,7 @@ FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) return tensor; } -FORCE_INLINE_AICORE 
LocalTensor CreateLocalTensor(uint32_t buffer_offset) +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset) { LocalTensor tensor; tensor.address_.bufferAddr = buffer_offset; @@ -93,7 +93,7 @@ inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, } template -inline __aicore__ void Vadds(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, +inline __aicore__ void Vadds(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, uint16_t dstBlockStride, uint8_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride) { @@ -118,7 +118,7 @@ inline __aicore__ void Vmul(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, } template -inline __aicore__ void Vmuls(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, +inline __aicore__ void Vmuls(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride) { @@ -155,7 +155,6 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define PP_MATMUL_AIC_ARGS_FUN() \ -#define PP_MATMUL_AIC_ARGS_CALL() #define PP_MATMUL_AIV_PADDING_ARGS_FUN() -- Gitee From a774b1780dd8961c053d9f2845fb981c655861c4 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 15:50:05 +0800 Subject: [PATCH 292/414] 6 --- comm/lcal/src/{CmakeLists.txt => CMakeLists.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/{CmakeLists.txt => CMakeLists.txt} (100%) diff --git a/comm/lcal/src/CmakeLists.txt b/comm/lcal/src/CMakeLists.txt similarity index 100% rename from comm/lcal/src/CmakeLists.txt rename to comm/lcal/src/CMakeLists.txt -- Gitee From 9b506792ac36f08d47487ba0ed5845ae25a7f243 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 15:56:09 +0800 Subject: [PATCH 293/414] 9 --- CMakeLists.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1606b57..a7eb81d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,9 +83,9 @@ include_directories( ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src/include ${PROJECT_SOURCE_DIR}/src/kernels/include - ${PROJECT_SOURCE_DIR}/src/kernels/include/lcal - ${PROJECT_SOURCE_DIR}/src/kernels/include/lcal/lcoc - ${PROJECT_SOURCE_DIR}/src/kernels/include/lcal/tiling + ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcal + ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc + ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc/tiling ${PROJECT_SOURCE_DIR}/3rdparty/mki/include ${PROJECT_SOURCE_DIR}/3rdparty/nlohmannJson/include $ENV{ASCEND_HOME_PATH}/include @@ -111,6 +111,7 @@ if(BUILD_TEST_FRAMEWORK OR USE_UNIT_TEST OR USE_PYTHON_TEST OR USE_FUZZ_TEST OR set(CMAKE_CXX_OUTPUT_EXTENSION_REPLACE 1) add_subdirectory(tests) endif() +add_subdirectory(comm/lcal) add_subdirectory(src) if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) @@ -123,8 +124,8 @@ message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) 
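# liblcal.so and liblcal_static.a now come from the in-tree comm/lcal build
# (see the add_subdirectory(comm/lcal) added above) instead of the prebuilt
# 3rdparty/asdops copies, so the install() sources below point into the
# build tree.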
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal_static.a DESTINATION lib) +install(FILES ${PROJECT_SOURCE_DIR}/build/comm/lcal/src/liblcal.so DESTINATION lib) +install(FILES ${PROJECT_SOURCE_DIR}/build/comm/lcal/src/liblcal_static.a DESTINATION lib) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libasdops_aicpu_kernels.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libtbe_adapter.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libcann_ops_adapter.so DESTINATION lib OPTIONAL) -- Gitee From 878450bdbe49aa74aa2577bc943233301e670f7c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 17:17:16 +0800 Subject: [PATCH 294/414] 78 --- CMakeLists.txt | 5 +--- comm/lcal/src/kernels/coc_internal.cce | 0 comm/lcal/src/kernels/coc_internal.cpp | 35 +++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 7 deletions(-) delete mode 100644 comm/lcal/src/kernels/coc_internal.cce diff --git a/CMakeLists.txt b/CMakeLists.txt index a7eb81d8..328b61a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,21 +111,18 @@ if(BUILD_TEST_FRAMEWORK OR USE_UNIT_TEST OR USE_PYTHON_TEST OR USE_FUZZ_TEST OR set(CMAKE_CXX_OUTPUT_EXTENSION_REPLACE 1) add_subdirectory(tests) endif() -add_subdirectory(comm/lcal) add_subdirectory(src) if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") - +add_subdirectory(comm/lcal) message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) 
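# comm/lcal is now pulled in via add_subdirectory() only after
# CMAKE_INSTALL_PREFIX has been set, presumably so its own install() rules
# see the final prefix; the explicit liblcal install(FILES ...) lines are no
# longer needed here.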
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/build/comm/lcal/src/liblcal.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/build/comm/lcal/src/liblcal_static.a DESTINATION lib) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libasdops_aicpu_kernels.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libtbe_adapter.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libcann_ops_adapter.so DESTINATION lib OPTIONAL) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce deleted file mode 100644 index e69de29b..00000000 diff --git a/comm/lcal/src/kernels/coc_internal.cpp b/comm/lcal/src/kernels/coc_internal.cpp index 06b546d3..48341789 100644 --- a/comm/lcal/src/kernels/coc_internal.cpp +++ b/comm/lcal/src/kernels/coc_internal.cpp @@ -147,14 +147,43 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define COC_ARGS_FUN(T) COC_ARGS_FUN_IO(T, T) -#define COC_ARGS_CALL() +#define COC_ARGS_CALL() \ + gm_a, gm_b, gm_bias, gm_gamma, gm_out, gm_allgather_out, gm_workspace, gm_dequant_scale, gm_dequant_offset, \ + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ + num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ + global_token_per_expert_matrix, para_gm #define COC_ARGS_CALL_INT8() \ + reinterpret_cast(gm_a), reinterpret_cast(gm_b), reinterpret_cast(gm_bias), \ + reinterpret_cast(gm_gamma), reinterpret_cast(gm_out), \ + reinterpret_cast(gm_allgather_out), gm_workspace, gm_dequant_scale, gm_dequant_offset, \ + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ + num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ + global_token_per_expert_matrix, para_gm #define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ + GM_ADDR gm_a, GM_ADDR gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_c, \ + __gm__ T_OUTPUT *gm_peer_mem, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ + GM_ADDR gm_dequant_offset, int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, \ + int32_t k0, int32_t n0, int32_t m_loop, int32_t k_loop, int32_t n_loop, int32_t core_loop, \ + int32_t swizzl_count, int32_t swizzl_direct, int32_t rank, int32_t rank_size, int32_t p_value, \ + int32_t withSerialMode, QuantGranularity quant_granularity, QuantGranularity dequant_granularity, \ + int32_t ag_dim, int32_t rs_dim, bool inner_dim_is_Ag, bool weight_nz, bool is_91093, \ + __gm__ int32_t *num_local_tokens_per_expert, __gm__ int32_t * num_global_tokens_per_local_expert, \ + __gm__ int32_t *global_tokens_per_expert_matrix, int32_t local_expert_nums, int32_t EP, int32_t TP, \ + int32_t maxOutputSize, int32_t is_moe, bool is_deterministic, int32_t buffer_size \ #define PP_MATMUL_AIC_ARGS_FUN() \ - + reinterpret_cast(gm_a), reinterpret_cast(gm_b), gm_bias, gm_c, gm_peer_mem, \ + reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ + reinterpret_cast(gm_dequant_offset), batch_size, m, k, n, m0, k0, n0, m_loop, k_loop, \ + n_loop, core_loop, swizzl_count, swizzl_direct, rank, rank_size, p_value, withSerialMode, quant_granularity, \ + dequant_granularity, ag_dim, rs_dim, inner_dim_is_Ag, weight_nz, is_91093, \ + num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ + global_tokens_per_expert_matrix, local_expert_nums, EP, TP, maxOutputSize, is_moe, is_deterministic, buffer_size 
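/* Throughout this header the *_ARGS_FUN macros expand to parameter lists used
 * at kernel definition sites, and the matching *_ARGS_CALL macros expand to
 * the argument lists that forward those same names one call level down. A
 * minimal sketch of the pattern (hypothetical names, not part of this file):
 *
 *   #define DEMO_ARGS_FUN(T) __gm__ T *gm_a, int32_t m
 *   #define DEMO_ARGS_CALL() gm_a, m
 *   template <typename T> __aicore__ void DemoImpl(DEMO_ARGS_FUN(T)) {}
 *   template <typename T> __aicore__ void Demo(DEMO_ARGS_FUN(T))
 *   {
 *       DemoImpl<T>(DEMO_ARGS_CALL());   // forwards the same parameter names
 *   }
 *
 * Keeping both lists side by side stops definition and call sites from
 * drifting apart as parameters are added. */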
#define PP_MATMUL_AIV_PADDING_ARGS_FUN() - + GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ + GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ + int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ + -- Gitee From 24d1ed11feff830c694c7bba38b89aab4011f38a Mon Sep 17 00:00:00 2001 From: Denver Date: Tue, 26 Aug 2025 11:03:12 +0000 Subject: [PATCH 295/414] Delete file comm/lcal/src/test.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- comm/lcal/src/test.cpp | 1 - 1 file changed, 1 deletion(-) delete mode 100644 comm/lcal/src/test.cpp diff --git a/comm/lcal/src/test.cpp b/comm/lcal/src/test.cpp deleted file mode 100644 index 75fa785d..00000000 --- a/comm/lcal/src/test.cpp +++ /dev/null @@ -1 +0,0 @@ -// test \ No newline at end of file -- Gitee From f18417c4238e39c647c4ac96dbd310f1a76da57b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 20:43:26 +0800 Subject: [PATCH 296/414] 4 --- comm/lcal/src/kernels/coc_internal.cpp | 289 ++++++++++++++++++++++++- 1 file changed, 288 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_internal.cpp b/comm/lcal/src/kernels/coc_internal.cpp index 48341789..42d60d85 100644 --- a/comm/lcal/src/kernels/coc_internal.cpp +++ b/comm/lcal/src/kernels/coc_internal.cpp @@ -186,4 +186,291 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ - + QuantGranularity dequant_granularity, int32_t dequant_group_size, QuantGranularity quant_granularity, \ + int32_t quant_group_size, int32_t weight_nz, int32_t is_moe, int32_t is_moe_averaged, int32_t is_alltoallvc, \ + int32_t EP, int32_t TP, int32_t local_expert_nums, bool is_deterministic + +#define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ + reinterpret_cast(gm_a), reinterpret_cast(gm_b), \ + reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ + reinterpret_cast(gm_dequant_offset), reinterpret_cast(gm_quant_scale), \ + reinterpret_cast(gm_quant_offset), batch_size, m, k, n, trans_a, trans_b, is_int8, \ + dequant_granularity, dequant_group_size, quant_granularity, quant_group_size, weight_nz, is_moe, \ + is_moe_averaged, is_alltoallvc, EP, TP, local_expert_nums, is_deterministic + +#define PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN() \ + GM_ADDR gm_bias, GM_ADDR gm_out, int32_t batch_size, int32_t m, int32_t n, int32_t rank_size + +#define PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL() \ + reinterpret_cast(gm_bias), reinterpret_cast(gm_out), batch_size, m, n, rank_size + +#define PP_MATMUL_AIV_POST_ARGS_CALL() + reinterpret_cast(gm_out), reinterpret_cast(gm_bias), \ + reinterpret_cast(gm_gamma), reinterpret_cast(para_gm) + +#define PP_MATMUL_AIV_POST_ARGS_FUN() \ + GM_ADDR gm_out, GM_ADDR gm_bias, GM_ADDR gm_gamma, GM_ADDR para_gm + +#define TEMPLATE_ARGS_FUN() bool ALIGN = true, bool IS_INT8 = false, bool HAVE_BIAS = false, typename T = half + +#define TEMPLATE_ARGS_FUN() ALIGN, IS_INT8, HAVE_BIAS, T + +inlie __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t k, int32_t n, int32_t m_align, + int32_t k_align, int32_t n_align, int32_t &aligned_a, int32_t &aligned_b) +{ + if (!trans_a) { + aligned_a = k != k_align; + }
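/* aligned_a / aligned_b flag whether A or B must be padded before the matmul:
 * the flag is set when the operand's inner (contiguous) dimension differs
 * from its padded size (k vs k_align, and so on); a transposed A with m == 1
 * is exempt since a single row needs no padding pass. */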
else { + aligned_a = (m != m_align && m != 1); + } + if (!trans_b) { + aligned_b = (n != n_align); + } else { + aligned_b = (k != k_align); + } +} + +inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t n_loop, int32_t swizzl_direction, + int32_t swizzl_count, int32_t &m_idx, int32_t &n_idx) +{ + uint32_t in_batch_idx = loop_idx % (m_loop * n_loop); + if (swizzl_direction == 0) { + uint32_t tile_block_loop = (m_loop + swizzl_count - 1) / swizzl_count; + uint32_t tile_block_idx = in_batch_idx / (swizzl_count * n_loop); + uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * n_loop); + uint32_t n_row = swizzl_count; + if (tile_block_idx == tile_block_loop - 1) { + n_row = m_loop - swizzl_count * tile_block_idx; + } + m_idx = tile_block_idx * swizzl_count + in_tile_block_idx % n_row; + n_idx = in_tile_block_idx / n_row; + if (tile_block_idx % 2 != 0) [ + n_idx = n_loop - n_idx - 1; + ] + } else if (swizzl_direction == 1) { + uint32_t tile_block_loop = (n_loop + swizzl_count - 1) / swizzl_count; + uint32_t tile_block_idx = in_batch_idx / (swizzl_count * m_loop); + uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * m_loop); + uint32_t n_col = swizzl_count; + if (tile_block_idx == tile_block_loop - 1) { + n_col = n_loop - swizzl_count * tile_block_idx; + } + m_idx = in_tile_block_idx / n_col; + n_idx = tile_block_idx * swizzl_count + in_tile_block_idx % n_col; + if (tile_block_idx % 2 != 0) { + m_idx = m_loop - m_idx - 1; + } + } +} + +template +FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint32_t gmGap, uint32_t ubufGap = 0) +{ + if constexpr (sizeof(T) == 8) { + CopyGmToUbufAlign(reinterpret_cast<__ubuf__ int32_t *>(dst), reinterpret_cast<__gm__ int32_t *>(src), + nBurst * 2, lenBurst * 2, gmGap, ubufGap); + return; + } + DataCopyParams dataCopyParams(nBurst, + (Block32B::Count(lenBurst)), + (Block32B::Count(gmGap)), + (ubufGap) + ); + DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(dst); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(src); + if (Block32B::IsAligned(lenBurst) && Block32B::IsAligned(gmGap)) { + DataCopy(ubTensor, gmTensor, dataCopyAlignParams); + } else { + DataCopyPadExtParams padParams; + DataCopyPad(ubTensor, gmTensor, dataCopyAlignParams, padParams); + } +} + +template +FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint32_t gmGap, uint32_t ubufGap = 0) +{ + DataCopyParams dataCopyParams(nBurst, + static_cast(Block32B::Count(lenBurst)), + static_cast(ubufGap), + static_cast(Block32B::Count(gmGap)) + ); + DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(dst); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(dst); + if (Block32B::IsAligned(lenBurst) && Block32B::IsAligned(gmGap)) { + DataCopy(ubTensor, gmTensor, dataCopyAlignParams); + } else { + DataCopyPadExtParams padParams; + DataCopyPad(gmTensor, ubTensor, dataCopyAlignParams); + } +} + +template +FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint16_t 
srcStride, uint16_t dstStride) +{ + DataCopyExtParams dataCopyParams(nBurst, + lenBurst, + srcStride, + dstStride, + 0); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(dst); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(dst)); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); +} + +template +FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint16_t srcStride, uint16_t dstStride) +{ + DataCopyExtParams dataCopyParams(nBurst, + lenBurst, + srcStride, + dstStride, + ); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(dst); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(src); + DataCopyPad(ubTensor, gmTensor, dataCopyParams); +} + +template +FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint16_t srcStride, uint16_t dstStride) +{ + DataCopyExtParams dataCopyParams(nBurst, + lenBurst, + srcStride, + dstStride, + ); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(src); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(dst); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); +} + +template +FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, + uint16_t srcStride, uint16_t dstStride) +{ + DataCopyExtParams dataCopyParams(nBurst, + lenBurst, + srcStride, + dstStride, + ); + LocalTensor ubTensor; + TBuffAddr ubAddr; + ubAddr.logicPos = static_cast(TPosition::VECIN); + ubAddr.bufferAddr = reinterpret_cast(src); + ubTensor.SetAddr(ubAddr); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(dst); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); +} + +template +FORCE_INLINE_AICORE void CopyUbufToGmUnknow(bool ALIGN, __gm__ T *dst, __ubuf__ T*src, uint16_t nBurst, + uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) +{ + if (ALIGN) { + CopyUbufToGm(dst, src, nBurst, lenBurst / 32, srcStride, dstStride / 32); + } else { + CopyUbufToGmAlignB16(dst, src, nBurst, lenBurst, srcStride, dstStride); + } +} + +template +FORCE_INLINE_AICORE void VectorDup(__ubuf__ T *dst, const T &src, uint8_t repeat, uint16_t dstBlockStride, + uint8_t dstRepeatStride) +{ + LocalTensor ubTensor = CreateLocalTensor(dst); + Duplicate(ubTensor, src, -1, repeat, dstBlockStride, dstRepeatStride); +} + +template +struce CoCBuffAddrAndArgs { +public: + __aicore__ inline CoCBuffAddrAndArgs(COC_ARGS_FUN(T)) + { + GlobalTensor commArgsGm; + commArgsGm.SetGlobalBuffer(reinterpret_cast<__gm__ int *>(coc_comm_args), 2); + rank = commArgsGm.GetValue(0); + localRank = commArgsGm.GetValue(1); + rankSize = commArgsGm.GetValue(2); + localRankSize = commArgsGm.GetValue(3); + extraFlag = commArgsGm.GetValue(4); + RDMA = (extraFlag & ExtraFlag::RDMA) != 0; + TOPO_910B2C = (extraFlag & ExtraFlag::TOPO_910B2C) != 0; + TOOP_910_93 = (extraFlag & ExtraFlag::TOPO_910_93) != 0; + DETERMINISTIC = (extraFlag & ExtraFlag::DETERMINISTIC) != 0; + QUANT_FP16 = (extraFlag & ExtraFlag::QUANT_FP16) != 0; + QUANT_FP32 = (extraFlag & ExtraFlag::QUANT_FP32) != 0; + GlobalTensor<__gm__ T *> peerMemsAddrGm; + peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CoCCommArgs 
*>(coc_comm_args))->peerMems[0], + LCAL_MAX_RANK_SIZE); + for (int i = 0; i < rankSize; ++i) { + buff[i] = peerMemsAddrGm.GetValue(i); + } + } + + int rank; + int localRank; + int rankSize; + int localRankSize; + int extraFlag; + bool RDMA; + bool TOPO_910B2C; + bool TOOP_910_93; + bool DETERMINISTIC; + bool QUANT_FP16; + bool QUANT_FP32; + __gm__ T *buff[LCAL_MAX_RANK_SIZE]; +}; + +FORCE_INLINE_AICORE void CommMatrixTrunc(__gm__ int32_t* global_tokens_per_expert_matrix, __gm__ int32_t* workspace, int32_t EP, int32_t local_expert_nums, int32_t maxOutputSize) +{ + int32_t expert_nums = local_expert_nums * EP; + for (int32_t i = 0; i < EP; i++) { + int32_t sum_tokens = 0; + for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { + int32_t expert_id = i * local_expert_nums + local_expert_id; + for (int32_t j = 0; j < EP; j++) { + if (sum_tokens + global_tokens_per_expert_matrix[j * expert_nums + expert_id] + >= maxOutputSize) { + workspace[j * expert_nums + expert_id] = maxOutputSize - sum_tokens; + sum_tokens = maxOutputSize; + } else { + workspace[j * expert_nums + expert_id] = global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + sum_tokens += global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + } + } + } +} + +#endif // LCAL_COC_INTERNAL_H \ No newline at end of file -- Gitee From 9ae7894cc7d7c5ee3ebb7d2bfd0ace882373733a Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Tue, 26 Aug 2025 20:44:17 +0800 Subject: [PATCH 297/414] 6 --- comm/lcal/src/kernels/{coc_internal.cpp => coc_internal.cce} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename comm/lcal/src/kernels/{coc_internal.cpp => coc_internal.cce} (100%) diff --git a/comm/lcal/src/kernels/coc_internal.cpp b/comm/lcal/src/kernels/coc_internal.cce similarity index 100% rename from comm/lcal/src/kernels/coc_internal.cpp rename to comm/lcal/src/kernels/coc_internal.cce -- Gitee From 1140cdec4fb796a91ea97ccf33b10ad42057e444 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 09:11:38 +0800 Subject: [PATCH 298/414] 4 --- comm/lcal/src/kernels/coc_internal.cce | 79 +++++++++++++------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 42d60d85..0fb8c666 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -14,7 +14,7 @@ #include "kernel_operator.h" #include "coc_const_args.cce" -using namespace AscendC +using namespace AscendC; template @@ -27,6 +27,7 @@ FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) return tensor; } +template FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset) { LocalTensor tensor; @@ -35,7 +36,7 @@ FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset) } template -FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset, uint32_t logic_pos) +FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset, uint8_t logic_pos) { LocalTensor tensor; tensor.address_.logicPos = logic_pos; @@ -44,7 +45,7 @@ FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(uint32_t buffer_offset, uin } template -FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__gm__ T *addr) +FORCE_INLINE_AICORE GlobalTensor CreateGlobalTensor(__gm__ T *addr) { GlobalTensor tensor; tensor.SetGlobalBuffer(addr); @@ -74,13 +75,13 @@ inline __aicore__ void Vconv(__ubuf__ Tdst *dst, __ubuf__ Tsrc *src, uint8_t rep const RoundMode &roundMode = 
RoundMode::CAST_NONE) { LocalTensor srcTensor = CreateLocalTensor(src); - GlobalTensor dstTensor = CreateLocalTensor(dst); + LocalTensor dstTensor = CreateLocalTensor(dst); UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride); Cast(dstTensor, srcTensor, roundMode, -1, repeat, repeatParams); } template -inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint16_t dstBlockStride, +inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint8_t dstBlockStride, uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride, uint8_t src0RepeatStride, uint8_t src1RepeatStride) { @@ -94,18 +95,17 @@ inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, template inline __aicore__ void Vadds(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, - uint16_t dstBlockStride, uint8_t srcBlockStride, uint8_t dstRepeatStride, + uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, uint8_t srcRepeatStride) { LocalTensor srcTensor = CreateLocalTensor(src); LocalTensor dstTensor = CreateLocalTensor(dst); - UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride, - src1RepeatStride); - Adds(dstTensor, srcTensor, srcTensor1, -1, repeat, repeatParams); + UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride); + Adds(dstTensor, srcTensor, scalarValue, -1, repeat, repeatParams); } template -inline __aicore__ void Vmul(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint16_t dstBlockStride, +inline __aicore__ void Vmul(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, uint8_t repeat, uint8_t dstBlockStride, uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride, uint8_t src0RepeatStride, uint8_t src1RepeatStride) { @@ -124,15 +124,14 @@ inline __aicore__ void Vmuls(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarVa { LocalTensor srcTensor = CreateLocalTensor(src); LocalTensor dstTensor = CreateLocalTensor(dst); - UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride, - src1RepeatStride); - Muls(dstTensor, srcTensor, srcTensor1, -1, repeat, repeatParams); + UnaryRepeatParams repeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride); + Muls(dstTensor, srcTensor, scalarValue, -1, repeat, repeatParams); } inline __aicore__ bool IsQuant(const QuantGranularity &granularity) { return (granularity > QuantGranularity::QUANT_GRANULARITY_UNDEFINED) && - (granularity < QuantGranularity::QUANT_GRANULARITY_PER_TENSOR); + (granularity < QuantGranularity::QUANT_GRANULARITY_MAX); } #define COC_ARGS_FUN_IIO(T_INPUT1, T_INPUT2, T_OUTPUT) \ @@ -141,7 +140,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) GM_ADDR gm_dequant_scale, GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, \ GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, \ __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t *num_global_tokens_per_local_expert, \ - __gm__ int32_t *global_token_per_expert_matrix, GM_ADDR para_gm + __gm__ int32_t *global_tokens_per_expert_matrix, GM_ADDR para_gm #define COC_ARGS_FUN_IO(T_INPUT, T_OUTPUT) COC_ARGS_FUN_IIO(T_INPUT, T_INPUT, T_OUTPUT) @@ -159,7 +158,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) reinterpret_cast(gm_allgather_out), gm_workspace, gm_dequant_scale, 
gm_dequant_offset, \ gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - global_token_per_expert_matrix, para_gm + global_tokens_per_expert_matrix, para_gm #define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ GM_ADDR gm_a, GM_ADDR gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_c, \ @@ -173,16 +172,16 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) __gm__ int32_t *global_tokens_per_expert_matrix, int32_t local_expert_nums, int32_t EP, int32_t TP, \ int32_t maxOutputSize, int32_t is_moe, bool is_deterministic, int32_t buffer_size \ -#define PP_MATMUL_AIC_ARGS_FUN() \ +#define PP_MATMUL_AIC_ARGS_CALL() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), gm_bias, gm_c, gm_peer_mem, \ reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ reinterpret_cast(gm_dequant_offset), batch_size, m, k, n, m0, k0, n0, m_loop, k_loop, \ n_loop, core_loop, swizzl_count, swizzl_direct, rank, rank_size, p_value, withSerialMode, quant_granularity, \ dequant_granularity, ag_dim, rs_dim, inner_dim_is_Ag, weight_nz, is_91093, \ num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - global_tokens_per_expert_matrix, local_expert_nums, EP, TP, maxOutputSize, is_moe, is_deterministic, buffer_size + global_tokens_per_expert_matrix, local_expert_nums, EP, TP, maxOutputSize, is_moe, is_deterministic, buffer_size \ -#define PP_MATMUL_AIV_PADDING_ARGS_FUN() +#define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ @@ -190,7 +189,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) int32_t quant_group_size, int32_t weight_nz, int32_t is_moe, int32_t is_moe_averaged, int32_t is_alltoallvc, \ int32_t EP, int32_t TP, int32_t local_expert_nums, bool is_deterministic -#define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ +#define PP_MATMUL_AIV_PADDING_ARGS_CALL() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), \ reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ reinterpret_cast(gm_dequant_offset), reinterpret_cast(gm_quant_scale), \ @@ -204,7 +203,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL() \ reinterpret_cast(gm_bias), reinterpret_cast(gm_out), batch_size, m, n, rank_size -#define PP_MATMUL_AIV_POST_ARGS_CALL() +#define PP_MATMUL_AIV_POST_ARGS_CALL() \ reinterpret_cast(gm_out), reinterpret_cast(gm_bias), \ reinterpret_cast(gm_gamma), reinterpret_cast(para_gm) @@ -213,9 +212,9 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define TEMPLATE_ARGS_FUN() bool ALIGN = true, bool IS_INT8 = false, bool HAVE_BIAS = false, typename T = half -#define TEMPLATE_ARGS_FUN() ALIGN, IS_INT8, HAVE_BIAS, T +#define TEMPLATE_ARGS_CALL() ALIGN, IS_INT8, HAVE_BIAS, T -inlie __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t k, int32_t n, int32_t m_align, +inline __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, int32_t &aligned_a, int32_t &aligned_b) { if (!trans_a) { @@ -231,10 +230,10 @@ inlie __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t } inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t 
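/* GetBlockIdx maps the linear loop_idx onto (m_idx, n_idx) output-tile
 * coordinates in serpentine bands of swizzl_count rows (direction 0, "Zn") or
 * columns (direction 1, "Nz"), so consecutive iterations visit neighbouring
 * tiles and reuse operands. Worked example for direction 0 with m_loop = 4,
 * n_loop = 3, swizzl_count = 2:
 *   loop_idx 0..5  -> (0,0) (1,0) (0,1) (1,1) (0,2) (1,2)  band 0, left to right
 *   loop_idx 6..11 -> (2,2) (3,2) (2,1) (3,1) (2,0) (3,0)  band 1, right to left */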
n_loop, int32_t swizzl_direction, - int32_t swizzl_count, int32_t &m_idx, int32_t &n_idx) + int32_t swizzl_count, int64_t &m_idx, int64_t &n_idx) { uint32_t in_batch_idx = loop_idx % (m_loop * n_loop); - if (swizzl_direction == 0) { + if (swizzl_direction == 0) { // Zn uint32_t tile_block_loop = (m_loop + swizzl_count - 1) / swizzl_count; uint32_t tile_block_idx = in_batch_idx / (swizzl_count * n_loop); uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * n_loop); @@ -244,10 +243,10 @@ inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t n_l } m_idx = tile_block_idx * swizzl_count + in_tile_block_idx % n_row; n_idx = in_tile_block_idx / n_row; - if (tile_block_idx % 2 != 0) [ + if (tile_block_idx % 2 != 0) { n_idx = n_loop - n_idx - 1; - ] - } else if (swizzl_direction == 1) { + } + } else if (swizzl_direction == 1) { // Nz uint32_t tile_block_loop = (n_loop + swizzl_count - 1) / swizzl_count; uint32_t tile_block_idx = in_batch_idx / (swizzl_count * m_loop); uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * m_loop); @@ -272,10 +271,10 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 nBurst * 2, lenBurst * 2, gmGap, ubufGap); return; } - DataCopyParams dataCopyParams(nBurst, - (Block32B::Count(lenBurst)), - (Block32B::Count(gmGap)), - (ubufGap) + DataCopyParams dataCopyParams(nBurst, // blockCount + (Block32B::Count(lenBurst)), // blockLen + (Block32B::Count(gmGap)), // srcStride + (ubufGap) // dstStride ); DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); LocalTensor ubTensor; @@ -297,10 +296,10 @@ template FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, uint32_t gmGap, uint32_t ubufGap = 0) { - DataCopyParams dataCopyParams(nBurst, - static_cast(Block32B::Count(lenBurst)), - static_cast(ubufGap), - static_cast(Block32B::Count(gmGap)) + DataCopyParams dataCopyParams(nBurst, // blockCount + static_cast(Block32B::Count(lenBurst)), // blockLen + static_cast(ubufGap), // srcStride + static_cast(Block32B::Count(gmGap)) // dstStride ); DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); LocalTensor ubTensor; @@ -322,10 +321,10 @@ template FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyParams(nBurst, - lenBurst, - srcStride, - dstStride, + DataCopyExtParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride, // dstStride 0); LocalTensor ubTensor; TBuffAddr ubAddr; -- Gitee From d91e4f0f73e19167d2391a55c9a6f6496e221f58 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 09:23:39 +0800 Subject: [PATCH 299/414] 9 --- comm/lcal/src/kernels/coc_internal.cce | 94 +++++++++++++------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 0fb8c666..be5114e5 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -150,7 +150,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) gm_a, gm_b, gm_bias, gm_gamma, gm_out, gm_allgather_out, gm_workspace, gm_dequant_scale, gm_dequant_offset, \ gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - 
global_token_per_expert_matrix, para_gm + global_tokens_per_expert_matrix, para_gm #define COC_ARGS_CALL_INT8() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), reinterpret_cast(gm_bias), \ @@ -285,7 +285,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(src); if (Block32B::IsAligned(lenBurst) && Block32B::IsAligned(gmGap)) { - DataCopy(ubTensor, gmTensor, dataCopyAlignParams); + DataCopy(ubTensor, gmTensor, dataCopyParams); } else { DataCopyPadExtParams padParams; DataCopyPad(ubTensor, gmTensor, dataCopyAlignParams, padParams); @@ -293,7 +293,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 } template -FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, +FORCE_INLINE_AICORE void CopyToGmUbufAlign(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, uint32_t gmGap, uint32_t ubufGap = 0) { DataCopyParams dataCopyParams(nBurst, // blockCount @@ -301,18 +301,18 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __ubuf__ T *src, uin static_cast(ubufGap), // srcStride static_cast(Block32B::Count(gmGap)) // dstStride ); - DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); + DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), ubufGap, gmGap * sizeof(T), 0); LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); - ubAddr.bufferAddr = reinterpret_cast(dst); + ubAddr.bufferAddr = reinterpret_cast(src); ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(dst); if (Block32B::IsAligned(lenBurst) && Block32B::IsAligned(gmGap)) { - DataCopy(ubTensor, gmTensor, dataCopyAlignParams); + DataCopy(gmTensor, ubTensor, dataCopyParams); } else { - DataCopyPadExtParams padParams; + DataCopyPadParams padParams; DataCopyPad(gmTensor, ubTensor, dataCopyAlignParams); } } @@ -332,69 +332,69 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, ui ubAddr.bufferAddr = reinterpret_cast(dst); ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; - gmTensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t *>(dst)); - DataCopyPad(gmTensor, ubTensor, dataCopyParams); + gmTensor.SetGlobalBuffer(reinterpret_cast(src)); + DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); } template -FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, +FORCE_INLINE_AICORE void CopyToGmAlignB16(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyParams(nBurst, - lenBurst, - srcStride, - dstStride, - ); + DataCopyExtParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride, // dstStride + 0); LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); - ubAddr.bufferAddr = reinterpret_cast(dst); + ubAddr.bufferAddr = reinterpret_cast(src); ubTensor.SetAddr(ubAddr); - GlobalTensor gmTensor; - gmTensor.SetGlobalBuffer(src); - DataCopyPad(ubTensor, gmTensor, dataCopyParams); + GlobalTensor gmTensor; + gmTensor.SetGlobalBuffer(reinterpret_cast(dst)); + DataCopyPad(gmTensor, ubTensor, dataCopyParams); } template FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams 
dataCopyParams(nBurst, - lenBurst, - srcStride, - dstStride, + DataCopyParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride // dstStride ); - LocalTensor ubTensor; + LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); - ubAddr.bufferAddr = reinterpret_cast(src); + ubAddr.bufferAddr = reinterpret_cast(dst); ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; - gmTensor.SetGlobalBuffer(dst); - DataCopyPad(gmTensor, ubTensor, dataCopyParams); + gmTensor.SetGlobalBuffer(src); + DataCopy(ubTensor, gmTensor, dataCopyParams); } template -FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, +FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyParams(nBurst, - lenBurst, - srcStride, - dstStride, + DataCopyExtParams dataCopyExtParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride // dstStride ); - LocalTensor ubTensor; + LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); ubAddr.bufferAddr = reinterpret_cast(src); ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(dst); - DataCopyPad(gmTensor, ubTensor, dataCopyParams); + DataCopy(gmTensor, ubTensor, dataCopyParams); } template -FORCE_INLINE_AICORE void CopyUbufToGmUnknow(bool ALIGN, __gm__ T *dst, __ubuf__ T*src, uint16_t nBurst, +FORCE_INLINE_AICORE void CopyUbufToGmUnknown(bool ALIGN, __gm__ T *dst, __ubuf__ T*src, uint16_t nBurst, uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { if (ALIGN) { @@ -413,7 +413,7 @@ FORCE_INLINE_AICORE void VectorDup(__ubuf__ T *dst, const T &src, uint8_t repeat } template -struce CoCBuffAddrAndArgs { +struct CoCBuffAddrAndArgs { public: __aicore__ inline CoCBuffAddrAndArgs(COC_ARGS_FUN(T)) { @@ -426,30 +426,31 @@ public: extraFlag = commArgsGm.GetValue(4); RDMA = (extraFlag & ExtraFlag::RDMA) != 0; TOPO_910B2C = (extraFlag & ExtraFlag::TOPO_910B2C) != 0; - TOOP_910_93 = (extraFlag & ExtraFlag::TOPO_910_93) != 0; + TOPO_910_93 = (extraFlag & ExtraFlag::TOPO_910_93) != 0; DETERMINISTIC = (extraFlag & ExtraFlag::DETERMINISTIC) != 0; QUANT_FP16 = (extraFlag & ExtraFlag::QUANT_FP16) != 0; QUANT_FP32 = (extraFlag & ExtraFlag::QUANT_FP32) != 0; GlobalTensor<__gm__ T *> peerMemsAddrGm; - peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CoCCommArgs *>(coc_comm_args))->peerMems[0], + peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CoCCommArgs *>(coc_comm_args))->peerMems[0], LCAL_MAX_RANK_SIZE); for (int i = 0; i < rankSize; ++i) { buff[i] = peerMemsAddrGm.GetValue(i); } } - int rank; + int rank; // attr rank_id, global rank int localRank; - int rankSize; + int rankSize; // global rank size int localRankSize; - int extraFlag; + uint32_t extraFlag; bool RDMA; bool TOPO_910B2C; - bool TOOP_910_93; + bool TOPO_910_93; bool DETERMINISTIC; bool QUANT_FP16; bool QUANT_FP32; - __gm__ T *buff[LCAL_MAX_RANK_SIZE]; + __gm__ T *buff[LCAL_MAX_RANK_SIZE]; // list of shared-memory (IPC) buffer addresses, one per rank + //int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE]; }; FORCE_INLINE_AICORE void CommMatrixTrunc(__gm__ int32_t* global_tokens_per_expert_matrix, __gm__ int32_t* workspace, int32_t EP, int32_t local_expert_nums, int32_t maxOutputSize) @@ -464,9 +465,10 @@ FORCE_INLINE_AICORE void CommMatrixTrunc(__gm__ int32_t* global_tokens_per_exper >= maxOutputSize) { workspace[j *
expert_nums + expert_id] = maxOutputSize - sum_tokens; sum_tokens = maxOutputSize; - } else { - workspace[j * expert_nums + expert_id] = global_tokens_per_expert_matrix[j * expert_nums + expert_id]; - sum_tokens += global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + } else { + workspace[j * expert_nums + expert_id] = global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + sum_tokens += global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + } } } } -- Gitee From a7fe5673b53800c30f219064a0544049314f1b14 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 09:26:24 +0800 Subject: [PATCH 300/414] 7 --- comm/lcal/src/kernels/coc_internal.cce | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index be5114e5..de655b6d 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -293,7 +293,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 } template -FORCE_INLINE_AICORE void CopyToGmUbufAlign(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, +FORCE_INLINE_AICORE void CopyUbufToGmAlign(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, uint32_t gmGap, uint32_t ubufGap = 0) { DataCopyParams dataCopyParams(nBurst, // blockCount @@ -333,11 +333,12 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, ui ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(reinterpret_cast(src)); + DataCopyExtParams padParams; DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); } template -FORCE_INLINE_AICORE void CopyToGmAlignB16(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, +FORCE_INLINE_AICORE void CopyUbufToGmAlignB16(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { DataCopyExtParams dataCopyParams(nBurst, // blockCount @@ -378,7 +379,7 @@ template FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyExtParams(nBurst, // blockCount + DataCopyParams dataCopyParams(nBurst, // blockCount lenBurst, // blockLen srcStride, // srcStride dstStride // dstStride -- Gitee From de464fa1f3ede82ba9c4776e35270344e31e5d60 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 09:26:52 +0800 Subject: [PATCH 301/414] 2 --- comm/lcal/src/kernels/coc_internal.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index de655b6d..42f03b31 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -333,7 +333,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, ui ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(reinterpret_cast(src)); - DataCopyExtParams padParams; + DataCopyExtPadParams padParams; DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); } -- Gitee From 1c4bc8068b60c466cdec13af61e63ad28c423aa3 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 09:27:16 +0800 Subject: [PATCH 302/414] 5 --- comm/lcal/src/kernels/coc_internal.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 42f03b31..706ddbf8 100644 --- 
a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -333,7 +333,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, ui ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(reinterpret_cast(src)); - DataCopyExtPadParams padParams; + DataCopyPadExtParams padParams; DataCopyPad(ubTensor, gmTensor, dataCopyParams, padParams); } -- Gitee From 4fd8e2f9671884f10aba3cbfdb873eb237b6ff37 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 10:32:55 +0800 Subject: [PATCH 303/414] draft --- comm/lcal/src/tiling/tiling_args.cpp | 63 ++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 comm/lcal/src/tiling/tiling_args.cpp diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/comm/lcal/src/tiling/tiling_args.cpp new file mode 100644 index 00000000..ca59bc61 --- /dev/null +++ b/comm/lcal/src/tiling/tiling_args.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "tiling_args.h" + +namespace Lcal { + const char* CoCTilingData::ToString() + { + std::string str = + "[CoCTilingData]: \n m=" + std::to_string(m) + ", k=" + std::to_string(k) + ", n=" + std::to_string(n) + + ", batchSize=" + std::to_string(batchSize) + + ", \nblockDim=" + std::to_string(blockDim) + ", rank=" + std::to_string(rank) + + ", rankSize=" + std::to_string(rankSize) + ", tag=" + std::to_string(tag) + + ", \nmLoop=" + std::to_string(mLoop) + ", kLoop=" + std::to_string(kLoop) + + ", nLoop=" + std::to_string(nLoop) + ", coreLoop=" + std::to_string(coreLoop) + + ", tilingKey=" + std::to_string(tilingKey) + + ", m0=" + std::to_string(m0) + ", k0=" + std::to_string(k0) + ", n0=" + std::to_string(n0) + + ", swizzlCount=" + std::to_string(swizzlCount) + ", swizzlDirect=" + std::to_string(swizzlDirect) + + ", pValue=" + std::to_string(pValue) + ", ubMoveNum=" + std::to_string(ubMoveNum) + + ", commNpuSplit=" + std::to_string(commNpuSplit) + ", commDataSplit=" + std::to_string(commDataSplit) + + ", commDirect=" + std::to_string(commDirect) + ", lenPerLoop=" + std::to_string(lenPerLoop) + + ", extraUbMoveNum=" + std::to_string(extraUbMoveNum) + + ", extraCommNpuSplit=" + std::to_string(extraCommNpuSplit) + + ", extraCommDataSplit=" + std::to_string(extraCommDataSplit) + + ", extraCommDirect=" + std::to_string(extraCommDirect) + + ", extraLenPerLoop=" + std::to_string(extraLenPerLoop) + ", splitK=" + std::to_string(splitK) + + ", write2OtherRank=" + std::to_string(write2OtherRank) + + ", withSerialMode=" + std::to_string(withSerialMode) + ", is91093=" + std::to_string(is91093); + return str.data(); + } + + void CoCTilingData::SetDefaultValue() + { + m0 = m < n ? DEFAULT_COL : DEFAULT_ROW; + k0 = DEFAULT_COL; + n0 = m0 == DEFAULT_COL ? DEFAULT_ROW : DEFAULT_COL; + swizzlCount = DEFAULT_SWIZZL_COUNT; + swizzlDirect = m > n ? 
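/* default swizzle direction: tall problems (m > n) use row-band swizzling
 * (direction 0, "Zn" in GetBlockIdx), wide problems use column-band
 * swizzling (direction 1, "Nz"), matching the m0/n0 defaults chosen above
 * from the same m-versus-n comparison. */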
SWIZZLE_DIRECT_ZERO : SWIZZLE_DIRECT_ONE; + pValue = DEFAULT_P_VALUE; + ubMoveNum = MAX_UB_NUM; + commNpuSplit = COMMNPUSPLIT_ONE; + commDataSplit = rankSize; + commDirect = COMM_DATA_DIRECT; + lenPerLoop = LENPERLOOP_DEFAULT; + extraUbMoveNum = ubMoveNum; + extraCommNpuSplit = commNpuSplit; + extraCommDataSplit = commDataSplit; + extraCommDirect = commDirect; + extraLenPerLoop = lenPerLoop; + splitK = DEFAULT_SPLIT_K; + write2OtherRank = false; + withSerialMode = false; + is91093 = false; + tag = 0; + } +} -- Gitee From bac2d644a57a955cb73cf53073fc18308c820bcd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 10:36:29 +0800 Subject: [PATCH 304/414] draft --- comm/lcal/src/tiling/tiling_args.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/comm/lcal/src/tiling/tiling_args.cpp index ca59bc61..aa790e5b 100644 --- a/comm/lcal/src/tiling/tiling_args.cpp +++ b/comm/lcal/src/tiling/tiling_args.cpp @@ -11,28 +11,28 @@ #include "tiling_args.h" namespace Lcal { - const char* CoCTilingData::ToString() + const char* CoCTilingData::ToString() const { std::string str = - "[CoCTilingData]: \n m=" + std::to_string(m) + ", k=" + std::to_string(k) + ", n=" + std::to_string(n) + + "[CoCTilingData]: \nm=" + std::to_string(m) + ", k=" + std::to_string(k) + ", n=" + std::to_string(n) + ", batchSize=" + std::to_string(batchSize) + ", \nblockDim=" + std::to_string(blockDim) + ", rank=" + std::to_string(rank) + ", rankSize=" + std::to_string(rankSize) + ", tag=" + std::to_string(tag) + ", \nmLoop=" + std::to_string(mLoop) + ", kLoop=" + std::to_string(kLoop) + ", nLoop=" + std::to_string(nLoop) + ", coreLoop=" + std::to_string(coreLoop) + ", tilingKey=" + std::to_string(tilingKey) + - ", m0=" + std::to_string(m0) + ", k0=" + std::to_string(k0) + ", n0=" + std::to_string(n0) + + ", \nm0=" + std::to_string(m0) + ", k0=" + std::to_string(k0) + ", n0=" + std::to_string(n0) + ", swizzlCount=" + std::to_string(swizzlCount) + ", swizzlDirect=" + std::to_string(swizzlDirect) + ", pValue=" + std::to_string(pValue) + ", ubMoveNum=" + std::to_string(ubMoveNum) + ", commNpuSplit=" + std::to_string(commNpuSplit) + ", commDataSplit=" + std::to_string(commDataSplit) + ", commDirect=" + std::to_string(commDirect) + ", lenPerLoop=" + std::to_string(lenPerLoop) + - ", extraUbMoveNum=" + std::to_string(extraUbMoveNum) + + ", \nextraUbMoveNum=" + std::to_string(extraUbMoveNum) + ", extraCommNpuSplit=" + std::to_string(extraCommNpuSplit) + ", extraCommDataSplit=" + std::to_string(extraCommDataSplit) + ", extraCommDirect=" + std::to_string(extraCommDirect) + - ", extraLenPerLoop=" + std::to_string(extraLenPerLoop) + ", splitK=" + std::to_string(splitK) + + ", extraLenPerLoop=" + std::to_string(extraLenPerLoop) + ", \nsplitK=" + std::to_string(splitK) + ", write2OtherRank=" + std::to_string(write2OtherRank) + - ", withSerialMode=" + std::to_string(withSerialMode) + ", is91093=" + std::to_string(is91093); + ", withSerialMode=" + std::to_string(withSerialMode) + ", \nis_91093=" + std::to_string(is91093); return str.data(); } @@ -41,7 +41,7 @@ namespace Lcal { m0 = m < n ? DEFAULT_COL : DEFAULT_ROW; k0 = DEFAULT_COL; n0 = m0 == DEFAULT_COL ? DEFAULT_ROW : DEFAULT_COL; - swizzlCount = DEFAULT_SWIZZL_COUNT; + swizzlCount = DEFAULT_SWIZZLE_COUNT; swizzlDirect = m > n ? 
SWIZZLE_DIRECT_ZERO : SWIZZLE_DIRECT_ONE; pValue = DEFAULT_P_VALUE; ubMoveNum = MAX_UB_NUM; -- Gitee From 98b25f973de3a74216ef6e96c30779c3df533f15 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 10:37:24 +0800 Subject: [PATCH 305/414] draft --- comm/lcal/src/tiling/tiling_args.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/comm/lcal/src/tiling/tiling_args.cpp index aa790e5b..5b02b0da 100644 --- a/comm/lcal/src/tiling/tiling_args.cpp +++ b/comm/lcal/src/tiling/tiling_args.cpp @@ -21,7 +21,7 @@ namespace Lcal { ", \nmLoop=" + std::to_string(mLoop) + ", kLoop=" + std::to_string(kLoop) + ", nLoop=" + std::to_string(nLoop) + ", coreLoop=" + std::to_string(coreLoop) + ", tilingKey=" + std::to_string(tilingKey) + - ", \nm0=" + std::to_string(m0) + ", k0=" + std::to_string(k0) + ", n0=" + std::to_string(n0) + + ", \nm0=" + std::to_string(m0) + ", k0=" + std::to_string(k0) + ", n0=" + std::to_string(n0) + ", swizzlCount=" + std::to_string(swizzlCount) + ", swizzlDirect=" + std::to_string(swizzlDirect) + ", pValue=" + std::to_string(pValue) + ", ubMoveNum=" + std::to_string(ubMoveNum) + ", commNpuSplit=" + std::to_string(commNpuSplit) + ", commDataSplit=" + std::to_string(commDataSplit) + -- Gitee From 42b8516a7eb74c629231d37ddb040e291fc51072 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 10:44:01 +0800 Subject: [PATCH 306/414] draft --- comm/lcal/src/tiling/tiling.cpp | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 comm/lcal/src/tiling/tiling.cpp diff --git a/comm/lcal/src/tiling/tiling.cpp b/comm/lcal/src/tiling/tiling.cpp new file mode 100644 index 00000000..aee17345 --- /dev/null +++ b/comm/lcal/src/tiling/tiling.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "tiling_func.h" +#include "mki/utils/log/log.h" +#include "tiling.h" + +namespace Lcal { +CoCTilingData CoCTilingFunc::GenerateTiling(const TaskParam &taskParam, const CoCTiling &tiling) +{ + SetTilingInputParam(taskParam, tiling); + + cocTilingData.SetDefaultValue(); + + this->GetDefaultTiling(taskParam); + + SetTilingData(taskParam, tiling, cocTilingData); + + return cocTilingData; +} + +bool CoCTilingFunc::CheckTiling(const TaskParam &taskParam) +{ + return CheckCoCTilingData(cocTilingData); +} + +void CoCTilingFunc::GetDefaultTiling(const TaskParam &taskParam) +{ + cocTilingData.ubMoveNum = VALID_UB_MOVE_NUM; + cocTilingData.commNpuSplit = cocTilingData.ranksize; + cocTilingData.commDataSplit = COMMDATASPLIT_ONE; + cocTilingData.commDirect = COMM_DATA_DIRECT; + cocTilingData.lenPerLoop = LENPERLOOP_DEFAULT; +} +} \ No newline at end of file -- Gitee From 15e6c48fec23757d78bae619f0814cceb848f74e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 10:45:28 +0800 Subject: [PATCH 307/414] draft --- comm/lcal/src/tiling/tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/tiling.cpp b/comm/lcal/src/tiling/tiling.cpp index aee17345..9e439491 100644 --- a/comm/lcal/src/tiling/tiling.cpp +++ b/comm/lcal/src/tiling/tiling.cpp @@ -14,7 +14,7 @@ namespace Lcal { CoCTilingData CoCTilingFunc::GenerateTiling(const TaskParam &taskParam, const CoCTiling &tiling) { - SetTilingInputParam(taskParam, tiling); + SetTilingInputParam(taskParam, cocTilingData); cocTilingData.SetDefaultValue(); @@ -33,7 +33,7 @@ bool CoCTilingFunc::CheckTiling(const TaskParam &taskParam) void CoCTilingFunc::GetDefaultTiling(const TaskParam &taskParam) { cocTilingData.ubMoveNum = VALID_UB_MOVE_NUM; - cocTilingData.commNpuSplit = cocTilingData.ranksize; + cocTilingData.commNpuSplit = cocTilingData.rankSize; cocTilingData.commDataSplit = COMMDATASPLIT_ONE; cocTilingData.commDirect = COMM_DATA_DIRECT; cocTilingData.lenPerLoop = LENPERLOOP_DEFAULT; -- Gitee From e54bb2ea021677ff31c77fde8cd5fc711c296e63 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:11:54 +0800 Subject: [PATCH 308/414] draft --- comm/lcal/src/tiling/allreduce_tiling.cpp | 135 ++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 comm/lcal/src/tiling/allreduce_tiling.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling.cpp b/comm/lcal/src/tiling/allreduce_tiling.cpp new file mode 100644 index 00000000..4b819c9b --- /dev/null +++ b/comm/lcal/src/tiling/allreduce_tiling.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "tiling.h" +#include "lcoc_func.h" +#include "tiling_910B.h" +#include "tiling_91093.h" +#include "tiling_func.h" + +namespace Lcal { +const int ALLREDUCE_M_EDGE = 3172; +const int ALLREDUCE_N_EDGE = 3172; + +void CoCMatmulAllReduceTilingFunc::GetDefaultTiling(const TaskParam &taskParam) +{ + CoCTilingFunc::GetDefaultTiling(taskParam); + if (Is91093(taskParam.chipName)) { + if (cocTilingData.rankSize == RANKSIZE_EIGHT) { + AllReduceNPU91093EightRankFP16Tiling(cocTilingData); + return; + } else if (cocTilingData.rankSize == RANKSIZE_SIXTEEN) { + AllReduceNPU91093SixteenRankFP16Tiling(cocTilingData); + return; + } + } + if (cocTilingData.rankSize == RANKSIZE_FOUR) { + if (taskParam.cocParamDesc.mmInfo.isInt8) { + AllReduceFourRankInt8GetDefaultTiling(cocTilingData); + return; + } else { + AllReduceFourRankFP16GetDefaultTiling(cocTilingData); + return; + } + } else if (cocTilingData.rankSize == RANKSIZE_TWO) { + AllReduceTwoRankFP16Tiling(cocTilingData); + return; + } + AllReduceGetDefaultTiling(cocTilingData); +} + +void CoCMatmulAllReduceDeterTilingFunc::GetDefaultTiling(const TaskParam &taskParam) +{ + CoCTilingFunc::GetDefaultTiling(taskParam); + if (cocTilingData.rankSize == RANKSIZE_FOUR) { + if (taskParam.cocParamDesc.mmInfo.isInt8) { + AllReduceFourRankInt8GetDefaultTiling(cocTilingData); + return; + } else { + AllReduceFourRankFP16GetDefaultTiling(cocTilingData); + return; + } + } else { + if (taskParam.cocParamDesc.mmInfo.isInt8) { + AllReduceEightRankINT8GetDefaultTiling(cocTilingData); + return; + } else { + AllReduceEightRankFP16GetDefaultTiling(cocTilingData); + return; + } + } + if (cocTilingData.m * cocTilingData.n >= ALLREDUCE_M_EDGE * ALLREDUCE_N_EDGE) { + cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + cocTilingData.extraLenPerLoop = cocTilingData.lenPerLoop; + cocTilingData.extraUbMoveNum = cocTilingData.ubMoveNum; + } + if (cocTilingData.lenPerLoop > TREE_LEN_PER_LOOP) { + cocTilingData.lenPerLoop = TREE_LEN_PER_LOOP; + cocTilingData.ubMoveNum = TREE_LEN_PER_LOOP; + cocTilingData.extraLenPerLoop = cocTilingData.lenPerLoop; + cocTilingData.extraUbMoveNum = cocTilingData.ubMoveNum; + } +} + +bool CheckCMatrix(const TaskParam &taskParam, const CoCTilingData &data) +{ + constexpr int32_t BUFFER_UNIT = 1024; + if (data.withSerialMode != 0 && + data.batchSize * data.m * data.n >= (taskParam.bufferSize * BUFFER_UNIT * BUFFER_UNIT) + / INPUT_DTYPE / MAX_BLOCK_COUNT) { + std::string str = "The matrix c is too large to support serial. 
" + "withSerialMode: " + std::to_string(data.withSerialMode) + " " + ", batchSize: " + std::to_string(data.batchSize) + " " + ", m: " + std::to_string(data.m) + " " + ", n: " + std::to_string(data.n); + PrintErrorLog(taskParam.lcalType, str); + return false; + } + return true; +} + +bool CoCMatmulAllReduceTilingFunc::CheckTiling(const TaskParam &taskParam) +{ + if (CoCTilingFunc::CheckTiling(taskParam)) { + return false; + } + if (!CheckCMatrix(taskParam, cocTilingData)) { + return false; + } + + auto rankSize = cocTilingData.rankSize; + auto commNpuSplit = cocTilingData.commNpuSplit; + auto commDataSplit = cocTilingData.commDataSplit; + auto coreNum = cocTilingData.blockDim; + int32_t useCoreCount = commNpuSplit * commDataSplit; + + std::vector> paramCheckList = { + {"commNpuSplit * commNpuSplit", useCoreCount, rankSize, coreNum}, + {"commNpuSplit", commNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, rankSize} + }; + return CheckParamScopeList(paramCheckList); +} + +bool CoCMatmulAllReduceDeterTilingFunc::CheckTiling(const TaskParam &taskParam) +{ + if (!CoCMatmulAllReduceTilingFunc::CheckTiling(taskParam)) { + return false; + } + + auto commNpuSplit = cocTilingData.commNpuSplit; + if (commNpuSplit != 1) { + std::string str = "The prodect of commNpuSplit mult equal 1. commNpuSplit: " + std::to_string(commNpuSplit); + PrintErrorLog(taskParam.lcalType, str); + return false; + } + return true; +} +} \ No newline at end of file -- Gitee From a92dbc95c0c9cc730e7fc6faee057bc952a62235 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:14:09 +0800 Subject: [PATCH 309/414] draft --- comm/lcal/src/tiling/allreduce_tiling.cpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling.cpp b/comm/lcal/src/tiling/allreduce_tiling.cpp index 4b819c9b..ee472475 100644 --- a/comm/lcal/src/tiling/allreduce_tiling.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling.cpp @@ -14,8 +14,8 @@ #include "tiling_func.h" namespace Lcal { -const int ALLREDUCE_M_EDGE = 3172; -const int ALLREDUCE_N_EDGE = 3172; +const int ALLREDUCE_M_EDGE = 3072; +const int ALLREDUCE_N_EDGE = 3072; void CoCMatmulAllReduceTilingFunc::GetDefaultTiling(const TaskParam &taskParam) { @@ -50,18 +50,14 @@ void CoCMatmulAllReduceDeterTilingFunc::GetDefaultTiling(const TaskParam &taskPa if (cocTilingData.rankSize == RANKSIZE_FOUR) { if (taskParam.cocParamDesc.mmInfo.isInt8) { AllReduceFourRankInt8GetDefaultTiling(cocTilingData); - return; } else { AllReduceFourRankFP16GetDefaultTiling(cocTilingData); - return; } } else { if (taskParam.cocParamDesc.mmInfo.isInt8) { AllReduceEightRankINT8GetDefaultTiling(cocTilingData); - return; } else { AllReduceEightRankFP16GetDefaultTiling(cocTilingData); - return; } } if (cocTilingData.m * cocTilingData.n >= ALLREDUCE_M_EDGE * ALLREDUCE_N_EDGE) { @@ -86,9 +82,9 @@ bool CheckCMatrix(const TaskParam &taskParam, const CoCTilingData &data) data.batchSize * data.m * data.n >= (taskParam.bufferSize * BUFFER_UNIT * BUFFER_UNIT) / INPUT_DTYPE / MAX_BLOCK_COUNT) { std::string str = "The matrix c is too large to support serial. " - "withSerialMode: " + std::to_string(data.withSerialMode) + " " - ", batchSize: " + std::to_string(data.batchSize) + " " - ", m: " + std::to_string(data.m) + " " + "withSerialMode: " + std::to_string(data.withSerialMode) + "." + ", batchSize: " + std::to_string(data.batchSize) + "." + ", m: " + std::to_string(data.m) + "." 
", n: " + std::to_string(data.n); PrintErrorLog(taskParam.lcalType, str); return false; @@ -98,7 +94,7 @@ bool CheckCMatrix(const TaskParam &taskParam, const CoCTilingData &data) bool CoCMatmulAllReduceTilingFunc::CheckTiling(const TaskParam &taskParam) { - if (CoCTilingFunc::CheckTiling(taskParam)) { + if (!CoCTilingFunc::CheckTiling(taskParam)) { return false; } if (!CheckCMatrix(taskParam, cocTilingData)) { @@ -112,7 +108,7 @@ bool CoCMatmulAllReduceTilingFunc::CheckTiling(const TaskParam &taskParam) int32_t useCoreCount = commNpuSplit * commDataSplit; std::vector> paramCheckList = { - {"commNpuSplit * commNpuSplit", useCoreCount, rankSize, coreNum}, + {"commNpuSplit * commDataSplit", useCoreCount, rankSize, coreNum}, {"commNpuSplit", commNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, rankSize} }; return CheckParamScopeList(paramCheckList); @@ -126,7 +122,7 @@ bool CoCMatmulAllReduceDeterTilingFunc::CheckTiling(const TaskParam &taskParam) auto commNpuSplit = cocTilingData.commNpuSplit; if (commNpuSplit != 1) { - std::string str = "The prodect of commNpuSplit mult equal 1. commNpuSplit: " + std::to_string(commNpuSplit); + std::string str = "The product of commNpuSplit must equal 1. commNpuSplit: " + std::to_string(commNpuSplit); PrintErrorLog(taskParam.lcalType, str); return false; } -- Gitee From f29f9cd37f1bb7b2ca030d049ad73331b5beb52e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:15:03 +0800 Subject: [PATCH 310/414] draft --- comm/lcal/src/tiling/allreduce_tiling.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling.cpp b/comm/lcal/src/tiling/allreduce_tiling.cpp index ee472475..e4eff5eb 100644 --- a/comm/lcal/src/tiling/allreduce_tiling.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling.cpp @@ -82,9 +82,9 @@ bool CheckCMatrix(const TaskParam &taskParam, const CoCTilingData &data) data.batchSize * data.m * data.n >= (taskParam.bufferSize * BUFFER_UNIT * BUFFER_UNIT) / INPUT_DTYPE / MAX_BLOCK_COUNT) { std::string str = "The matrix c is too large to support serial. " - "withSerialMode: " + std::to_string(data.withSerialMode) + "." - ", batchSize: " + std::to_string(data.batchSize) + "." - ", m: " + std::to_string(data.m) + "." + "withSerialMode: " + std::to_string(data.withSerialMode) + + ", batchSize: " + std::to_string(data.batchSize) + + ", m: " + std::to_string(data.m) + ", n: " + std::to_string(data.n); PrintErrorLog(taskParam.lcalType, str); return false; -- Gitee From 53e53cabf402de07d63f231a5ea2bc4a93e8255e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:29:38 +0800 Subject: [PATCH 311/414] draft --- .../src/tiling/allreduce_tiling_91093.cpp | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 comm/lcal/src/tiling/allreduce_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp new file mode 100644 index 00000000..0ec04fef --- /dev/null +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "tiling_91093.h" +#include "tiling_func.h" + +namespace Lcal { + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFALUT = 160; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFALUT = 128; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFALUT = 14; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFALUT = 16; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFALUT = 14; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFALUT = 160; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFALUT = 128; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFALUT = 16; + + static std::map>> g_allreduce91093EightRankFP16CommdatasplitMap = { + }; + + static std::map>> g_allreduce91093EightRankFP16PvalueMap = { + }; + + static std::map>> g_allreduce91093EightRankFP16M0Map = { + }; + + static std::map>> g_allreduce91093EightRankFP16UbmovenumMap = { + }; + + static std::map>> g_allreduce91093SixteenRankFP16CommdatasplitMap = { + }; + + static std::map>> g_allreduce91093SixteenRankFP16M0Map = { + }; + + static std::map>> g_allreduce91093SixteenRankFP16UbmovenumMap = { + }; + + static std::map>> g_allreduce91093SixteenRankFP16PvalueMap = { + }; + + void AllReduceNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map tillingParamMap = { + {&cocTilingData.commDataSplit, + {ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFALUT, + g_allreduce91093EightRankFP16CommdatasplitMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFALUT, + g_allreduce91093EightRankFP16PvalueMap}}, + {&cocTilingData.m0, + {ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFALUT, + g_allreduce91093EightRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFALUT, + g_allreduce91093EightRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} + }; + SetTillingParam(cocTilingData, tillingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit == COMMDATASPLIT_ONE ? 
cocTilingData.rankSize : COMMNPUSPLIT_ONE; + SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map tillingParamMap = { + {&cocTilingData.commDataSplit, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFALUT, + g_allreduce91093SixteenRankFP16CommdatasplitMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFALUT, + g_allreduce91093SixteenRankFP16PvalueMap}}, + {&cocTilingData.m0, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFALUT, + g_allreduce91093SixteenRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFALUT, + g_allreduce91093SixteenRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} + }; + SetTillingParam(cocTilingData, tillingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit == COMMDATASPLIT_ONE ? cocTilingData.rankSize : COMMNPUSPLIT_ONE; + SetSecondCoreSplitTling(cocTilingData); + } +} \ No newline at end of file -- Gitee From bb2425757c88d2362f21b40d4913a7c8e8203738 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:31:31 +0800 Subject: [PATCH 312/414] draft --- .../src/tiling/allreduce_tiling_91093.cpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp index 0ec04fef..b5a17162 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -12,14 +12,14 @@ #include "tiling_func.h" namespace Lcal { - constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFALUT = 160; - constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFALUT = 128; - constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFALUT = 14; - constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFALUT = 16; - constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFALUT = 14; - constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFALUT = 160; - constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFALUT = 128; - constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFALUT = 16; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 160; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT = 160; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::map>> g_allreduce91093EightRankFP16CommdatasplitMap = { }; @@ -47,24 +47,24 @@ namespace Lcal { void AllReduceNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) { - std::map tillingParamMap = { + std::map tilingParamMap = { {&cocTilingData.commDataSplit, - {ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFALUT, + {ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, g_allreduce91093EightRankFP16CommdatasplitMap}}, {&cocTilingData.pValue, - {ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFALUT, + 
{ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, g_allreduce91093EightRankFP16PvalueMap}}, {&cocTilingData.m0, - {ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFALUT, + {ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFAULT, g_allreduce91093EightRankFP16M0Map}}, {&cocTilingData.ubMoveNum, - {ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFALUT, + {ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, g_allreduce91093EightRankFP16UbmovenumMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} }; - SetTillingParam(cocTilingData, tillingParamMap); + SettilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; cocTilingData.commNpuSplit = @@ -74,24 +74,24 @@ namespace Lcal { void AllReduceNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData) { - std::map tillingParamMap = { + std::map tilingParamMap = { {&cocTilingData.commDataSplit, - {ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFALUT, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT, g_allreduce91093SixteenRankFP16CommdatasplitMap}}, {&cocTilingData.pValue, - {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFALUT, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, g_allreduce91093SixteenRankFP16PvalueMap}}, {&cocTilingData.m0, - {ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFALUT, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, g_allreduce91093SixteenRankFP16M0Map}}, {&cocTilingData.ubMoveNum, - {ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFALUT, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, g_allreduce91093SixteenRankFP16UbmovenumMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} }; - SetTillingParam(cocTilingData, tillingParamMap); + SettilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; cocTilingData.commNpuSplit = -- Gitee From d6f71ba1c284ad2970f021cb0341561db6090da0 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:33:00 +0800 Subject: [PATCH 313/414] draft --- .../src/tiling/allreduce_tiling_91093.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp index b5a17162..bdc9be9b 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -50,21 +50,21 @@ namespace Lcal { std::map tilingParamMap = { {&cocTilingData.commDataSplit, {ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, - g_allreduce91093EightRankFP16CommdatasplitMap}}, + g_allreduce91093EightRankFP16CommdatasplitMap}}, {&cocTilingData.pValue, {ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, - g_allreduce91093EightRankFP16PvalueMap}}, + g_allreduce91093EightRankFP16PvalueMap}}, {&cocTilingData.m0, {ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFAULT, - g_allreduce91093EightRankFP16M0Map}}, + g_allreduce91093EightRankFP16M0Map}}, {&cocTilingData.ubMoveNum, {ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, - g_allreduce91093EightRankFP16UbmovenumMap}}, + g_allreduce91093EightRankFP16UbmovenumMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} }; - SettilingParam(cocTilingData, tilingParamMap); + SetTilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = 
cocTilingData.ubMoveNum; cocTilingData.commNpuSplit = @@ -77,21 +77,21 @@ namespace Lcal { std::map tilingParamMap = { {&cocTilingData.commDataSplit, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT, - g_allreduce91093SixteenRankFP16CommdatasplitMap}}, - {&cocTilingData.pValue, - {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, - g_allreduce91093SixteenRankFP16PvalueMap}}, + g_allreduce91093SixteenRankFP16CommdatasplitMap}}, {&cocTilingData.m0, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, - g_allreduce91093SixteenRankFP16M0Map}}, + g_allreduce91093SixteenRankFP16M0Map}}, {&cocTilingData.ubMoveNum, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, - g_allreduce91093SixteenRankFP16UbmovenumMap}}, + g_allreduce91093SixteenRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, + g_allreduce91093SixteenRankFP16PvalueMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} }; - SettilingParam(cocTilingData, tilingParamMap); + SetTilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; cocTilingData.commNpuSplit = -- Gitee From 8929b76517a7180409e29ed7ad9b13c57a3e3c39 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:34:51 +0800 Subject: [PATCH 314/414] draft --- .../lcal/src/tiling/allreduce_tiling_91093.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp index bdc9be9b..9ab49c7b 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -50,16 +50,16 @@ namespace Lcal { std::map tilingParamMap = { {&cocTilingData.commDataSplit, {ALLREDUCE_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, - g_allreduce91093EightRankFP16CommdatasplitMap}}, + g_allreduce91093EightRankFP16CommdatasplitMap}}, {&cocTilingData.pValue, {ALLREDUCE_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, - g_allreduce91093EightRankFP16PvalueMap}}, + g_allreduce91093EightRankFP16PvalueMap}}, {&cocTilingData.m0, {ALLREDUCE_91093_EIGHT_RANK_FP16_M0_DEFAULT, - g_allreduce91093EightRankFP16M0Map}}, + g_allreduce91093EightRankFP16M0Map}}, {&cocTilingData.ubMoveNum, {ALLREDUCE_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, - g_allreduce91093EightRankFP16UbmovenumMap}}, + g_allreduce91093EightRankFP16UbmovenumMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} @@ -77,17 +77,17 @@ namespace Lcal { std::map tilingParamMap = { {&cocTilingData.commDataSplit, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT, - g_allreduce91093SixteenRankFP16CommdatasplitMap}}, + g_allreduce91093SixteenRankFP16CommdatasplitMap}}, {&cocTilingData.m0, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, - g_allreduce91093SixteenRankFP16M0Map}}, + g_allreduce91093SixteenRankFP16M0Map}}, {&cocTilingData.ubMoveNum, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, - g_allreduce91093SixteenRankFP16UbmovenumMap}}, + g_allreduce91093SixteenRankFP16UbmovenumMap}}, {&cocTilingData.pValue, {ALLREDUCE_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, - g_allreduce91093SixteenRankFP16PvalueMap}}, - {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + g_allreduce91093SixteenRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ZERO}}, 
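// (assumption -- the series never spells it out: swizzlDirect appears to pick
// the traversal direction of the output-block swizzle across cores, so the
// sixteen-rank tiling now walks the opposite direction, SWIZZLE_DIRECT_ZERO,
// from the eight-rank default SWIZZLE_DIRECT_ONE)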
{&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}} }; -- Gitee From 1149a0119b08081f475b8699b941489c9906979e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:52:43 +0800 Subject: [PATCH 315/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 comm/lcal/src/tiling/allreduce_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp new file mode 100644 index 00000000..d9f369a6 --- /dev/null +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "tiling_91093.h" +#include "tiling_func.h" + +namespace Lcal { + const int32_t ALLREDUCE_SERIAL_MODE_K_SIZE = 8192; + const int32_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 *12; + + constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_TWO_RANK_INT8_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_TWO_RANK_INT8_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_TWO_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_TWO_RANK_INT8_M0_DEFAULT = 128; + + static std::vector g_allreduceUbmovenumCoef = { + }; + + static std::vector g_allreducePvalueCoef = { + }; + + static std::map>> g_allreduceFourRankInT8M0Map = { + }; + + static std::map>> g_allreduceFourRankInT8DatasplitMap = { + }; + + static std::map>> g_allreduceFourRankInT8PvalueMap = { + }; + + static std::map>> g_allreduceFourRankInT8UbmovenumMap = { + }; + + static std::map>> g_allreduceFourRankFP16M0Map = { + }; + + static std::map>> 
g_allreduceFourRankFP16DatasplitMap = { + }; + + static std::map>> g_allreduceFourRankFP16PvalueMap = { + }; + + static std::map>> g_allreduceFourRankFP16UbmovenumMap = { + }; + + static std::map>> g_allreduceEightRankFP16M0Map = { + }; + + static std::map>> g_allreduceEightRankFP16DatasplitMap = { + }; + + static std::map>> g_allreduceEightRankFP16PvalueMap = { + }; + + static std::map>> g_allreduceEightRankFP16UbmovenumMap = { + }; + + static std::map>> g_allreduceEightRankInT8M0Map = { + }; + + static std::map>> g_allreduceEightRankInT8DatasplitMap = { + }; + + static std::map>> g_allreduceEightRankInT8PvalueMap = { + }; + + static std::map>> g_allreduceEightRankInT8UbmovenumMap = { + }; + + static std::map>> g_allreduceTwoRankFP16M0Map = { + }; + + static std::map>> g_allreduceTwoRankFP16CommdatasplitMap = { + }; + + static std::map>> g_allreduceTwoRankFP16UbmovenumMap = { + }; + + static std::map>> g_allreduceTwoRankFP16SwizzldirectMap = { + }; + + static std::map>> g_allreduceTwoRankFP16SwizzlcountMap = { + }; + + static std::map>> g_allreduceTwoRankFP16M0Map = { + }; + + static std::map>> g_allreduceTwoRankFP16PvalueMap = { + }; + + +} \ No newline at end of file -- Gitee From afc350e3dc9637aa30e4804b1f6493870dad4153 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:53:56 +0800 Subject: [PATCH 316/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index d9f369a6..3f2c9ef7 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -19,9 +19,9 @@ namespace Lcal { constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_UBMOVENUM_DEFAULT = 30; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_DATASPLIT_DEFAULT = 32; - constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_M0_DEFAULT = 128; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT = 8; -- Gitee From e22d20ee172166ad1e1338919a4642107fa3b8fe Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:55:15 +0800 Subject: [PATCH 317/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 3f2c9ef7..e2ce66ab 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -19,17 +19,17 @@ namespace Lcal { constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_UBMOVENUM_DEFAULT = 30; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 40; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_M0_DEFAULT = 128; - 
constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 32; - constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT = 128; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; -- Gitee From 6b7ad83f6b4a7456c8eceffeb03ec10ea4085c48 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:56:32 +0800 Subject: [PATCH 318/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index e2ce66ab..42b0d7bd 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -31,14 +31,12 @@ namespace Lcal { constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT = 128; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 30; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_TWO_RANK_INT8_DATASPLIT_DEFAULT = 32; - constexpr int32_t ALLREDUCE_TWO_RANK_INT8_PVALUE_DEFAULT = 8; - constexpr int32_t ALLREDUCE_TWO_RANK_INT8_UBMOVENUM_DEFAULT = 30; - constexpr int32_t ALLREDUCE_TWO_RANK_INT8_M0_DEFAULT = 128; static std::vector g_allreduceUbmovenumCoef = { }; -- Gitee From 0d26931573246878192fe73ef2e5e59580a53f1a Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 11:59:04 +0800 Subject: [PATCH 319/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 42b0d7bd..dd140957 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -23,20 +23,20 @@ namespace Lcal { constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT = 8; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_FOUR_RANK_INT8_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT = 8; - constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 30; - constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 100; + constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT = 16; constexpr int32_t ALLREDUCE_EIGHT_RANK_FP16_M0_DEFAULT = 
128; - constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_UBMOVENUM_DEFAULT = 30; - constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 8; - constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 32; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_UBMOVENUM_DEFAULT = 100; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT = 14; + constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT = 8; constexpr int32_t ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; - constexpr int32_t ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT = 128; - constexpr int32_t ALLREDUCE_TWO_RANK_FP16_DATASPLIT_DEFAULT = 32; - constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 8; - constexpr int32_t ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT = 6; constexpr int32_t ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_SWIZZLCOUNT_DEFAULT = 8; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_SWIZZLDIRECT_DEFAULT = 0; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 6; + constexpr int32_t ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::vector g_allreduceUbmovenumCoef = { }; -- Gitee From fba6216dea44f646f87266d1b034e274b565ade5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 12:00:07 +0800 Subject: [PATCH 320/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index dd140957..076857fd 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -8,12 +8,13 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ #include -#include "tiling_91093.h" +#include "tiling_910B.h" #include "tiling_func.h" +#include "lcal_types.h" namespace Lcal { const int32_t ALLREDUCE_SERIAL_MODE_K_SIZE = 8192; - const int32_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 *12; + const int64_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 *12; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT = 8; -- Gitee From f0298a2d16521e63b13eb7e88c8d09e107e2346e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 12:02:12 +0800 Subject: [PATCH 321/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 076857fd..ffda4805 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -60,13 +60,13 @@ namespace Lcal { static std::map>> g_allreduceFourRankFP16M0Map = { }; - static std::map>> g_allreduceFourRankFP16DatasplitMap = { + static std::map>> g_allreduceFourRankFP16UbmovenumMap = { }; static std::map>> g_allreduceFourRankFP16PvalueMap = { }; - static std::map>> g_allreduceFourRankFP16UbmovenumMap = { + static std::map>> g_allreduceFourRankFP16DatasplitMap = { }; static std::map>> g_allreduceEightRankFP16M0Map = { @@ -75,10 +75,10 @@ namespace Lcal { static std::map>> g_allreduceEightRankFP16DatasplitMap = { }; - static std::map>> g_allreduceEightRankFP16PvalueMap = { + static std::map>> g_allreduceEightRankFP16UbmovenumMap = { }; - static std::map>> g_allreduceEightRankFP16UbmovenumMap = { + static std::map>> g_allreduceEightRankFP16PvalueMap = { }; static std::map>> g_allreduceEightRankInT8M0Map = { @@ -93,9 +93,6 @@ namespace Lcal { static std::map>> g_allreduceEightRankInT8UbmovenumMap = { }; - static std::map>> g_allreduceTwoRankFP16M0Map = { - }; - static std::map>> g_allreduceTwoRankFP16CommdatasplitMap = { }; -- Gitee From 5d8318e8e08473d3cc5806e7ef90e6e6c276fb32 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 14:07:08 +0800 Subject: [PATCH 322/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 70 ++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index ffda4805..aeae22e2 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -14,7 +14,7 @@ namespace Lcal { const int32_t ALLREDUCE_SERIAL_MODE_K_SIZE = 8192; - const int64_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 *12; + const int64_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 * 12; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_DATASPLIT_DEFAULT = 32; constexpr int32_t ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT = 8; @@ -111,5 +111,71 @@ namespace Lcal { static std::map>> g_allreduceTwoRankFP16PvalueMap = { }; - + int32_t AllReduceUbMoveNum(int m, int k, int n) + { + double commPredict = 1.0 * (m / ONE_K) * (n / ONE_K) * (SECOND_TO_MS / ONE_K) / 40; + double cubePredict = DOUBLE * m * k / B1_FLOP_PER_MS * n; + double mknGB = (m / ONE_K) * (k / ONE_K) * (n / ONE_K); + double mteTimePredict1 = GetMTETime(mknGB, DEFAULT_ROW, DEFAULT_COL); + double mteTimePredict2 = GetMTETime(mknGB, DEFAULT_COL, DEFAULT_ROW); + double mteTimePredict = std::min(mteTimePredict1, mteTimePredict2); + double matmulPredict = std::max(cubePredict, mteTimePredict); + 
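// Cost-model sketch: commPredict estimates the all-reduce time from the m x n
// output volume, cubePredict the matmul FLOP time, and the two GetMTETime
// calls the data-move time for the two m0/n0 orientations; matmulPredict is
// the slower of compute and move. The c0..c14 terms below are shape features
// (ratios, roots and products of m, k, n) feeding a 22-coefficient linear
// model, g_allreduceUbmovenumCoef -- still empty in this draft -- whose
// fitted value is scaled by HALF_KBYTE and clamped to
// [MIN_UB_MOVE_NUM, MAX_UB_NUM].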
double c0 = matmulPredict / commPredict; + double c1 = 1.0 * m * n / k; + double c2 = sqrt(c1); + double c3 = sqrt(1.0 * m * n) / k; + double c4 = c3 * c3; + double c5 = matmulPredict; + double c6 = commPredict; + double c7 = 1.0 * n / m; + double c8 = 1.0 * m * n / sqrt(k); + double c9 = 1.0 * m * n * sqrt(k); + double c10 = sqrt(1.0 * m * n) * k; + double c11 = sqrt(1.0 * m * n * k); + double c12 = sqrt(1.0 * m * n); + double c13 = 1.0 * k * k / sqrt(1.0 * m * n); + double c14 = 1.0 * k * k * sqrt(1.0 * m * n); + double ubMoveNumDouble = 0; + std::vector featsUpdate = { c0, c1, c2, c3, c4, c5, c6, c7, 1.0 / c0, 1.0 / c1, 1.0 / c2, 1.0 / c3, + 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; + for (uint32_t i = 0; i < featsUpdate.size(); i++) { + ubMoveNumDouble += featsUpdate[i] * g_allreduceUbmovenumCoef[i]; + } + + return std::min(std::max(static_cast(ubMoveNumDouble) * HALF_KBYTE, MIN_UB_MOVE_NUM), MAX_UB_NUM); + } + + int32_t AllReducePValue(int m, int k, int n) + { + double commPredict = 1.0 * (m / ONE_K) * (n / ONE_K) * (SECOND_TO_MS / ONE_K) / 40; + double cubePredict = DOUBLE * m * k / B1_FLOP_PER_MS * n; + double mknGB = (m / ONE_K) * (k / ONE_K) * (n / ONE_K); + double mteTimePredict1 = GetMTETime(mknGB, DEFAULT_ROW, DEFAULT_COL); + double mteTimePredict2 = GetMTETime(mknGB, DEFAULT_COL, DEFAULT_ROW); + double mteTimePredict = std::min(mteTimePredict1, mteTimePredict2); + double matmulPredict = std::max(cubePredict, mteTimePredict); + double c0 = matmulPredict / commPredict; + double c1 = 1.0 * m * n / k; + double c2 = sqrt(c1); + double c3 = sqrt(1.0 * m * n) / k; + double c4 = c3 * c3; + double c5 = matmulPredict; + double c6 = commPredict; + double c7 = 1.0 * n / m; + double c8 = 1.0 * m * n / sqrt(k); + double c9 = 1.0 * m * n * sqrt(k); + double c10 = sqrt(1.0 * m * n) * k; + double c11 = sqrt(1.0 * m * n * k); + double c12 = sqrt(1.0 * m * n); + double c13 = 1.0 * k * k / sqrt(1.0 * m * n); + double c14 = 1.0 * k * k * sqrt(1.0 * m * n); + double pValueDouble = 0; + std::vector featsUpdate = { c0, c1, c2, c3, c4, c5, c6, c7, 1.0 / c0, 1.0 / c1, 1.0 / c2, 1.0 / c3, + 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; + for (uint32_t i = 0; i < featsUpdate.size(); i++) { + pValueDouble += featsUpdate[i] * g_allreducePvalueCoef[i]; + } + + return std::min(std::max(static_cast(pValueDouble), 1), MAX_P_VALUE); + } } \ No newline at end of file -- Gitee From c8d44854d918df358bda1c677242bd5dfc3a6b29 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 14:08:04 +0800 Subject: [PATCH 323/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index aeae22e2..45ba68cf 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -137,7 +137,7 @@ namespace Lcal { double c14 = 1.0 * k * k * sqrt(1.0 * m * n); double ubMoveNumDouble = 0; std::vector featsUpdate = { c0, c1, c2, c3, c4, c5, c6, c7, 1.0 / c0, 1.0 / c1, 1.0 / c2, 1.0 / c3, - 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; + 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; for (uint32_t i = 0; i < featsUpdate.size(); i++) { ubMoveNumDouble += featsUpdate[i] * g_allreduceUbmovenumCoef[i]; } @@ -171,7 +171,7 @@ namespace Lcal { double c14 = 1.0 * k * k * sqrt(1.0 * m * n); double pValueDouble = 0; std::vector featsUpdate = { c0, c1, c2, c3, c4, c5, 
c6, c7, 1.0 / c0, 1.0 / c1, 1.0 / c2, 1.0 / c3, - 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; + 1.0 / c4, c8, c9, c10, c11, c12, c13, 1.0 / c13, c14, 1 }; for (uint32_t i = 0; i < featsUpdate.size(); i++) { pValueDouble += featsUpdate[i] * g_allreducePvalueCoef[i]; } -- Gitee From 896767b9a088093c1de7a56e0df920dd3f11ad55 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 15:02:46 +0800 Subject: [PATCH 324/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 45ba68cf..d26e98e4 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -178,4 +178,192 @@ namespace Lcal { return std::min(std::max(static_cast(pValueDouble), 1), MAX_P_VALUE); } + + void AllReduceSetWithSerialMode(CoCTilingData &cocTilingData) + { + int32_t m = cocTilingData.m; + int32_t k = cocTilingData.k; + int32_t n = cocTilingData.n; + + int64_t batchSize = cocTilingData.batchSize; + int64_t commSize = static_cast(batchSize) * m * n; + if (commSize <= ALLREDUCE_SERIAL_MODE_MN_SIZE && k <= ALLREDUCE_SERIAL_MODE_K_SIZE) { + cocTilingData.withSerialMode = 1; + cocTilingData.ubMoveNum = MAX_UB_NUM; + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } else { + cocTilingData.withSerialMode = 0; + } + } + + void AllReduceGetDefaultTiling(CoCTilingData &cocTilingData) + { + int64_t batchSize = cocTilingData.batchSize; + int32_t m = cocTilingData.m; + int32_t k = cocTilingData.k; + int32_t n = cocTilingData.n; + + cocTilingData.swizzlDirect = SWIZZLE_DIRECT_ONE; + cocTilingData.ubMoveNum = AllReduceUbMoveNum(m, k, n); + cocTilingData.pValue = AllReducePValue(m, k, n); + + int64_t cubeSize = static_cast(batchSize) * m * k * n; + int64_t commSize = static_cast(batchSize) * m * n; + constexpr int32_t bufferUnit = 1024; + if ((cubeSize <= MATMUL_BASE_100US && + commSize < (cocTilingData.bufferSize * bufferUnit * bufferUnit) / INPUT_DTYPE / MAX_BLOCK_COUNT) || + commSize <= ALLREDUCE_BASE_100US) { + cocTilingData.withSerialMode = 1; + cocTilingData.ubMoveNum = MAX_UB_NUM; + } else { + cocTilingData.withSerialMode = 0; + } + cocTilingData.commDataSplit = COMM_DATA_DIRECT; + cocTilingData.commNpuSplit = cocTilingData.rankSize; + cocTilingData.commDataSplit = COMMDATASPLIT_ONE; + cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceFourRankInt8GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLREDUCE_FOUR_RANK_INT8_M0_DEFAULT, + g_allreduceFourRankInT8M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_FOUR_RANK_INT8_UBMOVENUM_DEFAULT, + g_allreduceFourRankInT8UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_FOUR_RANK_INT8_PVALUE_DEFAULT, + g_allreduceFourRankInT8PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.lenPerLoop = 
ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + + AllReduceSetWithSerialMode(cocTilingData); + SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceFourRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLREDUCE_FOUR_RANK_FP16_M0_DEFAULT, + g_allreduceFourRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_FOUR_RANK_FP16_UBMOVENUM_DEFAULT, + g_allreduceFourRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_FOUR_RANK_FP16_PVALUE_DEFAULT, + g_allreduceFourRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + + AllReduceSetWithSerialMode(cocTilingData); + SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLREDUCE_EIGHT_RANK_FP16_M0_DEFAULT, + g_allreduceEightRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_allreduceEightRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_allreduceEightRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + + AllReduceSetWithSerialMode(cocTilingData); + SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceEightRankINT8GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT, + g_allreduceEightRankInT8M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_EIGHT_RANK_INT8_UBMOVENUM_DEFAULT, + g_allreduceEightRankInT8UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT, + g_allreduceEightRankInT8PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + + AllReduceSetWithSerialMode(cocTilingData); + 
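// Each per-rank/per-dtype helper above follows the same pattern: SetTilingParam
// resolves every field from a shape-keyed override map (all still empty in this
// draft) with a *_DEFAULT fallback, lenPerLoop/ubMoveNum are derived from the
// result, and the serial-mode switch plus second-core split are applied last.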
SetSecondCoreSplitTling(cocTilingData); + } + + void AllReduceTwoRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT, + g_allreduceTwoRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT, + g_allreduceTwoRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT, + g_allreduceTwoRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); + cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + + AllReduceSetWithSerialMode(cocTilingData); + SetSecondCoreSplitTling(cocTilingData); + } } \ No newline at end of file -- Gitee From 5d81f1a71c7a39fc601b205234882196296c7b3e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 15:14:51 +0800 Subject: [PATCH 325/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index d26e98e4..27b22d7e 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -195,7 +195,7 @@ namespace Lcal { cocTilingData.withSerialMode = 0; } } - + void AllReduceGetDefaultTiling(CoCTilingData &cocTilingData) { int64_t batchSize = cocTilingData.batchSize; @@ -218,7 +218,7 @@ namespace Lcal { } else { cocTilingData.withSerialMode = 0; } - cocTilingData.commDataSplit = COMM_DATA_DIRECT; + cocTilingData.commDirect = COMM_DATA_DIRECT; cocTilingData.commNpuSplit = cocTilingData.rankSize; cocTilingData.commDataSplit = COMMDATASPLIT_ONE; cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; @@ -285,6 +285,7 @@ namespace Lcal { void AllReduceEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) { + int dataSplit = 0; std::map tilingParamMap = { {&cocTilingData.m0, {ALLREDUCE_EIGHT_RANK_FP16_M0_DEFAULT, @@ -295,6 +296,9 @@ namespace Lcal { {&cocTilingData.pValue, {ALLREDUCE_EIGHT_RANK_FP16_PVALUE_DEFAULT, g_allreduceEightRankFP16PvalueMap}}, + {&dataSplit, + {ALLREDUCE_EIGHT_RANK_FP16_DATASPLIT_DEFAULT, + g_allreduceEightRankFP16DatasplitMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, @@ -303,9 +307,11 @@ namespace Lcal { }; SetTilingParam(cocTilingData, tilingParamMap); - cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / dataSplit; cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); - cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + cocTilingData.lenPerLoop = 
std::min(cocTilingData.lenPerLoop, TREE_LEN_PER_LOOP); AllReduceSetWithSerialMode(cocTilingData); SetSecondCoreSplitTling(cocTilingData); @@ -313,6 +319,7 @@ namespace Lcal { void AllReduceEightRankINT8GetDefaultTiling(CoCTilingData &cocTilingData) { + int dataSplit = 0; std::map tilingParamMap = { {&cocTilingData.m0, {ALLREDUCE_EIGHT_RANK_INT8_M0_DEFAULT, @@ -323,6 +330,9 @@ namespace Lcal { {&cocTilingData.pValue, {ALLREDUCE_EIGHT_RANK_INT8_PVALUE_DEFAULT, g_allreduceEightRankInT8PvalueMap}}, + {&dataSplit, + {ALLREDUCE_EIGHT_RANK_INT8_DATASPLIT_DEFAULT, + g_allreduceEightRankInT8DatasplitMap}}, {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, @@ -331,9 +341,11 @@ namespace Lcal { }; SetTilingParam(cocTilingData, tilingParamMap); - cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; + cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / dataSplit; cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); - cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + cocTilingData.lenPerLoop = std::min(cocTilingData.lenPerLoop, TREE_LEN_PER_LOOP); AllReduceSetWithSerialMode(cocTilingData); SetSecondCoreSplitTling(cocTilingData); @@ -342,12 +354,21 @@ namespace Lcal { void AllReduceTwoRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) { std::map tilingParamMap = { - {&cocTilingData.m0, - {ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT, - g_allreduceTwoRankFP16M0Map}}, + {&cocTilingData.commDataSplit, + {ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allreduceTwoRankFP16CommdatasplitMap}}, {&cocTilingData.ubMoveNum, {ALLREDUCE_TWO_RANK_FP16_UBMOVENUM_DEFAULT, g_allreduceTwoRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, + {ALLREDUCE_TWO_RANK_FP16_SWIZZLDIRECT_DEFAULT, + g_allreduceTwoRankFP16SwizzldirectMap}}, + {&cocTilingData.swizzlCount, + {ALLREDUCE_TWO_RANK_FP16_SWIZZLCOUNT_DEFAULT + g_allreduceTwoRankFP16SwizzlcountMap}}, + {&cocTilingData.m0, + {ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT, + g_allreduceTwoRankFP16M0Map}}, {&cocTilingData.pValue, {ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT, g_allreduceTwoRankFP16PvalueMap}}, -- Gitee From 070078dcb7e85022ab337936ca78f41c266fd229 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 15:17:34 +0800 Subject: [PATCH 326/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 27b22d7e..ed8f333c 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -308,7 +308,7 @@ namespace Lcal { SetTilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; - cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit; cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / dataSplit; cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); 
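// Editor's note - a worked example of the reworked formula above (illustrative
// values, not taken from the source): assuming m0 = 128, n0 = 256, pValue = 2,
// blockDim = 20, rankSize = 8, commDataSplit = 16, and dataSplit = 1,
//     lenPerLoop = 128 * 256 * 2 * 20 = 1310720,
//     lenPerLoop = 1310720 / 8 / 16 / 1 = 10240,
// RoundNum(10240, HALF_KBYTE) keeps 10240, and the std::min below caps the
// result at TREE_LEN_PER_LOOP (20480), bounding how much data each
// communication loop stages.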
cocTilingData.lenPerLoop = std::min(cocTilingData.lenPerLoop, TREE_LEN_PER_LOOP); @@ -342,7 +342,7 @@ namespace Lcal { SetTilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 * cocTilingData.pValue * cocTilingData.blockDim; - cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / cocTilingData.rankSize / cocTilingData.commDataSplit; cocTilingData.lenPerLoop = cocTilingData.lenPerLoop / dataSplit; cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); cocTilingData.lenPerLoop = std::min(cocTilingData.lenPerLoop, TREE_LEN_PER_LOOP); @@ -351,7 +351,7 @@ namespace Lcal { SetSecondCoreSplitTling(cocTilingData); } - void AllReduceTwoRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + void AllReduceTwoRankFP16Tiling(CoCTilingData &cocTilingData) { std::map tilingParamMap = { {&cocTilingData.commDataSplit, @@ -364,7 +364,7 @@ namespace Lcal { {ALLREDUCE_TWO_RANK_FP16_SWIZZLDIRECT_DEFAULT, g_allreduceTwoRankFP16SwizzldirectMap}}, {&cocTilingData.swizzlCount, - {ALLREDUCE_TWO_RANK_FP16_SWIZZLCOUNT_DEFAULT + {ALLREDUCE_TWO_RANK_FP16_SWIZZLCOUNT_DEFAULT, g_allreduceTwoRankFP16SwizzlcountMap}}, {&cocTilingData.m0, {ALLREDUCE_TWO_RANK_FP16_M0_DEFAULT, @@ -372,19 +372,13 @@ namespace Lcal { {&cocTilingData.pValue, {ALLREDUCE_TWO_RANK_FP16_PVALUE_DEFAULT, g_allreduceTwoRankFP16PvalueMap}}, - {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, - {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, - {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, - {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}} }; SetTilingParam(cocTilingData, tilingParamMap); - cocTilingData.lenPerLoop = ALLREDUCE_LENPERLOOP_DEFAULT / RANKSIZE_EIGHT * cocTilingData.rankSize; - cocTilingData.lenPerLoop = RoundNum(cocTilingData.lenPerLoop, HALF_KBYTE); - cocTilingData.ubMoveNum = cocTilingData.lenPerLoop; + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; AllReduceSetWithSerialMode(cocTilingData); - SetSecondCoreSplitTling(cocTilingData); } } \ No newline at end of file -- Gitee From 87fdfa14c5949b9401ddf6447dfad6e054e30ded Mon Sep 17 00:00:00 2001 From: guanguan Date: Wed, 27 Aug 2025 15:51:58 +0800 Subject: [PATCH 327/414] add --- comm/lcal/include/lcoc/lcoc.h | 48 + comm/lcal/include/lcoc/lcoc_args.h | 98 ++ comm/lcal/include/lcoc/lcoc_base.h | 58 + comm/lcal/include/lcoc/lcoc_func.h | 29 + comm/lcal/include/lcoc/lcoc_workspace.h | 102 ++ comm/lcal/include/lcoc/tiling/tiling.h | 64 + comm/lcal/include/lcoc/tiling/tiling_91093.h | 22 + comm/lcal/include/lcoc/tiling/tiling_910B.h | 27 + comm/lcal/include/lcoc/tiling/tiling_args.h | 148 +++ comm/lcal/include/lcoc/tiling/tiling_func.h | 51 + comm/lcal/src/coc_kernel_args.cpp | 82 ++ comm/lcal/src/coc_kernel_args.h | 48 + comm/lcal/src/kernels/coc_add_bias_runner.cce | 291 +++++ .../coc_allgather_matmul_reduce_scatter.cce | 37 + comm/lcal/src/kernels/coc_allreduce.cce | 690 ++++++++++ comm/lcal/src/kernels/coc_comm_base.cce | 472 +++++++ comm/lcal/src/kernels/coc_const_args.cce | 134 ++ comm/lcal/src/kernels/coc_postprocessor.cce | 201 +++ comm/lcal/src/kernels/coc_ppmatmul.cce | 1143 +++++++++++++++++ comm/lcal/src/kernels/coc_ppmatmul_switch.cce | 104 ++ comm/lcal/src/lcoc_func.cpp | 83 ++ .../tiling/allgather_reducescatter_tiling.cpp | 309 +++++ comm/lcal/src/tiling/tiling_func.cpp | 284 
++++ 23 files changed, 4525 insertions(+) create mode 100644 comm/lcal/include/lcoc/lcoc.h create mode 100644 comm/lcal/include/lcoc/lcoc_args.h create mode 100644 comm/lcal/include/lcoc/lcoc_base.h create mode 100644 comm/lcal/include/lcoc/lcoc_func.h create mode 100644 comm/lcal/include/lcoc/lcoc_workspace.h create mode 100644 comm/lcal/include/lcoc/tiling/tiling.h create mode 100644 comm/lcal/include/lcoc/tiling/tiling_91093.h create mode 100644 comm/lcal/include/lcoc/tiling/tiling_910B.h create mode 100644 comm/lcal/include/lcoc/tiling/tiling_args.h create mode 100644 comm/lcal/include/lcoc/tiling/tiling_func.h create mode 100644 comm/lcal/src/coc_kernel_args.cpp create mode 100644 comm/lcal/src/coc_kernel_args.h create mode 100644 comm/lcal/src/kernels/coc_add_bias_runner.cce create mode 100644 comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce create mode 100644 comm/lcal/src/kernels/coc_allreduce.cce create mode 100644 comm/lcal/src/kernels/coc_comm_base.cce create mode 100644 comm/lcal/src/kernels/coc_const_args.cce create mode 100644 comm/lcal/src/kernels/coc_postprocessor.cce create mode 100644 comm/lcal/src/kernels/coc_ppmatmul.cce create mode 100644 comm/lcal/src/kernels/coc_ppmatmul_switch.cce create mode 100644 comm/lcal/src/lcoc_func.cpp create mode 100644 comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp create mode 100644 comm/lcal/src/tiling/tiling_func.cpp diff --git a/comm/lcal/include/lcoc/lcoc.h b/comm/lcal/include/lcoc/lcoc.h new file mode 100644 index 00000000..e5120735 --- /dev/null +++ b/comm/lcal/include/lcoc/lcoc.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_LCOC_H +#define LCAL_LCOC_H + +#include +#include +#include "lcoc_args.h" +#include "tiling_args.h" + +namespace Lcal { + class Lcoc { +public: + Lcoc() = delete; + explicit Lcoc(LcalComm &comm); + explicit Lcoc(LcalComm *comm); + ~Lcoc(); + int SetParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc); + int MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream = nullptr); + int AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, + void *workspace, aclrtStream stream = nullptr); + int64_t GetWorkspaceSize(); + LcalComm *GetComm(); + MatMulInfo &GetMatMulInfo(); + void GetTiling(CoCTiling &tiling); + +private: + int LaunchOperator(CoCInputPkg &inputPkg, CoCOutputPkg &outputPkg, void *workspace, aclrtStream stream); + bool CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg, LcalType lcalType) const; + bool CheckInputParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc) const; + void SetLcocParam(LcalType lcalType, const CoCParamDesc ¶mDesc); + void SetTaskParam(LcalType lcalType, const CoCParamDesc ¶mDesc, const LcalComm &comm); + +private: + LcalComm *comm_ = nullptr; + CoCTilingData tiling_ = {}; + TaskParam taskParam_ = {}; + bool tilingSuccess_ = false; +}; +} +#endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/lcoc_args.h b/comm/lcal/include/lcoc/lcoc_args.h new file mode 100644 index 00000000..f496a86d --- /dev/null +++ b/comm/lcal/include/lcoc/lcoc_args.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_LCOC_ARGS_H +#define LCAL_LCOC_ARGS_H + +#include +#include +#include + +constexpr int64_t WORKSPACE_REDUCE_SIZE = 4000000; +#pragma once +namespace Lcal { + const constexpr int32_t INT8_ELE_SIZE = 1; + const constexpr int32_t FP_BF_16_ELE_SIZE = 2; + constexpr uint32_t ALIGN_BYTES = 512; + constexpr int32_t PARAM_CHECK_MAX_VALUE = -1; + constexpr int32_t PARAM_CHECK_MIN_VALUE_ZERO = 0; + constexpr int32_t PARAM_CHECK_MIN_VALUE_ONE = 1; + constexpr int32_t INPUT_PARAM_DEFAULT_VALUE = -1; + constexpr int32_t MAX_M_VALUE = 10000000; + constexpr int32_t MAX_K_VALUE = 100000; + constexpr int32_t MAX_N_VALUE = 100000; + + enum CoCDataTypeDesc : int { + COC_DATA_TYPE_UNDEFINED = -1, + FP16FP16_FP32_FP16 = 0, + BF16BF16_FP32_BF16 = 1, + INT8INT8_INT32_FP16 = 2, + INT8INT8_INT32_BF16 = 3, + FP16INT8_INT32_FP16 = 4, + BF16INT8_INT32_BF16 = 5, + FP16INT8_FP32_FP16 = 6, + BF16INT8_FP32_BF16 = 7, + FP16INT4_FP32_FP16 = 8, + BF16INT4_FP32_BF16 = 9, + COC_DATA_TYPE_DESC_MAX = 10, + }; + + const std::map COC_TYPE2ELE_SIZE = { + { FP16FP16_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16BF16_FP32_BF16, FP_BF_16_ELE_SIZE }, + { INT8INT8_INT32_FP16, INT8_ELE_SIZE }, { INT8INT8_INT32_BF16, INT8_ELE_SIZE }, + { FP16INT8_INT32_FP16, INT8_ELE_SIZE }, { BF16INT8_INT32_BF16, INT8_ELE_SIZE }, + { FP16INT8_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16INT8_FP32_BF16, FP_BF_16_ELE_SIZE }, + { FP16INT4_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16INT4_FP32_BF16, FP_BF_16_ELE_SIZE } + }; + + const std::map COC_TYPE2HCCL_TYPE = { + { FP16FP16_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16BF16_FP32_BF16, HCCL_DATA_TYPE_BP16 }, + { INT8INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { INT8INT8_INT32_BF16, HCCL_DATA_TYPE_BP16 }, + { FP16INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_INT32_BF16, HCCL_DATA_TYPE_BP16 }, + { FP16INT8_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_FP32_BF16, HCCL_DATA_TYPE_BP16 }, + { FP16INT4_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT4_FP32_BF16, HCCL_DATA_TYPE_BP16 } + }; + + struct CoCParamDesc { + CoCDataTypeDesc dataTypeDesc = FP16FP16_FP32_FP16; + MatMulInfo mmInfo = {}; + QuantInfo quantInfo = {}; + PostInfo postInfo = {}; + HcclReduceOp op = HCCL_REDUCE_SUM; + TwoDimTPInfo twoDimTPInfo = {}; + }; + + struct CoCInputPkg { + void *matrixA = nullptr; + void *matrixB = nullptr; + void *bias = nullptr; + void *gamma = nullptr; + void *dequantScale = nullptr; + void *dequantOffset = nullptr; + + void *quantScale = nullptr; + void *quantOffset = nullptr; + }; + + struct CoCOutputPkg { + void *output = nullptr; + void *minOutput = nullptr; + }; + + struct TaskParam { + int32_t rank = -1; + int32_t rankSize = -1; + int32_t blockDim = -1; + int32_t bufferSize = -1; + ChipName chipName = ChipName::CHIP_910B3; + CoCParamDesc cocParamDesc = {}; + LcalType lcalType = LcalType::ALL_REDUCE; + }; +} +#endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/lcoc_base.h b/comm/lcal/include/lcoc/lcoc_base.h new file mode 100644 index 00000000..2dd25d84 --- /dev/null +++ b/comm/lcal/include/lcoc/lcoc_base.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef LCAL_LCOC_BASE_H
+#define LCAL_LCOC_BASE_H
+
+#include
+
+#pragma once
+namespace Lcal {
+enum QuantGranularity : int {
+    QUANT_GRANULARITY_UNDEFINED = -1,
+    PER_TENSOR = 0,
+    PER_CHANNEL = 1,
+    PER_GROUP = 2,
+    PER_TOKEN = 3,
+    FLOAT32_SCALE_PER_CHANNEL = 4,
+    QUANT_GRANULARITY_MAX = 5,
+};
+
+struct MatMulInfo {
+    int64_t batchSize = 1;
+    int64_t m = -1;
+    int64_t k = -1;
+    int64_t n = -1;
+    bool transA = false;
+    bool transB = false;
+    bool withBias = false;
+    bool isInt8 = false;
+    bool weightNz = false;
+};
+
+struct TwoDimTPInfo {
+    int32_t agDim = -1;
+    int32_t rsDim = -1;
+    bool innerDimIsAg = true;
+};
+
+struct QuantInfo {
+    QuantGranularity dequantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED;
+    int32_t dequantGroupSize = -1;
+
+    QuantGranularity quantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED;
+    int32_t quantGroupSize = -1;
+};
+
+struct PostInfo {
+    int32_t withRmsNorm = 0;
+};
+}
+#endif
\ No newline at end of file
diff --git a/comm/lcal/include/lcoc/lcoc_func.h b/comm/lcal/include/lcoc/lcoc_func.h
new file mode 100644
index 00000000..b0c52dd1
--- /dev/null
+++ b/comm/lcal/include/lcoc/lcoc_func.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef LCAL_LCOC_FUNC_H
+#define LCAL_LCOC_FUNC_H
+
+#include
+#include
+#include
+#include
+#include
+
+#pragma once
+namespace Lcal {
+    bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max);
+    bool CheckParamScopeList(std::vector> paramCheckList);
+    bool CheckParamAlign(const std::string &name, const int &value, const int &align);
+    void PrintErrorLog(LcalType lcalType, const std::string &log);
+    bool CheckParamPowerOfTwo(const std::string &name, int value);
+}
+
+#endif
\ No newline at end of file
diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h
new file mode 100644
index 00000000..f254d951
--- /dev/null
+++ b/comm/lcal/include/lcoc/lcoc_workspace.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LCAL_LCOC_WORKSPACE_H
+#define LCAL_LCOC_WORKSPACE_H
+
+#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV_C310__)
+#define __aicore__
+#define GM_ADDR int64_t
+#endif
+
+struct LcalWorkspaceInfo {
+    GM_ADDR gm_reducebuf{ 0 };
+    GM_ADDR gm_a_align{ 0 };
+    GM_ADDR gm_b_algn{ 0 };
+    GM_ADDR gm_accum{ 0 };
+    GM_ADDR gm_formate_dequant_scale{ 0 };
+    GM_ADDR gm_dequant_param{ 0 };
+
+    GM_ADDR workspaceSize {0};
+};
+
+inline __aicore__ int32_t AlignUp(int32_t len, int32_t size)
+{
+    return (len + size - 1) & ~(size - 1);
+}
+
+#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV_C310__)
+inline uint64_t GetDequantWorkSpaceSize(Lcal::LcalType lcalType, int32_t withSerialMode, int32_t m, int32_t n,
+    int32_t m0, int32_t n0, int32_t pValue, int32_t nLoop, int32_t rankSize, int32_t blockDim,
+    int32_t maxOutputSize = -1)
+{
+    constexpr int32_t TWO = 2;
+    uint64_t dequantWorkSpaceSize = 0;
+    if (withSerialMode > 0) {
+        dequantWorkSpaceSize = (maxOutputSize == -1 ? m : maxOutputSize) * n * sizeof(int32_t);
+    } else {
+        if (lcalType == Lcal::LcalType::MATMUL_ALL_REDUCE) {
+            dequantWorkSpaceSize = pValue * blockDim * m0 * n0 * TWO * sizeof(int32_t);
+        } else {
+            dequantWorkSpaceSize = (maxOutputSize == -1 ? m : maxOutputSize) * n * sizeof(int32_t);
+        }
+    }
+    return dequantWorkSpaceSize;
+}
+#endif
+
+inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, int32_t batchSize, int32_t m,
+    int32_t k, int32_t n, int32_t mAlign, int32_t kAlign, int32_t nAlign, bool transa, bool transb,
+    int32_t mmadSize, bool hasAAlign, bool hasBAlign, int32_t accumRankSize, bool hasAccum = false,
+    uint64_t dequantWorkSpaceSize = 0, bool hasDequantParam = false, bool hasFormatDequantScale = false,
+    bool isDeterministic = false,
+    int32_t isMoe = false, int32_t is_alltoallvc = false,
+    int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1)
+{
+    if (outputSize == -1) {
+        outputSize = m;
+    }
+    constexpr int32_t ALIGN8 = 8;
+    LcalWorkspaceInfo lcalWorkspaceInfo;
+    lcalWorkspaceInfo.gm_reducebuf = gmWorkSpace;
+    GM_ADDR workspaceOffset = gmWorkSpace;
+    if (isDeterministic) {
+        workspaceOffset += WORKSPACE_REDUCE_SIZE;
+    }
+
+    if (hasAAlign) {
+        lcalWorkspaceInfo.gm_a_align = workspaceOffset;
+        workspaceOffset += static_cast<int64_t>(batchSize) * (transa ? k * mAlign : m * kAlign) * mmadSize;
+    }
+
+    if (hasBAlign) {
+        lcalWorkspaceInfo.gm_b_algn = workspaceOffset;
+        workspaceOffset += static_cast<int64_t>(batchSize) * (transb ? n * kAlign : k * nAlign) * mmadSize *
+            (expertPerRank <= 0 ? 1 : expertPerRank);
+    }
+
+    if (hasDequantParam) {
+        lcalWorkspaceInfo.gm_dequant_param = workspaceOffset;
+        workspaceOffset += sizeof(int32_t) * AlignUp(n, ALIGN8);
+    }
+
+    if (hasFormatDequantScale) {
+        lcalWorkspaceInfo.gm_formate_dequant_scale = workspaceOffset;
+        workspaceOffset += sizeof(float) * AlignUp(n, ALIGN8);
+    }
+
+    if (hasAccum) {
+        lcalWorkspaceInfo.gm_accum = workspaceOffset;
+        workspaceOffset += dequantWorkSpaceSize;
+    }
+    lcalWorkspaceInfo.workspaceSize = workspaceOffset;
+    return lcalWorkspaceInfo;
+}
+
+#endif
\ No newline at end of file
diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/comm/lcal/include/lcoc/tiling/tiling.h
new file mode 100644
index 00000000..a41a8d94
--- /dev/null
+++ b/comm/lcal/include/lcoc/tiling/tiling.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LCAL_TILING_H
+#define LCAL_TILING_H
+
+#include
+#include
+#include "tiling_args.h"
+#include "lcal_types.h"
+#include "lcal_comm.h"
+#include "lcoc.h"
+
+namespace Lcal {
+class CoCTilingFunc {
+public:
+    CoCTilingFunc(const CoCTilingFunc &) = delete;
+    CoCTilingFunc &operator = (const CoCTilingFunc &) = delete;
+    CoCTilingFunc() {}
+    virtual ~CoCTilingFunc() {}
+    CoCTilingData GenerateTiling(const TaskParam &taskParam, const CoCTiling &tiling);
+
+    virtual bool CheckTiling(const TaskParam &taskParam);
+    virtual void GetDefaultTiling(const TaskParam &taskParam);
+
+protected:
+    CoCTilingData cocTilingData = {};
+};
+
+class CoCMatmullReduceTilingFunc : public CoCTilingFunc {
+public:
+    CoCMatmullReduceTilingFunc(const CoCMatmullReduceTilingFunc &) = delete;
+    CoCMatmullReduceTilingFunc &operator = (const CoCMatmullReduceTilingFunc &) = delete;
+    CoCMatmullReduceTilingFunc() {}
+    bool CheckTiling(const TaskParam &taskParam) override;
+    void GetDefaultTiling(const TaskParam &taskParam) override;
+};
+
+class CoCMatmullReduceDeterTilingFunc : public CoCTilingFunc {
+public:
+    CoCMatmullReduceDeterTilingFunc(const CoCMatmullReduceDeterTilingFunc &) = delete;
+    CoCMatmullReduceDeterTilingFunc &operator = (const CoCMatmullReduceDeterTilingFunc &) = delete;
+    CoCMatmullReduceDeterTilingFunc() {}
+    bool CheckTiling(const TaskParam &taskParam) override;
+    void GetDefaultTiling(const TaskParam &taskParam) override;
+};
+
+class CoCAllgatherMatnulReduceScatterTilingFunc : public CoCTilingFunc {
+public:
+    CoCAllgatherMatnulReduceScatterTilingFunc(const CoCAllgatherMatnulReduceScatterTilingFunc &) = delete;
+    CoCAllgatherMatnulReduceScatterTilingFunc &operator = (const CoCAllgatherMatnulReduceScatterTilingFunc &) = delete;
+    CoCAllgatherMatnulReduceScatterTilingFunc() {}
+    bool CheckTiling(const TaskParam &taskParam) override;
+    void GetDefaultTiling(const TaskParam &taskParam) override;
+};
+}
+
+#endif
\ No newline at end of file
diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/comm/lcal/include/lcoc/tiling/tiling_91093.h
new file mode 100644
index 00000000..331ae5ec
--- /dev/null
+++ b/comm/lcal/include/lcoc/tiling/tiling_91093.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */ + +#ifndef LCAL_TILING_91093_H +#define LCAL_TILING_91093_H + +#include "tiling_args.h" +namespace Lcal { + void AllReduceNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData); + void AllReduceNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); + + void CoCAllgatherMatmulReduceScatterAgEightRsTwoTiling(CoCTilingData &cocTilingData); + void CoCAllgatherMatmulReduceScatterDefaultTiling(CoCTilingData &cocTilingData); +} +#endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_910B.h b/comm/lcal/include/lcoc/tiling/tiling_910B.h new file mode 100644 index 00000000..8c75adbd --- /dev/null +++ b/comm/lcal/include/lcoc/tiling/tiling_910B.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef LCAL_TILING_910B_H +#define LCAL_TILING_910B_H + +#include "tiling_args.h" +namespace Lcal { + void AllReduceGetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceFourRankInt8GetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceFourRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceEightRankINT8GetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceTwoRankFP16Tiling(CoCTilingData &cocTilingData); + + void ReduceScatterEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); + void ReduceScatterFoutRankINT8Tiling(CoCTilingData &cocTilingData); + +} +#endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/comm/lcal/include/lcoc/tiling/tiling_args.h new file mode 100644 index 00000000..87b21e81 --- /dev/null +++ b/comm/lcal/include/lcoc/tiling/tiling_args.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef LCAL_TILING_ARGS_H +#define LCAL_TILING_ARGS_H + +#include "lcoc_base.h" + +#pragma once +namespace Lcal { + constexpr int32_t MAX_CORE_NUM = 20; + constexpr int32_t MAX_L2_SIZE = 192 * 1024 * 1024; + constexpr int32_t MAX_L0CSIZE = 128 * 1024; + constexpr int32_t HBM_BM = 1; + constexpr int32_t L2_BW = 5; + constexpr int32_t BYTE_512 = 512; + constexpr int32_t MAX_UB_NUM = 97280; + constexpr int32_t MIN_UB_NUM = 256; + constexpr int32_t A3_DIE_NUM = 2; + constexpr int32_t DEFAULT_P_VALUE = 1; + constexpr int32_t MIN_P_VALUE = 1; + constexpr int32_t MAX_P_VALUE = 15; + constexpr int32_t TAG_MOD = 10000; + constexpr int32_t SWIZZLE_COUNT_FOUR = 4; + constexpr int32_t DEFAULT_SWIZZLE_COUNT = 7; + constexpr int32_t SWIZZLE_DIRECT_ZERO = 0; + constexpr int32_t SWIZZLE_DIRECT_ONE = 1; + constexpr int32_t COMM_DATA_DIRECT = 0; + constexpr int32_t COMM_NPU_DIRECT = 1; + constexpr int32_t COMMNPUSPLIT_ONE = 1; + constexpr int32_t COMMNPUSPLIT_TWO = 2; + constexpr int32_t COMMNPUSPLIT_THREE = 3; + constexpr int32_t COMMNPUSPLIT_EIGHT = 8; + constexpr int32_t COMMNPUSPLIT_FOUR = 4; + constexpr int32_t COMMDATASPLIT_ONE = 1; + constexpr int32_t COMMDATASPLIT_TWO = 2; + constexpr int32_t COMMDATASPLIT_FOUR = 4; + constexpr int32_t COMMDATASPLIT_EIGHT = 8; + constexpr int32_t COMMDATASPLIT_SIXTEEN = 16; + constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; + constexpr int32_t AXES_ALIGN_SIZE_INT8 = 128; + constexpr int32_t DEFAULT_ROW = 128; + constexpr int32_t DEFAULT_COL = 256; + constexpr int32_t AXES_ALIGN_SIZE = 512; + constexpr int32_t BASE_BLOCK_STEP = 2; + constexpr int32_t INPUT_DTYPE = 2; + constexpr int32_t MAX_BLOCK_COUNT =2; + constexpr int32_t BLOCK_COUNT_3 = 3; + constexpr int32_t FP16_SIZE = 2; + constexpr int32_t FP32_SIZE = 4; + constexpr int32_t BLOCK_SIZE = 16; + constexpr int32_t BLOCK_SIZE_K = 32; + constexpr int32_t ND_SHAPE_SIZE = 2; + constexpr int32_t NZ_SHAPE_SIZE = 4; + constexpr int32_t CUBE_BLOCK_SIZE_INT8 = 512; + constexpr int32_t CUBE_BLOCK_SIZE = 256; + constexpr int32_t MIN_UB_MOVE_NUM = 5120; + constexpr int32_t VALID_UB_MOVE_NUM = 20480; + constexpr int32_t L1AB_PINGPONG_BUFFER_LEN_FP16 = 131072; + constexpr int32_t HALF_KBYTE = 512; + constexpr int32_t SECOND_TO_MS = 1e3; + constexpr int64_t MATMUL_BASE_100US = static_cast(1024) * 8192 * 1024; + constexpr int64_t ALLREDUCE_BASE_100US = 4096 * 1024; + constexpr double ONE_K = 1024.0; + constexpr double B1_FLOP_PER_MS = (364 * 0.8) * 1e9; + constexpr double DOUBLE = 2.0; + constexpr double HALF_PROB = 0.5; + constexpr int32_t CONDITION_M_ST = 0; + constexpr int32_t CONDITION_M_END = 1; + constexpr int32_t CONDITION_K_ST = 2; + constexpr int32_t CONDITION_K_END = 3; + constexpr int32_t CONDITION_N_ST = 4; + constexpr int32_t CONDITION_N_END = 5; + constexpr int32_t RANKSIZE_TWO = 2; + constexpr int32_t RANKSIZE_FOUR = 4; + constexpr int32_t RANKSIZE_EIGHT = 8; + constexpr int32_t RANKSIZE_SIXTEEN = 16; + constexpr int32_t DIV_TWO = 2; + constexpr int32_t LENPERLOOP_DEFAULT = 5120; + constexpr int32_t ALLREDUCE_LENPERLOOP_DEFAULT = 5120; + constexpr int32_t TREE_LEN_PER_LOOP = 20480; + constexpr int32_t DIM_EIGHT = 8; + constexpr int32_t DIM_TWO = 2; + constexpr int32_t DEFAULT_SPLIT_K = 0; + constexpr int32_t NUM_TWO = 2; + + struct CoCTiling { + int32_t m0 = -1; + int32_t k0 = -1; + int32_t n0 = -1; + int32_t swizzlCount = -1; + int32_t swizzlDirect = -1; + int32_t pValue = -1; + int32_t ubMoveNum = -1; + int32_t commNpuSplit = -1; + int32_t commDataSplit = -1; + int32_t commDirect = -1; + int32_t 
lenPerLoop = -1;
+        int32_t extraUbMoveNum = -1;
+        int32_t extraCommNpuSplit = -1;
+        int32_t extraCommDataSplit = -1;
+        int32_t extraCommDirect = -1;
+        int32_t extraLenPerLoop = -1;
+        int32_t splitK = -1;
+        int32_t write2OtherRank = -1;
+        int32_t withSerialMode = -1;
+
+        int32_t is91093 = -1;
+        int32_t bufferSize = -1;
+    };
+
+    struct CoCTilingData : CoCTiling {
+        int64_t m = -1;
+        int64_t k = -1;
+        int64_t n = -1;
+        int64_t batchSize = -1;
+
+        int32_t blockDim = -1;
+        int32_t rank = -1;
+        int32_t rankSize = -1;
+        int32_t tag = -1;
+
+        int32_t mLoop = -1;
+        int32_t kLoop = -1;
+        int32_t nLoop = -1;
+        int32_t coreLoop = -1;
+        uint32_t tilingKey = -1;
+
+        const char* ToString() const;
+        void SetDefaultValue();
+    };
+
+    struct CoCkernelParm {
+        CoCTilingData cocTilingData = {};
+        QuantInfo quantInfo = {};
+        TwoDimTPInfo twoDimTPInfo = {};
+        PostInfo postInfo = {};
+        bool weightNz = false;
+    };
+
+}
+#endif
\ No newline at end of file
diff --git a/comm/lcal/include/lcoc/tiling/tiling_func.h b/comm/lcal/include/lcoc/tiling/tiling_func.h
new file mode 100644
index 00000000..ee10892b
--- /dev/null
+++ b/comm/lcal/include/lcoc/tiling/tiling_func.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */ +#ifndef LCAL_TILING_FUNC_H +#define LCAL_TILING_FUNC_H + +#include +#include +#include +#include "lcoc_args.h" +#include "lcal_types.h" +#include "tiling_args.h" + +#pragma once +namespace Lcal { + struct TilingValue { + int32_t value = -1; + std::map>> conditionMap = {}; + }; + + int32_t CeilDev(int32_t num, int32_t div); + int32_t RoundNum(int32_t num, int32_t rnd); + void UpdateTilingValue(const int32_t &tilingParam, int32_t &tilingDataParam); + double GetMTETime(double mknGB, int32_t m0, int32_t n0, double aBindWidth = 3.0, double bBindWidth = 3.0); + int32_t GetValueFromMKNConditionMap(int32_t m, int32_t k, int32_t n, + int32_t defaultValue, + std::map>> conditionMap); + bool Is910B(const ChipName &chipName); + bool Is91093(const ChipName &chipName); + uint32_t GetTilingKey(const MatMulInfo &mmInfo, CoCTilingData &tilingData); + void DealTilingParamByBuffSize(CoCTilingData &cocTilingData); + int ClampValue(int32_t value, int32_t min, int32_t max); + void SetTilingParam(CoCTilingData &cocTilingData, const std::map& tilingParamMap); + void SetSecondCoreSplitTling(CoCTilingData &cocTilingData); + void SetTilingParam2D(CoCTilingData &cocTilingData, const std::map& tilingParamMap); + bool CheckCoCTiling(const CoCTiling &tiling); + bool CheckCoCTilingData(const CoCTilingData &tilingData); + void TransformCoCTiling(const CoCTiling &tiling, CoCTilingData &tilingData); + void CalTilingParam(const MatMulInfo &mmInfo, CoCTilingData &tilingData); + void SetTilingInputParam(const TaskParam &taskParam, CoCTilingData &tilingData); + void SetTilingData(const TaskParam &taskParam, const CoCTiling &tiling, CoCTilingData &tilingData); + +} + +#endif \ No newline at end of file diff --git a/comm/lcal/src/coc_kernel_args.cpp b/comm/lcal/src/coc_kernel_args.cpp new file mode 100644 index 00000000..fc4b60e0 --- /dev/null +++ b/comm/lcal/src/coc_kernel_args.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include "coc_kernel_args.h"
+#include
+#include
+#include
+#include
+#include "tiling.h"
+#include "lcal_internal.h"
+using namespace Mki;
+
+namespace Lcal {
+int CoCkernelArgs::SetFFTSAddr()
+{
+    uint32_t fftsLen;
+    int error = MkiRtGetC2cCtrlAddr(&fftsAddr, &fftsLen);
+    if (error != MKIRT_SUCCESS) {
+        MKI_LOG(ERROR) << "MkiRtGetC2cCtrlAddr err";
+        return LCAL_ERROR_MKIRT;
+    }
+    return LCAL_SUCCESS;
+}
+
+void CoCkernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg)
+{
+    matrixA = inputPkg.matrixA;
+    matrixB = inputPkg.matrixB;
+    bias = inputPkg.bias;
+    gamma = inputPkg.gamma;
+    dequantScale = inputPkg.dequantScale;
+    dequantOffset = inputPkg.dequantOffset;
+    quantScale = inputPkg.quantScale;
+    quantOffset = inputPkg.quantOffset;
+}
+
+void CoCkernelArgs::SetOutputPkgArgs(CoCOutputPkg &outputPkg)
+{
+    output = outputPkg.output;
+    midOutput = outputPkg.minOutput;
+}
+
+void CoCkernelArgs::SetWorkspacePtrArg(void *workspacePtr)
+{
+    workspace = workspacePtr;
+}
+
+void CoCkernelArgs::SetParamDescArgs(const CoCParamDesc &paramDesc)
+{
+    cockernelParm.quantInfo = paramDesc.quantInfo;
+    cockernelParm.twoDimTPInfo = paramDesc.twoDimTPInfo;
+    cockernelParm.postInfo = paramDesc.postInfo;
+    cockernelParm.weightNz = paramDesc.mmInfo.weightNz;
+}
+
+void CoCkernelArgs::SetCommArgs(const LcalComm &comm)
+{
+    commArgsPtr = comm.GetCommArgsPtr();
+}
+
+void CoCkernelArgs::SetCoCTilingDataArgs(const CoCTilingData &tilingData)
+{
+    pCocTiling = &(cockernelParm.cocTilingData);
+    cockernelParm.cocTilingData = tilingData;
+}
+
+std::string CoCkernelArgs::ParamToString()
+{
+    std::string quantInfoString = "[QuantInfo]: dequantGranularity=" +
+        std::to_string(cockernelParm.quantInfo.dequantGranularity) + "\n";
+    std::string weightNzInfoString = "[weightNz]: weightNz=" +
+        std::to_string(cockernelParm.weightNz) + "\n";
+    std::string tilingInfoString = cockernelParm.cocTilingData.ToString();
+    return quantInfoString + weightNzInfoString + tilingInfoString;
+}
+}
\ No newline at end of file
diff --git a/comm/lcal/src/coc_kernel_args.h b/comm/lcal/src/coc_kernel_args.h
new file mode 100644
index 00000000..873833fc
--- /dev/null
+++ b/comm/lcal/src/coc_kernel_args.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */ + +#ifndef LCAL_COC_KERNEL_ARGS_H +#define LCAL_COC_KERNEL_ARGS_H + +#include +#include "tiling_args.h" +#include "lcal_comm.h" +#include "lcoc_args.h" + +namespace Lcal { + struct CoCkernelArgs { + void *matrixA = nullptr; + void *matrixB = nullptr; + void *bias = nullptr; + void *gamma = nullptr; + void *output = nullptr; + void *midOutput = nullptr; + void *workspace = nullptr; + void *dequantScale = nullptr; + void *dequantOffset = nullptr; + void *quantScale = nullptr; + void *quantOffset = nullptr; + void *commArgsPtr = nullptr; + uint64_t fftsAddr = 0; + + CoCTilingData *pCocTiling = nullptr; + CoCkernelParm cockernelParm = {}; + int SetFFTSAddr(); + void SetInputPkgArgs(CoCInputPkg &inputPkg); + void SetOutputPkgArgs(CoCOutputPkg &outputPkg); + void SetWorkspacePtrArg(void *workspacePtr); + void SetParamDescArgs(const CoCParamDesc ¶mDesc); + void SetCommArgs(const LcalComm &comm); + void SetCoCTilingDataArgs(const CoCTilingData &tilingData); + std::string ParamToString(); + } +} + +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/comm/lcal/src/kernels/coc_add_bias_runner.cce new file mode 100644 index 00000000..ed887ed2 --- /dev/null +++ b/comm/lcal/src/kernels/coc_add_bias_runner.cce @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef __COC_ADD_BIAS_RUNNER__ +#define __COC_ADD_BIAS_RUNNER__ + +#ifdef __DAV_C220_VEC__ + +#include +#include "coc_internal.cce" + +enum class BiasMode { ADD = 0, MOVE, ATOMIC_ADD }; + +template +class BaseSerialBiasAdder { +public: + __aicore__ explict BaseSerialBiasAdder() = default; + + inline __aicore__ void SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) + { + this->gm_out = reinterpret_cast<__gm__ OutputDtype *>(gm_out); + this->gm_bias = reinterpret_cast<__gm__ OutputDtype *>(gm_bias); + + this->batch_size = batch_size; + this->m = m; + this->n = n; + + int32_t align_core_num = get_block_num() * get_subblockdim(); + int32_t align_core_dix = get_block_dix() * get_subblockdim() + get_subblockid(); + + if constexpr (MODE == BiasMode::MOVE || MODE == BiasMode::ATOMIC_ADD) { + max_len = Block32B::AlignDown(MAX_UB_BUFF / sizeof(OutputDtype)); + } else if constexpr (MODE == BiasMode::ADD) { + max_len = Block32B::AlignDown(MAX_UB_BUFF / sizeof(OutputDtype) * 3); + } + + int32_t n_round = Block32B::AlignUp(n); + m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; + n_per_loop = (n_round <= max_len) ? 
n : max_len; + + int32_t m_per_core_base = m / align_core_num; + int32_t m_remainder = m % align_core_num; + int32_t m_offset_base = align_core_dix * m_per_core_base; + if (align_core_dix < m_remainder) { + m_this_core = m_per_core_base + 1; + m_offset_this_core = m_offset_base + align_core_dix; + } else { + m_this_core = m_per_core_base; + m_offset_this_core = m_offset_base + m_remainder; + } + } + + inline __aicore__ void Run() + { + if constexpr (MODE == BiasMode::ADD) { + AddBias(); + } else if constexpr (MODE == BiasMode::ATOMIC_ADD) { + SetAtomicAdd(); + PipeBarrier(); + MoveBias(); + SetAtomicNone(); + PipeBarrier(); + } + } + + inline __aicore__ void Barrier() + { + FFTSCrossCoreSync(0, AIV_FINISH_ADD_BIAS_FLAG_ID); + WaitEvent(AIV_FINISH_ADD_BIAS_FLAG_ID); + } + +private: + inline __aicore__ void AddBias() + { + if constexpr (MODE != BiasMode:ADD) { + return; + } + + auto ub_bias = reinterpret_cast<__ubuf__ OutputDtype *>((uintptr_t)0); + auto ub_out1 = reinterpret_cast<_ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype))); + auto ub_out2 = reinterpret_cast<_ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype) * 2)); + bool ping = true; + + for (int32_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + for (int32_t n_complete = 0, n_this_loop = n_per_loop; n_complete < n; n_complete += n_this_loop) { + n_this_loop = (n_complete + n_this_loop > n) ? (n - n_complete) : n_this_loop; + + // MTE2 ub_bias <- gm_bias + CopyGmToUbufAlign(ub_bias, gm_bias + n_complete, 1, n_this_loop, 0); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + BroadcastBiasforbias(ub_bias, n_this_loop); + + PipeBarrier(); + + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + + ProcessMLoop(n_complete, n_this_loop, ub_out1, ub_out2, ping, ub_bias); + + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + ping = !ping; + } + } + } + + inline __aicore__ void BroadcastBiasforbias(__ubuf__ OutputDtype *ub_bias, int32_t n_this_loop) + { + for (int32_t row_idx = 1; row_idx < m_per_loop; ++row_idx) { + CopyUB2UB(ub_bias + row_idx * Block32B::AlignUp(n), ub_bias, 0, 1, + Block32B::Count(n_this_loop), 0, 0); + } + } + + inline __aicore__ void ProcessMLoop(int32_t n_complete, int32_t n_this_loop, __ubuf__ OutputDtype *ub_out1, + __ubuf__ OutputDtype *ub_out2, bool ping, __ubuf__ OutputDtype *ub_bias) + { + for (int32_t m_complete = 0, m_this_loop = m_per_loop; m_complete < m_this_core; m_complete += m_this_loop) { + m_this_loop = (m_complete + m_this_loop > m_this_core) ? (m_this_core - m_complete) : m_this_loop; + + auto ub_out = ping ? ub_out1 : ub_out2; + auto event_id = ping ? 
EVENT_ID1 : EVENT_ID2; + int32_t out_offset = (m_offset_this_core + m_complete) * n + n_complete; + + WaitFlag(event_id); + + // MTE2: ub_out <- gm_out + CopyGmToUbufAlign(ub_out, gm_out + out_offset, m_this_loop, n_this_loop, n - n_this_loop); + + SetFlag(event_id); + WaitFlag(event_id); + + // V: ub_out <- ub_out + ub_bias + AddBiasToOutput(ub_out, ub_bias, m_this_loop, n_this_loop); + + SetFlag(event_id); + WaitFlag(event_id); + + // MTE3: gm_out <- ub_out + CopyUbufToGmAlign(gm_out + out_offset, ub_out, m_this_loop, n_this_loop, n - n_this_loop); + + SetFlag(event_id); + } + } + + inline __aicore__ void AddBiasToOutput(__ubuf__ OutputDtype *ub_out, __ubuf__ OutputDtype *ub_bias, + int32_t m_this_loop, int32_t n_this_loop) + { + int32_t n_blocks = m_this_loop * Block32B::Count(n_this_loop); + int32_t repeat_times = DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT); + uint8_t repeat = UINT8_MAX; + for (int32_t repeat_complete = 0; repeat_complete < repeat_times; repeat_complete += repeat) { + repeat = (repeat_complete + repeat > repeat_times) ? (repeat_times - repeat_complete) : repeat; + + int32_t vadd_offset = repeat_complete * Block256B::size; + Vadd(ub_out + vadd_offset, ub_out + vadd_offset, ub_bias + vadd_offset, repeat, 1, 1, 1, 8, 8, 8); + } + } + + inline __aicore__ void MoveBias() + { + if constexpr (MODE != BiasMode::MOVE && MODE != BiasMode::ATOMIC_ADD) { + return; + } + + auto ub_base = reinterpret_cast<__ubuf__ OutputDtype *>((uintptr_t)0); + + for (int32_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + ProcessBias(ub_base); + } + } + + inline __aicore__ void ProcessBias(__ubuf__ OutputDtype *ub_base) + { + int32_t n_this_loop = n_per_loop; + for (int32_t n_complete = 0; n_complete < n; n_complete += n_this_loop) { + if (n_complete + n_this_loop > n) { + n_this_loop = n - n_complete; + } + + // MTEs: ub_base <- gm_bias + CopyGmToUbufAlign(ub_base, gm_bias + n_complete, 1, n_this_loop, 0); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + BroadcastBias(ub_base, n_this_loop); + + // MTE3: gm_out <- ub_base + CopyBiasToOutput(n_complete, n_this_loop, ub_base); + } + } + + inline __aicore__ void BroadcastBias(__ubuf__ OutputDtype *ub_base, int32_t n_this_loop) + { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + for (int32_t row_idx = 1; row_idx < m_per_loop; ++row_idx) { + CopyUB2UB(ub_base + row_idx *Block32B::AlignUp(n), ub_base, 0, 1, + Block32B::Count(n_this_loop), 0, 0); + } + } + + inline __aicore__ void CopyBiasToOutput(int32_t n_complete, int32_t n_this_loop, __ubuf__ OutputDtype *ub_base) + { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int32_t m_this_loop = m_per_loop; + for (int32_t m_complete = 0; m_complete < m_this_core; m_complete += m_this_loop) { + if (m_complete + m_this_loop > m_this_core) { + m_this_loop = m_this_core - m_complete; + } + + CopyUbufToGmAlign(gm_out + (m_offset_this_core + m_complete) * n + n_complete, ub_base, m_this_loop, + n_this_loop, n - n_this_loop); + } + } + + __gm__ OutputDtype *gm_out; + __gm__ OutputDtype *gm_bias; + + int32_t batch_size; + int32_t m; + int32_t n; + + int32_t m_this_core; + int32_t m_offset_this_core; + + int32_t m_per_loop; + int32_t n_per_loop; + + int32_t max_len; + int32_t repeat_per_loop; +}; + +template +class PureMatmulBiasAdder { + static constexpr auto MODE = std::is_same::value ? 
BiasMode::ADD : BiasMode::ATOMIC_ADD; + +public: + __aicore__ explicit PureMatmulBiasAdder() = default; + + inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) + { + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()); + } + + inline void __aicore__ Run() + { + base_adder.Run(); + base_adder.Barrier(); + } + +private: + BaseSerialBiasAdder base_adder; +}; + +template +class MatmulAllReduceBiasAdder { +public: + __aicore__ explicit MatmulAllReduceBiasAdder() = default; + + inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) + { + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()); + } + + inline void __aicore__ Run() + { + base_adder.Run(); + base_adder.Barrier(); + } + +private: + BaseSerialBiasAdder base_adder; +}; \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce b/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce new file mode 100644 index 00000000..265667ae --- /dev/null +++ b/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __CCE_KT_TEST__ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + +#include "coc_ppmatmul_switch.cce" +#include "coc_allgather_reducescatter.cce" +#ifdef __DAV_C220_CUBE__ + +// Matmul in LcalAllGatherMatmulReduceScatter +#define COC_ALL_GATHER_MATMUL_REDUCESCATTER_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmulReduceScatter_##type##_mix_aic(COC_ARGS_FUN(type)) { \ + return CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +#elif __DAV_C220_VEC__ +#define COC_ALL_GATHER_MATMUL_REDUCESCATTER_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmulReduceScatter_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ + return CocAllGatherMatmulReduceScatterAiv(COC_ARGS_CALL()); \ +} +#endif + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_ALL_GATHER_MATMUL_REDUCESCATTER_FUNC_AUTO_DEF); +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/comm/lcal/src/kernels/coc_allreduce.cce new file mode 100644 index 00000000..cc740acb --- /dev/null +++ b/comm/lcal/src/kernels/coc_allreduce.cce @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifdef __DAV_C220_VEC__ +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class AllReduce : public CocCommBase { +public: + __aicore__ explicit AllReduce(){}; + + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) + { + CocCommBase::SetArgsForReduce(COC_ARGS_CALL()); + preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL()); + postprocessor.SetArgs(PP_MATMUL_AIV_POST_ARGS_CALL()); + if constexpr (HAVE_BIAS) { + add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); + } + need_dequant = workspace_info.gm_accum; + if (need_dequant) { + if (withSerialMode) { + serial_dequant_runner.SetArgs(reinterpret_cast<__gm__ bfloat16_t *>(buff[rank]), workspace_info, + reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale), + reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset), + dequant_granularity, batch_size, m, n); + } else { + fused_dequant_runner.SetArgs(reinterpret_cast<__gm__ bfloat16_t *>(buff[rank]), workspace_info, + reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale), + reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset), dequant_granularity, + batch_size, m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, + swizzl_count, p_value, rank_size); + } + } + if (dequant_granularity == QuantGranularity::PER_TOKEN) { + fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]), + reinterpret_cast<__gm__ int64_t *>(gm_quant_scale), + m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, + swizzl_count, p_value, rank_size); + serial_pertokrn_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ int64_t *>(gm_quant_scale), m, n, m0, n0); + } + total_core_idx = aiv_idx * core_num + core_idx; + cal_count = DivCeil(core_loop, loop_num_per_comm); + } + + FORCE_INLINE_AICORE int32_t GetCoreGroup() { + if (total_core_idx < core_count) { + return 0; + } + if (total_core_idx < core_count + SIO_TOTAL_CORE_NUM) { + return 1; + } + return -1; + } + + FORCE_INLINE_AICORE void InitFlags() { + if constexpr (HAVE_BIAS) { + SetAtomicAdd(); + PipeBarrier(); + } + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + + FORCE_INLINE_AICORE void EndFlagsAndBias() { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + + if constexpr (HAVE_BIAS) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + SetAtomicNone(); + PipeBarrier(); + } + } + + FORCE_INLINE_AICORE void StartBeforeFisrtStep(uint64_t flag_idx) { + SetAndWaitAivSync(flag_idx, is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT); + SetAtomicAdd(); + PipeBarrier(); + } + + FORCE_INLINE_AICORE void EndFirstStep(uint64_t flag_idx) { + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + SetAtomicNone(); + PipeBarrier(); + SetAndWaitAivSync(flag_idx, is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT); + } + + FORCE_INLINE_AICORE void SecondStepParallel(int32_t data_size_remain, __gm__ T* input, int32_t gm_out_offset) { + if (data_size_remain <= 0) { + return; + } + InitFlags(); + int32_t ping_pong_move_count = DivCeil(data_size_remain, max_ub_ping_pong_size); + + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_size = (move_idx == ping_pong_move_count -1) ? + data_size_remain - move_idx * max_ub_ping_pong_size : max_ub_ping_pong_size; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ? 
output_UB_T[0] : output_UB_T[1]; + WaitFlag(event_id); + CopyGmToUbuf(ub_buff_st, input + move_idx * max_ub_ping_pong_size, 1, + actual_move_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + int32_t move_num_offset = gm_out_offset + move_idx * max_ub_ping_pong_size; + CopyUbufToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset); + SetFlag(event_id); + } + EndFlagsAndBias(); + } + + FORCE_INLINE_AICORE void SecondStepParallelWithSplit( int32_t data_size_remain, int32_t cal_idx, + int32_t flag_idx, int32_t data_loop_idx) { + if (data_size_remain <= 0) { + return; + } + InitFlags(); + int32_t rank_per_core = rank_size / comm_npu_split; + int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; + for (int32_t index = 0; index < rank_per_core; index++) { + int32_t rank_idx_rot = (index + core_idx) % rank_per_core; + int32_t real_core_idx = core_rank_offset + rank_idx_rot; + int32_t before_other_rank_offset = data_loop_idx * comm_data_split * len_per_loop; + int32_t other_rank_offset = before_other_rank_offset + real_core_idx * m_per_rank * n0 + core_idx % comm_data_split * len_per_loop; + int32_t other_rank_buff_offset = flag_idx * gm_c_pingpong_size + other_rank_offset; + int32_t ping_pong_move_count = DivCeil(data_size_remain, max_ub_ping_pong_size); + + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_size = (move_idx == ping_pong_move_count -1) ? + data_size_remain - move_idx * max_ub_ping_pong_size : max_ub_ping_pong_size; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; + WaitFlag(event_id); + CopyGmToUbuf(ub_buff_st, buff[real_core_idx] + other_rank_buff_offset + move_idx * max_ub_ping_pong_size, 1, + actual_move_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + int32_t move_num_offset = gm_out_offset + move_idx * max_ub_ping_pong_size; + CopyUbufToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset + cal_idx * gm_c_pingpong_size); + SetFlag(event_id); + } + } + EndFlagsAndBias(); + } + + FORCE_INLINE_AICORE void FirstStepDivCore(int32_t data_len, int32_t offset) { + if (is_deterministic && rank_size >=4 && rank_size <= 8) { + return FirstStepInPeerMemTree(data_len, offset); + } + return FirstStepInPeerMemSeq(data_len, offset); + } + + FORCE_INLINE_AICORE void SecondStepSerial(int32_t data_size_remain, __gm__ T *input, + __gm__ T *output) + { + if (data_size_remain <= 0) { + return; + } + InitFlags(); + + int32_t offset = 0; + for (int32_t move_idx = 0; data_size_remain >= max_ub_ping_pong_size; ++move_idx) { + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub = (move_idx & 1) ? 
output_UB_T[0] : output_UB_T[1];
+ WaitFlag(event_id);
+ CopyGmToUbuf(ub, input + offset, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0);
+ SetFlag(event_id);
+ WaitFlag(event_id);
+ CopyUbufToGm(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0);
+ SetFlag(event_id);
+ data_size_remain -= max_ub_ping_pong_size;
+ offset += max_ub_ping_pong_size;
+ }
+ WaitFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID1);
+
+ if (data_size_remain > 0) {
+ CopyGmToUbuf(output_UB_T[0], input + offset, 1, (data_size_remain * sizeof(T) + 31) / 32, 0, 0);
+ SetFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID0);
+ if (ALIGN) {
+ CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0);
+ } else {
+ CopyUbufToGmAlignB16(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T), 0, 0);
+ }
+ }
+
+ if constexpr (HAVE_BIAS) {
+ SetFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID0);
+ SetAtomicNone();
+ PipeBarrier();
+ }
+ }
+
+ FORCE_INLINE_AICORE void ParallelWithSplit() {
+ ResetIpcFlags(3);
+ PipeBarrier();
+
+ for (int32_t cal_idx = 0; cal_idx < cal_count; ++cal_idx) {
+ uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT;
+ int32_t actual_loop_num = (cal_idx == cal_count - 1) ? core_loop - cal_idx * loop_num_per_comm :
+ loop_num_per_comm;
+ int32_t m_total = actual_loop_num * m0;
+ m_per_rank = DivCeil(m_total, rank_size);
+ int32_t m_in_rank = (rank * m_per_rank >= m_total) ? 0 :
+ ((rank + 1) * m_per_rank > m_total ? m_total - rank * m_per_rank : m_per_rank);
+
+ WaitEvent(flag_idx);
+
+ if (need_dequant) {
+ SetAndWaitAivSync(flag_idx);
+ fused_dequant_runner.RunDequantAllReduce(cal_idx);
+ }
+
+ if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+ SetAndWaitAivSync(flag_idx);
+ fused_pertoken_dequant_runner.RunDequantAllReduce(cal_idx);
+ }
+ SetAndWaitAivSync(flag_idx);
+
+ CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1);
+
+ StartBeforeFisrtStep(flag_idx);
+
+ int32_t rank_total = m_in_rank * n0;
+ int32_t rank_offset = rank * m_per_rank * n0;
+
+ int32_t rank_buff_offset = flag_idx * m0 * n0 * loop_num_per_comm + rank_offset;
+
+ int32_t len_per_core = rank_total / comm_data_split;
+ int32_t data_split_num = DivCeil(len_per_core, len_per_loop);
+ SetFlag(EVENT_ID0);
+ SetFlag(EVENT_ID1);
+ for (int loop_index = 0; loop_index < data_split_num; loop_index++) {
+ if (aiv_idx == 0 && core_idx < comm_data_split * comm_npu_split) {
+ int32_t before_core_offset = len_per_loop * comm_data_split * loop_index;
+ int32_t loop_total = rank_total - before_core_offset;
+ int32_t real_core_offset = core_idx % comm_data_split * len_per_loop;
+
+ int32_t m_in_core = (real_core_offset >= loop_total) ? 0 :
+ ((real_core_offset + len_per_loop) > loop_total ?
+ loop_total - real_core_offset : len_per_loop);
+
+ FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset);
+ }
+ }
+ WaitFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID1);
+ EndFirstStep(flag_idx);
+
+ CrossRankSyncV1(FLAG_ONE_IDX, cal_idx + 1);
+ SetAndWaitAivSync(flag_idx);
+
+ for (int loop_index = 0; loop_index < data_split_num; loop_index++) {
+ if (aiv_idx == 0 && core_idx < comm_data_split * comm_npu_split) {
+ int32_t before_core_offset = len_per_loop * comm_data_split * loop_index;
+ int32_t loop_total = rank_total - before_core_offset;
+ int32_t real_core_offset = core_idx % comm_data_split * len_per_loop;
+
+ int32_t m_in_core = (real_core_offset >= loop_total) ? 0 :
+ ((real_core_offset + len_per_loop) > loop_total ?
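+ // Clamp the per-core slice: a core whose offset lies past the remaining data gets 0,
+ // the core holding the tail gets the remainder, everyone else gets len_per_loop.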
+ loop_total - real_core_offset : len_per_loop); + + SecondStepParallelWithSplit(m_in_core, cal_idx, flag_idx, loop_index); + } + } + SetAndWaitAivSync(flag_idx); + + CrossRankSyncV2(FLAG_TWO_IDX, cal_idx + 1); + SetAndWaitAivSync(flag_idx); + SetAicSync(flag_idx); + } + ResetIpcFlags(3); + + if (aiv_idx == 0 && core_idx < rank_size) { + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); + } + } + + FORCE_INLINE_AICORE void DataCopySioRs(int32_t cal_idx_sio, int32_t len_per_rank) { + int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; + int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; + int32_t sio_core_idx = total_core_idx - core_count; + int32_t core_offset = sio_core_idx * len_per_core; + int32_t sio_peer_rank = rank ^ 1; + for (int32_t src_rank = rank % 2; src_rank < rank_size; src_rank +=2) { + int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; + FirstStepInPeerMem(len_per_core, buff[sio_peer_rank] + peer_offset, buff[rank] + peer_offset); + } + } + + FORCE_INLINE_AICORE void DataCopySioAg(int32_t cal_idx_sio, int32_t len_per_rank) { + int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; + int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; + int32_t sio_core_idx = total_core_idx - core_count; + int32_t core_offset = sio_core_idx * len_per_core; + int32_t sio_peer_rank = rank ^ 1; + for (int32_t src_rank = sio_peer_rank % 2; src_rank < rank_size; src_rank +=2) { + int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank; + int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; + SecondStepParallel(len_per_core, buff[sio_peer_rank] + peer_offset + core_offset, dst_offset); + } + int32_t local_offset = flag_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; + int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; + SecondStepParallel(len_per_core, buff[rank] + local_offset, dst_offset); + } + + FORCE_INLINE_AICORE void ParallelSio() { + ResetIpcFlags(3); + PipeBarrier(); + int32_t last_loop_num = core_loop - (cal_count -1) * loop_num_per_comm; + int32_t core_group = GetCoreGroup(); + for (int32_t cal_idx = 0; cal_idx < cal_count + 2; ++cal_idx) { + int32_t hccs_idx = cal_idx -1; + int32_t sio2_idx = cal_idx -2; + int32_t flag_idx_sio1 = cal_idx % BLOCK_COUNT_4; + int32_t flag_idx_hccs = hccs_idx % BLOCK_COUNT_4; + int32_t flag_idx_sio2 = sio2_idx % BLOCK_COUNT_4; + int32_t loop_num_hccs = hccs_idx == cal_count -1 ? 
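+ // ParallelSio runs a three-stage software pipeline: the SIO reduce-scatter works on
+ // iteration cal_idx, the HCCS reduce on cal_idx - 1, and the SIO all-gather on
+ // cal_idx - 2, which is why the loop runs cal_count + 2 times over BLOCK_COUNT_4 slots.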
last_loop_num : loop_num_per_comm; + + if (cal_idx < cal_count) { + WaitEvent(flag_idx_sio1); + } + + if (need_dequant) { + fused_dequant_runner.RunDequantAllReduce(cal_idx); + } + + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + + CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); + + StartBeforeFisrtStep(flag_idx_sio1); + + if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { + int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; + int32_t rank_offset = rank * size_per_rank; + int32_t rank_buff_offset = flag_idx_hccs * gm_c_pingpong_size + rank_offset; + int32_t size_per_core = size_per_rank / (comm_data_split); + + int32_t data_split_num = DivCeil(size_per_core, len_per_loop); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + for (int loop_index = 0; loop_index < data_split_num; loop_index++) { + int32_t before_core_offset = len_per_loop * comm_data_split * loop_index; + int32_t loop_total = size_per_rank -before_core_offset; + int32_t real_core_offset = core_idx % comm_data_split * len_per_loop; + + int32_t m_in_core = (real_core_offset >= loop_total) ? 0 : + ((real_core_offset + len_per_loop) > loop_total ? + loop_total -real_core_offset : len_per_loop); + + FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } + if (core_group == 1 && cal_idx < cal_count) { + int32_t loop_num_sio1 = cal_idx == cal_count -1 ? last_loop_num : loop_num_per_comm; + int32_t size_per_rank = loop_num_sio1 * m0 * n0 / rank_size; + DataCopySioRs(cal_idx, size_per_rank); + } + + EndFirstStep(flag_idx_sio1); + + CrossRankSyncV1(FLAG_ONE_IDX, cal_idx + 1); + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { + int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; + int32_t pipe_offset = flag_idx_hccs * gm_c_pingpong_size + other_rank * size_per_rank; + int32_t dst_offset = hccs_idx * gm_c_pingpong_size + other_rank * size_per_rank; + if ((other_rank % 2) == (rank % 2) && other_rank != rank) { + FirstStepInPeerMemTransLayout(size_per_rank, buff[other_rank] + pipe_offset, buff[rank] + pipe_offset, dst_offset); + } + } + if (core_group == 1 && cal_idx >= 2) { + int32_t loop_num_sio2 = sio2_idx == cal_count - 1 ? 
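+ // SIO traffic is exchanged with the die paired over SIO (rank ^ 1); striding source
+ // ranks by 2 keeps the SIO pair on the opposite parity while HCCS (above) handles
+ // same-parity peers.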
last_loop_num : loop_num_per_comm;
+ int32_t size_per_rank = loop_num_sio2 * m0 * n0 / rank_size;
+ DataCopySioAg(sio2_idx, size_per_rank);
+ }
+ SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4);
+ CrossRankSyncV2(FLAG_TWO_IDX, cal_idx + 1);
+
+ SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4);
+
+ if (cal_idx >= 2) {
+ SetAicSync(flag_idx_sio2);
+ }
+ }
+ ResetIpcFlags(3);
+
+ if (aiv_idx == 0 && core_idx < rank_size) {
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0);
+ }
+ }
+
+ FORCE_INLINE_AICORE void Serial() {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag);
+ WaitEvent(AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID);
+
+ FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID);
+ WaitEvent(AIV_FINISH_ALIGN_FLAG_ID);
+
+ if (need_dequant) {
+ serial_dequant_runner.Run();
+ }
+ if (aiv_idx == 1 && core_idx < rank_size) {
+ int32_t data_size = batch_size * m * n;
+ int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size - 1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16;
+ if (other_rank == rank) {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ZERO_IDX, tag);
+ } else {
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ZERO_IDX, tag);
+ PipeBarrier();
+ int32_t rank_buff_offset = rank * data_size_per_rank;
+ FirstStepInPeerMem(data_size_per_rank, buff[other_rank] + rank_buff_offset, buff[rank] + rank_buff_offset, true);
+ SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag);
+ }
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag * rank_size);
+ PipeBarrier();
+ int32_t data_size_in_other_rank = data_size_per_rank;
+ if (other_rank * data_size_in_other_rank >= data_size) {
+ data_size_in_other_rank = 0;
+ } else if ((other_rank + 1) * data_size_in_other_rank > data_size) {
+ data_size_in_other_rank = data_size - other_rank * data_size_per_rank;
+ }
+ int32_t other_rank_buff_offset = other_rank * data_size_per_rank;
+ SecondStepSerial(data_size_in_other_rank, buff[other_rank] + other_rank_buff_offset, gm_out + other_rank_buff_offset);
+ }
+ }
+
+ FORCE_INLINE_AICORE void SerialWithSplit() {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag);
+ WaitEvent(AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID);
+
+ FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID);
+ WaitEvent(AIV_FINISH_ALIGN_FLAG_ID);
+
+ if (need_dequant) {
+ serial_dequant_runner.Run();
+ }
+
+ int32_t data_size = batch_size * m * n;
+ int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size - 1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16;
+
+ int32_t use_core_count = comm_npu_split * comm_data_split;
+ int32_t rank_buff_offset = rank * data_size_per_rank;
+
+ int32_t len_per_core = data_size_per_rank / comm_data_split;
+ int32_t data_split_num = DivCeil(len_per_core, len_per_loop);
+
+ SetAndWaitAivSync(0);
+ CrossRankSyncV3(MAX_FLAG_COUNT + FLAG_ZERO_IDX, tag);
+ StartBeforeFisrtStep(0);
+
+ SetFlag(EVENT_ID0);
+ SetFlag(EVENT_ID1);
+ for (int loop_index = 0; loop_index < data_split_num; loop_index++) {
+ if (aiv_idx == 0 && core_idx < comm_data_split * comm_npu_split) {
+ int32_t before_core_offset = len_per_loop * comm_data_split * loop_index;
+ int32_t loop_total = data_size_per_rank - before_core_offset;
+ int32_t real_core_offset = core_idx % comm_data_split * len_per_loop;
+
+ int32_t m_in_core = (real_core_offset >= loop_total) ? 0 :
+ ((real_core_offset + len_per_loop) > loop_total ?
+ loop_total - real_core_offset : len_per_loop);
+ FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset);
+ }
+ }
+ WaitFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID1);
+
+ EndFirstStep(0);
+
+ CrossRankSyncV4(MAX_FLAG_COUNT + FLAG_ONE_IDX, tag);
+ SetAndWaitAivSync(0);
+
+ if (aiv_idx == 0 && core_idx < rank_size) {
+ PipeBarrier();
+ int32_t data_size_in_other_rank = data_size_per_rank;
+ if (other_rank * data_size_in_other_rank >= data_size) {
+ data_size_in_other_rank = 0;
+ } else if ((other_rank + 1) * data_size_in_other_rank > data_size) {
+ data_size_in_other_rank = data_size - other_rank * data_size_per_rank;
+ }
+ int32_t other_rank_buff_offset = other_rank * data_size_per_rank;
+ SecondStepSerial(data_size_in_other_rank, buff[other_rank] + other_rank_buff_offset, gm_out + other_rank_buff_offset);
+ }
+ }
+
+ FORCE_INLINE_AICORE void Run()
+ {
+ preprocessor.Run();
+
+ if constexpr (HAVE_BIAS) {
+ add_bias_runner.Run();
+ }
+
+ if (withSerialMode) {
+ if (is_deterministic) {
+ SerialWithSplit();
+ } else {
+ Serial();
+ }
+ } else {
+ ParallelWithSplit();
+ }
+
+ PipeBarrier();
+ postprocessor.Run();
+ PipeBarrier();
+
+ if (withSerialMode && dequant_granularity == QuantGranularity::PER_TOKEN) {
+ serial_pertokrn_dequant_runner.Run();
+ }
+ }
+
+public:
+ using CocCommBase::SetAicSync;
+ using CocCommBase::SetAndWaitAivSync;
+ using CocCommBase::SetBuffFlag;
+ using CocCommBase::SetBuffFlagByAdd;
+ using CocCommBase::CheckBuffFlag;
+ using CocCommBase::FillZero;
+ using CocCommBase::FirstStepInPeerMem;
+ using CocCommBase::FirstStepInPeerMemSeq;
+ using CocCommBase::FirstStepInPeerMemTree;
+ using CocCommBase::FirstStepInPeerMemTransLayout;
+ using CocCommBase::CopyUbufToGmTransLayout;
+ using CocCommBase::ResetIpcFlags;
+ using CocCommBase::CrossRankSyncV1;
+ using CocCommBase::CrossRankSyncV2;
+ using CocCommBase::CrossRankSyncV3;
+ using CocCommBase::CrossRankSyncV4;
+ using CocCommBase::buff;
+ using CocCommBase::gm_out;
+ using CocCommBase::ctrl_flags_UB;
+ using CocCommBase::output_UB_T;
+ using CocCommBase::batch_size;
+ using CocCommBase::m;
+ using CocCommBase::k;
+ using CocCommBase::n;
+ using CocCommBase::m0;
+ using CocCommBase::k0;
+ using CocCommBase::n0;
+ using CocCommBase::m_loop;
+ using CocCommBase::n_loop;
+ using CocCommBase::k_loop;
+ using CocCommBase::core_loop;
+ using CocCommBase::core_idx;
+ using CocCommBase::core_num;
+ using CocCommBase::rank;
+ using CocCommBase::rank_size;
+ using CocCommBase::tiling_key;
+ using CocCommBase::swizzl_count;
+ using CocCommBase::swizzl_direct;
+ using CocCommBase::trans_a;
+ using CocCommBase::trans_b;
+ using CocCommBase::is_91093;
+ using CocCommBase::p_value;
+ using CocCommBase::aiv_idx;
+ using CocCommBase::other_rank;
+ using CocCommBase::max_ub_single_dma_size;
+ using CocCommBase::max_ub_ping_pong_size;
+ using CocCommBase::withSerialMode;
+ using CocCommBase::tag;
+ using CocCommBase::loop_num_per_comm;
+ using CocCommBase::gm_c_pingpong_size;
+ using CocCommBase::dequant_granularity;
+ using CocCommBase::dequant_group_size;
+ using CocCommBase::quant_granularity;
+ using CocCommBase::quant_group_size;
+ using CocCommBase::workspace_info;
+ using CocCommBase::comm_npu_split;
+ using CocCommBase::comm_data_split;
+ using CocCommBase::len_per_loop;
+ using CocCommBase::core_count;
+ using CocCommBase::weight_nz;
+ using
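+ // Members of the dependent base CocCommBase are not found by unqualified lookup
+ // inside this class template, so they are re-exported here one by one.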
CocCommBase::is_deterministic; + using CocCommBase::flag_offset; + int32_t cal_count; + int32_t m_per_rank; + int32_t total_core_idx; + Preprocessor preprocessor; + Postprocessor postprocessor; + MatmulAllReduceBiasAdder add_bias_runner; + SerialDequantRunner serial_dequant_runner; + FusedDequantRunner fused_dequant_runner; + FusedPerTokenDequantRunner fused_pertoken_dequant_runner; + SerialPerTokenDequantRunner serial_pertokrn_dequant_runner; + bool need_dequant; +}; + +constexpr int32_t NO_BIAS_MASK1 = 0b000000 | 0b100000 | 0b010000 | 0b110000 | 0b001000 | 0b101000 | 0b011000 | + 0b111000 | 0b000100 | 0b100100 | 0b010100 | 0b110100 | 0b001100 | 0b101100 | + 0b011100 | 0b111100; +constexpr int32_t BIAS_MASK1 = 0b000010 | 0b100010 | 0b010010 | 0b110010 | 0b001010 | 0b101010 | 0b011010 | 0b111010 | + 0b000110 | 0b100110 | 0b010110 | 0b110110 | 0b001110 | 0b101110 | 0b011110 | 0b111110; + +template +FORCE_INLINE_AICORE void RunAllReduceAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) { + AllReduce allreduce_align_16_without_bias; + AllReduce allreduce_align_16_with_bias; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + allreduce_align_16_without_bias.SetArgs(COC_ARGS_CALL()); + allreduce_align_16_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + allreduce_align_16_with_bias.SetArgs(COC_ARGS_CALL()); + allreduce_align_16_with_bias.Run(); + break; + default : + break; + } +} + +template +FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) { + AllReduce allreduce_align_16_without_bias; + AllReduce allreduce_align_16_with_bias; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + allreduce_align_16_without_bias.SetArgs(COC_ARGS_CALL()); + allreduce_align_16_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + allreduce_align_16_with_bias.SetArgs(COC_ARGS_CALL()); + allreduce_align_16_with_bias.Run(); + break; + default : + break; + } +} + +template +inline __aicore__ void CocMatmullReduceAiv(COC_ARGS_FUN(T)) +{ + AllReduce allreduce_align_16_without_bias; + AllReduce allreduce_align_16_with_bias; + AllReduce allreduce_align_16_without_bias; + AllReduce allreduce_align_16_with_bias; + + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + int64_t batch_size = cocTilingData->batchSize; + int32_t m = cocTilingData->m; + int32_t n = cocTilingData->n; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t rank_size = 
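+ // The dispatch below picks the 16-aligned kernels when the parallel path sees
+ // n % 16 == 0, or the serial path sees the total element count divisible by
+ // rank_size * 16; otherwise the unaligned variants run.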
cocTilingData->rankSize; + int32_t withSerialMode = cocTilingData->withSerialMode; + if ((withSerialMode == 0 && n % BLOCK_SIZE_16 == 0) || (withSerialMode && (batch_size * m * n) % (rank_size * BLOCK_SIZE_16) == 0)) { + RunAllReduceAlign16(tiling_key, COC_ARGS_CALL()); + } else { + RunAllReduceUnAlign16(tiling_key, COC_ARGS_CALL()); + } + PipeBarrier(); +} +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_comm_base.cce b/comm/lcal/src/kernels/coc_comm_base.cce new file mode 100644 index 00000000..2d4401d0 --- /dev/null +++ b/comm/lcal/src/kernels/coc_comm_base.cce @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef LCAL_COC_COMM_BASE_H +#define LCAL_COC_COMM_BASE_H + +#ifdef __DAV_C220_VEC__ + +#include "coc_internal.cce" +#include "coc_add_bias_runner.cce" +#include "coc_preprocessor.cce" +#include "coc_postprocessor.cce" +#include "tiling_args.h" +#include "lcoc_workspace.h" +template +class CocCommBase { +public: + __aicore__ explicit CocCommBase(){}; + + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) + { + CoCBuffAddrAndArgs coc_buff_and_args(COC_ARGS_CALL()); + for (int i=0; igm_out = gm_out; + max_ub_ping_pong_size = max_ub_ping_pong_size / n0 * n0; + loop_num_per_comm = p_value * get_block_num(); + gm_c_pingpong_size = m0 * n0 * loop_num_per_comm; + } + + FORCE_INLINE_AICORE void SetFromParam(__gm__ uint8_t *para_gm) + { + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + auto quantInfo = ¶->quantInfo; + auto twoDimTPInfo = ¶->twoDimTPInfo; + batch_size = cocTilingData->batchSize; + m = cocTilingData->m; + k = cocTilingData->k; + n = cocTilingData->n; + + m0 = cocTilingData->m0; + k0 = cocTilingData->k0; + n0 = cocTilingData->n0; + + m_loop = cocTilingData->m_loop; + k_loop = cocTilingData->k_loop; + n_loop = cocTilingData->n_loop; + + core_loop = cocTilingData->coreLoop; + swizzl_count = cocTilingData->swizzlCount; + tiling_key = cocTilingData->tilingKey; + rank = cocTilingData->rank; + rank_size = cocTilingData->rankSize; + buffer_size = cocTilingData->bufferSize; + flag_offset = buffer_size * 1024 * 1024 / sizeof(int32_t); + p_value = cocTilingData->pValue; + max_ub_single_dma_size = cocTilingData->ubMoveNum; + withSerialMode = cocTilingData->withSerialMode; + tag = cocTilingData->tag; + comm_npu_split = cocTilingData->commNpuSplit; + comm_data_split = cocTilingData->commDataSplit; + comm_direct = cocTilingData->commDirect; + len_per_loop = cocTilingData->lenPerLoop; + extra_ub_move_num = cocTilingData->extraUbMoveNum; + extra_comm_npu_split = cocTilingData->extraCommNpuSplit; + extra_comm_data_split = cocTilingData->extraCommDataSplit; + extra_comm_direct = cocTilingData->extraCommDirect; + extra_len_per_loop = cocTilingData->extraLenPerLoop; + is_91093 = cocTilingData->is91093; + core_count = comm_npu_split * comm_data_split; + dequant_granularity = static_cast(quantInfo->dequantGranularity); + dequant_group_size 
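+ // tiling_key packs the boolean kernel options into single bits; the masks decoded
+ // just below (SWIZZL_MASK, TRANS_A_MASK, TRANS_B_MASK, INT8_MASK) are defined in
+ // coc_const_args.cce.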
= quantInfo->dequantGroupSize;
+ quant_granularity = static_cast<QuantGranularity>(quantInfo->quantGranularity);
+ quant_group_size = quantInfo->quantGroupSize;
+ swizzl_direct = (tiling_key & SWIZZL_MASK) ? true : false;
+ trans_a = (tiling_key & TRANS_A_MASK) ? true : false;
+ trans_b = (tiling_key & TRANS_B_MASK) ? true : false;
+ is_int8 = (tiling_key & INT8_MASK) ? true : false;
+
+ ag_dim = twoDimTPInfo->agDim;
+ rs_dim = twoDimTPInfo->rsDim;
+ inner_dim_is_Ag = twoDimTPInfo->innerDimIsAg;
+ weight_nz = para->weightNz;
+ }
+
+ FORCE_INLINE_AICORE void SetWorkspace(__gm__ uint8_t *gm_workspace)
+ {
+ int32_t m_align, k_align, n_align;
+ if (is_int8) {
+ m_align = Block512B<int8_t>::AlignUp(m);
+ k_align = Block512B<int8_t>::AlignUp(k);
+ n_align = Block512B<int8_t>::AlignUp(n);
+ } else {
+ m_align = Block512B<T>::AlignUp(m);
+ k_align = Block512B<T>::AlignUp(k);
+ n_align = Block512B<T>::AlignUp(n);
+ }
+ int32_t aligned_a, aligned_b;
+ AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b);
+
+ bool has_a_align = IsQuant(quant_granularity) || aligned_a;
+ bool has_b_align = IsQuant(dequant_granularity) && !is_int8 || aligned_b;
+ bool has_accum = IsQuant(dequant_granularity) && is_int8 && (std::is_same<T, half>::value || std::is_same<T, bfloat16_t>::value);
+ bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR);
+ bool hasFormatDequantScale = (dequant_granularity == QuantGranularity::PER_CHANNEL);
+
+ if (weight_nz) {
+ aligned_b = 0;
+ has_b_align = false;
+ }
+ workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align,
+ trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param,
+ hasFormatDequantScale, is_deterministic, false, false, 0, 0, 0);
+
+ }
+
+ FORCE_INLINE_AICORE void SetAicSync(uint64_t flag_idx)
+ {
+ FFTSCrossCoreSync(2, flag_idx);
+ }
+
+ FORCE_INLINE_AICORE void SetAndWaitAivSync(uint64_t flag_idx, int32_t pipe_depth = 2)
+ {
+ FFTSCrossCoreSync(2, flag_idx + pipe_depth);
+ WaitEvent(flag_idx + pipe_depth);
+ }
+
+ FORCE_INLINE_AICORE void SetBuffFlag(__ubuf__ int32_t *ctrl_flags_UB, \
+ __gm__ int32_t *buff, int32_t flag)
+ {
+ *ctrl_flags_UB = flag;
+ SetFlag(EVENT_ID2);
+ WaitFlag(EVENT_ID2);
+ CopyUbufToGmAlignB16(buff, ctrl_flags_UB, 1, sizeof(int32_t), 0, 0);
+ }
+
+ FORCE_INLINE_AICORE void SetBuffFlagByAdd(__ubuf__ int32_t *ctrl_flags_UB, \
+ __gm__ int32_t *buff, int32_t flag)
+ {
+ PipeBarrier();
+ *ctrl_flags_UB = flag;
+ PipeBarrier();
+ SetAtomicAdd();
+ PipeBarrier();
+ CopyUbufToGmAlignB16(buff, ctrl_flags_UB, 1, sizeof(int32_t), 0, 0);
+ PipeBarrier();
+ SetAtomicNone();
+ PipeBarrier();
+ }
+
+ inline __aicore__ void call_dcci(__gm__ void *__restrict__ gm_ptr)
+ {
+ __asm__ __volatile__("");
+ dcci(gm_ptr, SINGLE_CACHE_LINE);
+ __asm__ __volatile__("");
+ }
+
+ FORCE_INLINE_AICORE void CheckBuffFlag(__ubuf__ int32_t *ctrl_flags_UB, \
+ __gm__ int32_t *buff, int32_t flag)
+ {
+ SetFlag(EVENT_ID1);
+ WaitFlag(EVENT_ID1);
+ while (true) {
+ // poll the flag from GM into UB before comparing
+ CopyGmToUbuf(ctrl_flags_UB, buff, 1, 1, 0, 0);
+ SetFlag(EVENT_ID3);
+ WaitFlag(EVENT_ID3);
+ if (*ctrl_flags_UB == flag) {
+ break;
+ }
+ }
+ }
+
+ FORCE_INLINE_AICORE void CrossRankSyncV1(int32_t flag_idx, int32_t flag_data)
+ {
+ if (aiv_idx == 0 && core_idx == rank) {
+ SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, FLAG_VALUE);
+ } else if (aiv_idx == 0 && core_idx < rank_size) {
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx,
+ FLAG_VALUE * flag_data);
+ }
+ }
+
+ FORCE_INLINE_AICORE void CrossRankSyncV2(int32_t flag_idx, int32_t flag_data)
+ {
+ if (aiv_idx == 0 && core_idx < rank_size) {
+ SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, FLAG_VALUE);
+ }
+ if (aiv_idx == 0 && core_idx == rank) {
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx,
+ FLAG_VALUE * rank_size * flag_data);
+ }
+ }
+
+ FORCE_INLINE_AICORE void CrossRankSyncV3(int32_t flag_idx, int32_t flag_data)
+ {
+ if (aiv_idx == 0 && core_idx == rank) {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data);
+ } else if (aiv_idx == 0 && core_idx < rank_size) {
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx,
+ flag_data);
+ }
+ }
+
+ FORCE_INLINE_AICORE void CrossRankSyncV4(int32_t flag_idx, int32_t flag_data)
+ {
+ if (aiv_idx == 0 && core_idx < rank_size) {
+ if (core_idx != rank) {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data);
+ }
+ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, rank_size * flag_data);
+ }
+ }
+
+ FORCE_INLINE_AICORE void ResetIpcFlags(int32_t num_flags)
+ {
+ for (int32_t idx = 0; idx < num_flags; ++idx) {
+ if (core_idx == 0 && aiv_idx == 0) {
+ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + idx, 0);
+ }
+ }
+ }
+
+ FORCE_INLINE_AICORE void FillZero(int32_t data_size_remain, __gm__ T *output, \
+ int32_t total_aiv, int32_t aiv_idx_in_clean)
+ {
+ int32_t repeat_time = 128;
+ int32_t num_per_call = repeat_time * 128;
+
+ if constexpr (std::is_same<T, half>::value) {
+ VectorDup(output_UB_T[0], static_cast<half>(0), repeat_time, 1, 8);
+ }
+ else if constexpr (std::is_same<T, bfloat16_t>::value) {
+ VectorDup(output_UB_T[0], static_cast<bfloat16_t>(0), repeat_time, 1, 8);
+ }
+ SetFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID0);
+
+ data_size_remain = DivCeil(data_size_remain, total_aiv);
+ data_size_remain = (data_size_remain + 15) / 16 * 16;
+ int32_t offset = aiv_idx_in_clean * data_size_remain;
+ while (data_size_remain > 0) {
+ int32_t data_size = data_size_remain < num_per_call ? data_size_remain : num_per_call;
+ CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size * sizeof(T) / 32, 0, 0);
+ data_size_remain -= data_size;
+ offset += data_size;
+ }
+ }
+
+ FORCE_INLINE_AICORE void CopyUbufToGmTransLayout(__ubuf__ T* ub_buff_st, int32_t actual_move_size, int64_t move_num_offset) {
+ auto ub_buff = ub_buff_st;
+ int32_t left_m = actual_move_size / n0;
+ while (left_m > 0) {
+ int32_t loop_idx = move_num_offset / (m0 * n0);
+ int64_t m_idx, n_idx;
+ GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
+ int32_t actual_m = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
+ int32_t actual_n = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+ int32_t m_offset = (move_num_offset % (m0 * n0)) / n0;
+ int32_t actual_move_m = m0 < m_offset + left_m ? m0 - m_offset : left_m;
+ if (m_offset < actual_m) {
+ actual_move_m = actual_m < m_offset + left_m ?
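+ // CopyUbufToGmTransLayout walks the m0 x n0 tiles in swizzled order (GetBlockIdx
+ // inverts the swizzle) and scatters each tile row-major into the [m, n] output,
+ // clipping partial tiles at the m/n edges.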
actual_m - m_offset : left_m; + int64_t out_buff_offset = (m_idx * m0 + m_offset) * n + n_idx * n0; + CopyUbufToGmUnknown(n % BLOCK_SIZE_16 == 0, gm_out + out_buff_offset, ub_buff, actual_move_m, actual_n * sizeof(T), + (n0 - actual_n) * sizeof(T) / 32, (n - actual_n) * sizeof(T)); + } + left_m -= actual_move_m; + move_num_offset += actual_move_m * n0; + ub_buff += actual_move_m * n0; + } + } + + FORCE_INLINE_AICORE void CopyGmToGm(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { + auto ub0 = output_UB_T[0]; + auto ub1 = output_UB_T[1]; + int32_t interm_offset = 0; + for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { + uint32_t data_size = interm_offset + max_ub_ping_pong_size < copy_size ? max_ub_ping_pong_size : copy_size - interm_offset; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub = (m_idx & 1) ? ub0 : ub1; + WaitFlag(event_id); + CopyGmToUbuf(ub, gm_src + interm_offset, 1, data_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + CopyUbufToGm(gm_dst + interm_offset, ub, 1, data_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + interm_offset += data_size; + } + } + + FORCE_INLINE_AICORE void FirstStepInPeerMemSeq(int32_t data_size_remain, int32_t core_buff_offset) { + if (data_size_remain <= 0) { + return; + } + auto ub0 = output_UB_T[0]; + auto ub1 = output_UB_T[1]; + int32_t rank_per_core = (rank_size) / comm_npu_split; + int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; + + for (int32_t rank_idx = 0; rank_idx < rank_per_core; ++rank_idx){ + int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core; + int32_t m_rank_idx = core_rank_offset + rank_idx_rot; + if (m_rank_idx == rank) { + continue; + } + if (is_91093 && (m_rank_idx % 2) != (rank % 2)) { + continue; + } + CopyGmToGm(buff[m_rank_idx] + core_buff_offset, buff[rank] + core_buff_offset, data_size_remain); + } + } + + FORCE_INLINE_AICORE void FirstStepInPeerMemTree(int32_t data_size_remain, int32_t core_buff_offset) { + if (data_size_remain <= 0) { + return; + } + int32_t rank_per_core = (rank_size) / comm_npu_split; + int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; + + __gm__ T* gm_reducebuf = reinterpret_cast<__gm__ T *>(workspace_info.gm_reducebuf) + core_idx * len_per_loop * rank_size / 2; + + SetAtomicNone(); + int32_t rank_idx = 0; + int32_t turn_atomic_step = rank_size / 2 - 1; + for (int32_t visited = 0; visited < rank_size - 1; visited++) { + if (visited == turn_atomic_step) { + SetAtomicAdd(); + } + int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core; + if (rank_idx_rot == rank) { + rank_idx++; + rank_idx_rot = (rank_idx + core_idx) % rank_per_core; + } + if (is_91093 && (rank_idx_rot % 2) != (rank % 2)) { + continue; + } + + auto gm_interm = gm_reducebuf + (visited % turn_atomic_step) * len_per_loop; + if (visited == rank_size - 2) { + gm_interm = buff[rank] + core_buff_offset; + } + auto gm_peer = buff[rank_idx_rot] + core_buff_offset; + CopyGmToGm(gm_peer, gm_interm, data_size_remain); + rank_idx++; + } + if (rank_size == 8) { + CopyGmToGm(gm_reducebuf + 1 * len_per_loop, buff[rank] + core_buff_offset, data_size_remain); + CopyGmToGm(gm_reducebuf + 1 * len_per_loop, gm_reducebuf, data_size_remain); + } + if (rank_size >= 4) { + CopyGmToGm(gm_reducebuf, buff[rank] + core_buff_offset, data_size_remain); + } + } + + FORCE_INLINE_AICORE void FirstStepInPeerMem(int32_t data_size_remain, __gm__ T *input, __gm__ T *output, bool atomic_add = false) { + if (data_size_remain <= 0) { + return; + 
}
+ if (atomic_add) {
+ SetAtomicAdd();
+ PipeBarrier();
+ }
+ int32_t offset = 0;
+ SetFlag(EVENT_ID0);
+ SetFlag(EVENT_ID1);
+ CopyGmToGm(input, output, data_size_remain);
+ WaitFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID1);
+ if (atomic_add) {
+ SetFlag(EVENT_ID0);
+ SetFlag(EVENT_ID1);
+ SetAtomicNone();
+ PipeBarrier();
+ }
+ }
+
+public:
+ __gm__ T *buff[LCAL_MAX_RANK_SIZE];
+ __gm__ T *gm_out;
+ __ubuf__ int32_t *ctrl_flags_UB = (__ubuf__ int32_t *)(0);
+ __ubuf__ T *output_UB_T[2] = {(__ubuf__ T *)(32), (__ubuf__ T *)(97440)};
+
+ int32_t batch_size;
+ int32_t m;
+ int32_t k;
+ int32_t n;
+ int32_t m0;
+ int32_t k0;
+ int32_t n0;
+
+ int32_t m_loop;
+ int32_t n_loop;
+ int32_t k_loop;
+ int32_t core_loop;
+ int32_t core_idx;
+ int32_t real_core_idx;
+
+ int32_t rank;
+ int32_t rank_size;
+ int32_t buffer_size;
+ int32_t flag_offset;
+
+ int32_t tiling_key;
+ int32_t swizzl_count;
+ int32_t swizzl_direct;
+ bool trans_a;
+ bool trans_b;
+ bool is_int8;
+ bool is_91093;
+ int32_t p_value;
+
+ int32_t aiv_idx;
+ int32_t other_rank;
+ int32_t core_num;
+ int32_t max_ub_single_dma_size;
+ int32_t max_ub_ping_pong_size;
+ int32_t loop_num_per_comm;
+ int32_t gm_c_pingpong_size;
+ int32_t withSerialMode;
+ int32_t tag;
+ int32_t comm_npu_split;
+ int32_t comm_data_split;
+ int32_t comm_direct;
+ int32_t len_per_loop;
+ int32_t core_count;
+
+ int32_t extra_ub_move_num;
+ int32_t extra_comm_npu_split;
+ int32_t extra_comm_data_split;
+ int32_t extra_comm_direct;
+ int32_t extra_len_per_loop;
+ bool is_deterministic;
+
+ QuantGranularity dequant_granularity;
+ int32_t dequant_group_size;
+ QuantGranularity quant_granularity;
+ int32_t quant_group_size;
+
+ LcalWorkspaceInfo workspace_info;
+
+ int32_t ag_dim;
+ int32_t rs_dim;
+ bool inner_dim_is_Ag;
+ bool weight_nz{false};
+};
+
+#endif
+#endif
\ No newline at end of file
diff --git a/comm/lcal/src/kernels/coc_const_args.cce b/comm/lcal/src/kernels/coc_const_args.cce
new file mode 100644
index 00000000..ea09aec4
--- /dev/null
+++ b/comm/lcal/src/kernels/coc_const_args.cce
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef LCAL_COC_CONST_ARGS_H
+#define LCAL_COC_CONST_ARGS_H
+#include <cstdint>
+#include "kernel_operator.h"
+using namespace AscendC;
+#ifndef FORCE_INLINE_AICORE
+#define FORCE_INLINE_AICORE inline __attribute__((always_inline)) __aicore__
+constexpr int32_t BLOCK_SIZE = 16;
+constexpr int LCAL_MAX_RANK_SIZE = 32;
+
+struct ExtraFlag {
+ static constexpr uint32_t RDMA = 1;
+ static constexpr uint32_t TOPO_910B2C = 1 << 1;
+ static constexpr uint32_t TOPO_910_93 = 1 << 2;
+ static constexpr uint32_t DETERMINISTIC = 1 << 3;
+ static constexpr uint32_t QUANT_FP16 = 1 << 4;
+ static constexpr uint32_t QUANT_FP32 = 1 << 5;
+};
+#endif
+constexpr int32_t AIV_FINISH_ALIGN_FLAG_ID = 8;
+constexpr int32_t AIC_FINISH_MATMUL_FLAG_ID = 9;
+constexpr int32_t AIV_FINISH_ADD_BIAS_FLAG_ID = 10;
+constexpr int32_t AIV_FINISH_DEQUANT_FLAG_ID = 11;
+constexpr int32_t AIC_WAIT_AIV_FINISH_ALIGN_FLAG_ID = 12;
+constexpr int32_t AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID = 13;
+
+constexpr int32_t A3_DIE_NUM = 2;
+constexpr int32_t BLOCK_SIZE_16 = 16;
+constexpr int32_t BLOCK_SIZE_32 = 32;
+constexpr int32_t SWIZZL_MASK = 0b100000;
+constexpr int32_t TRANS_A_MASK = 0b010000;
+constexpr int32_t TRANS_B_MASK = 0b001000;
+constexpr int32_t INT8_MASK = 0b000100;
+constexpr int32_t BIAS_MASK = 0b000010;
+constexpr int32_t QUANT_MASK = 0x00FF0000;
+constexpr int32_t QUANT_SHIFT = 16;
+constexpr int32_t MAX_BLOCK_COUNT = 2;
+constexpr int32_t BLOCK_COUNT_3 = 3;
+constexpr int32_t BLOCK_COUNT_4 = 4;
+constexpr int32_t L0AB_PINGPONG_BUFFER_LEN = 16384;
+constexpr int32_t CUBE_MATRIX_SIZE = 256;
+constexpr int32_t L1_PINGPONG_BUFFER_LEN = 131072;
+constexpr int32_t MAX_CORE_NUM = 25;
+constexpr int32_t MAX_UB_BUFF = 196608;
+constexpr int32_t ADD_REPEAT_TIME = 4;
+constexpr int32_t FLAG_ZERO_IDX = 0;
+constexpr int32_t FLAG_ONE_IDX = 1;
+constexpr int32_t FLAG_TWO_IDX = 2;
+constexpr int32_t FLAG_ADD_IDX = 3;
+constexpr int32_t MAX_FLAG_COUNT = 3 + ADD_REPEAT_TIME * 2;
+constexpr int32_t FLAG_VALUE = 1;
+
+constexpr int32_t VEC_BLOCK_PER_REPEAT = 8;
+constexpr uint8_t REPEAT_PER_LOOP = 255;
+constexpr uint32_t PPMATMUL_RUN_PURE_MATMUL = 1;
+constexpr uint32_t PPMATMUL_RUN_MATMUL_ALLREDUCE = 2;
+constexpr int32_t LCAL_2DTP_C_OFFSET = 100 * 1024 * 1024 / sizeof(half);
+constexpr uint32_t PPMATMUL_RUN_ALL_GATHER_MATMUL_REDUCE_SCATTER = 6;
+constexpr int32_t HCCS_TOTAL_CORE_NUM = 8;
+constexpr int32_t SIO_TOTAL_CORE_NUM = 8;
+constexpr uint64_t WORKSPACE_REDUCE_SIZE = 4000000;
+constexpr int32_t TWOD_DATA_SPLIT_DEFAULT = 2;
+constexpr int32_t TWOD_LEN_PER_LOOP_DEFAULT = 5120;
+
+constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024;
+constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024;
+constexpr int32_t FLAG_OFFSET = (LCAL_BUFF_BYTES - FLAG_BUFF_BYTES) / sizeof(int32_t);
+
+enum QuantGranularity : int {
+ QUANT_GRANULARITY_UNDEFINED = -1,
+ PER_TENSOR = 0,
+ PER_CHANNEL = 1,
+ PER_GROUP = 2,
+ PER_TOKEN = 3,
+ FLOAT32_SCALE_PER_CHANNEL = 4,
+ QUANT_GRANULARITY_MAX = 5,
+};
+
+template <typename T, size_t SIZE>
+struct BaseBlock {
+ static_assert((SIZE & (SIZE - 1)) == 0, "Invalid block size");
+ static constexpr size_t size = SIZE / sizeof(T);
+
+ static FORCE_INLINE_AICORE size_t Count(size_t len)
+ {
+ return (len + size - 1) / size;
+ }
+
+ static FORCE_INLINE_AICORE bool IsAligned(size_t len)
+ {
+ return len % size == 0;
+ }
+
+ static FORCE_INLINE_AICORE size_t AlignUp(size_t len)
+ {
+ return (len + size - 1) & ~(size - 1);
+ }
+
+ static FORCE_INLINE_AICORE size_t AlignDown(size_t len)
+ {
+ return len & ~(size - 1);
+ }
+};
+
+template <typename T>
+using Block32B = BaseBlock<T, 32>;
+
+template <typename T>
+using Block256B = BaseBlock<T, 256>;
+
+template <typename T>
+using Block512B = BaseBlock<T, 512>;
+
+template <typename T>
+struct CoCCommArgs {
+ int rank;
+ int localRank;
+ int rankSize;
+ int localRankSize;
+ uint32_t extraFlag;
+ __gm__ T *peerMems[LCAL_MAX_RANK_SIZE];
+ int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE];
+};
+
+#endif
diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/comm/lcal/src/kernels/coc_postprocessor.cce
new file mode 100644
index 00000000..154c91b1
--- /dev/null
+++ b/comm/lcal/src/kernels/coc_postprocessor.cce
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef __COC_POSTPROCESSOR__
+#define __COC_POSTPROCESSOR__
+
+#ifdef __DAV_C220_VEC__
+
+#include <cmath>
+#include "coc_internal.cce"
+#include "kernel_operator.h"
+#include "tiling_args.h"
+using namespace AscendC;
+
+constexpr int32_t BUFFER_NUM = 1;
+constexpr int32_t NUM_PER_REP_FP32 = 64;
+constexpr int32_t NUM_PER_BLK_FP32 = 8;
+constexpr float MINUS_HALF = -0.5;
+constexpr float ZERO = 0;
+constexpr float ONE = 1;
+
+template <typename T_in, typename T_out>
+class RMSNormprocessor {
+public:
+ __aicore__ explicit RMSNormprocessor() = default;
+ FORCE_INLINE_AICORE void SetArgs(__gm__ uint8_t *gm_in, __gm__ uint8_t *gm_out, __gm__ uint8_t *gm_gamma,
+ uint32_t m, uint32_t n)
+ {
+ this->rmsnorm_in = reinterpret_cast<__gm__ T_in *>(gm_out);
+ this->rmsnorm_gamma = reinterpret_cast<__gm__ T_out *>(gm_gamma);
+ this->rmsnorm_out = reinterpret_cast<__gm__ T_out *>(gm_out);
+ this->m = m;
+ this->n = n;
+ }
+
+ struct UBufConfig {
+ int64_t global_subblock_idx;
+ int64_t total_subblock;
+ __ubuf__ half *gamma;
+ __ubuf__ half *fp16_0;
+ __ubuf__ float *fp32_0;
+ __ubuf__ float *sqx0;
+ __ubuf__ float *sum_tmp0;
+ __ubuf__ float *sum0;
+ __ubuf__ float *fp32_1;
+ __ubuf__ half *fp16_1;
+ __ubuf__ float *sqx1;
+ __ubuf__ float *sum_tmp1;
+ __ubuf__ float *sum1;
+ float epsilon;
+ bool ping;
+ };
+
+ FORCE_INLINE_AICORE UBufConfig InitializeUBufConfig()
+ {
+ UBufConfig config;
+ config.global_subblock_idx = AscendC::GetBlockIdx();
+ config.total_subblock = AscendC::GetBlockNum() * AscendC::GetTaskRation();
+
+ config.gamma = (__ubuf__ half *)get_imm(0);
+ config.fp16_0 = (__ubuf__ half *)get_imm(1 * 16 * 1024);
+ config.fp32_0 = (__ubuf__ float *)get_imm(2 * 16 * 1024);
+ config.sqx0 = (__ubuf__ float *)get_imm(4 * 16 * 1024);
+ config.sum_tmp0 = (__ubuf__ float *)config.fp16_0;
+ config.sum0 = (__ubuf__ float *)config.fp16_0 + 64;
+
+ config.fp16_1 = (__ubuf__ half *)get_imm(1 * 16 * 1024 + 96 * 1024);
+ config.fp32_1 = (__ubuf__ float *)get_imm(2 * 16 * 1024 + 96 * 1024);
+ config.sqx1 = (__ubuf__ float *)get_imm(4 * 16 * 1024 + 96 * 1024);
+ config.sum_tmp1 = (__ubuf__ float *)config.fp16_1;
+ config.sum1 = (__ubuf__ float *)config.fp16_1 + 64;
+ config.epsilon = 1e-6;
+ config.ping = true;
+ return config;
+ }
+
+ FORCE_INLINE_AICORE void RMSNormRun()
+ {
+ SetMaskCount();
+ SetAtomicNone();
+
+ UBufConfig ubufConfig = InitializeUBufConfig();
+
+ CopyGmToUbufAlign(ubufConfig.gamma, (__gm__ half *)rmsnorm_gamma, 1, n, 0, 0);
+ PipeBarrier();
+ SetFlag(EVENT_ID0);
+ SetFlag(EVENT_ID1);
+
+ for (int64_t global_row_id = ubufConfig.global_subblock_idx % ubufConfig.total_subblock; global_row_id < m;
+ global_row_id += ubufConfig.total_subblock) {
+ auto &fp16 = ubufConfig.ping ? ubufConfig.fp16_0 : ubufConfig.fp16_1;
+ auto &fp32 = ubufConfig.ping ? ubufConfig.fp32_0 : ubufConfig.fp32_1;
+ auto &sqx = ubufConfig.ping ? ubufConfig.sqx0 : ubufConfig.sqx1;
+ auto &sum_tmp = ubufConfig.ping ? ubufConfig.sum_tmp0 : ubufConfig.sum_tmp1;
+ auto &sum = ubufConfig.ping ? ubufConfig.sum0 : ubufConfig.sum1;
+ auto event_id = ubufConfig.ping ? EVENT_ID0 : EVENT_ID1;
+
+ WaitFlag(event_id);
+ CopyGmToUbufAlign(fp16, (__gm__ half *)rmsnorm_in + global_row_id * n, 1, n, 0, 0);
+ SetFlag(event_id);
+ WaitFlag(event_id);
+
+ SetVectorMask(0x0, n);
+ Vconv(((__ubuf__ float *)fp32), ((__ubuf__ half *)fp16), 1, 1, 1, 8, 4);
+ PipeBarrier();
+
+ Vmul(((__ubuf__ float *)sqx), ((__ubuf__ float *)fp32), ((__ubuf__ float *)fp32), 1, 1, 1, 1, 8, 8, 8);
+ PipeBarrier();
+ float average_val = 1.f / n;
+ Vmuls(((__ubuf__ float *)sqx), ((__ubuf__ float *)sqx), average_val, 1, 1, 1, 8, 8);
+ PipeBarrier();
+
+ SetVectorMask(0x0, 64);
+ VectorDup(((__ubuf__ float *)sum_tmp), 0.f, 1, 1, 8);
+ PipeBarrier();
+
+ SetVectorMask(0x0, n);
+ Vadd(((__ubuf__ float *)sum_tmp), ((__ubuf__ float *)sqx), ((__ubuf__ float *)sum_tmp), 1, 1, 1, 1, 0, 8,
+ 0);
+ PipeBarrier();
+
+ SetVectorMask(0x0, 64);
+ vcadd(((__ubuf__ float *)sum), ((__ubuf__ float *)sum_tmp), 1, 0, 1, 0, 0);
+ PipeBarrier();
+
+ SetVectorMask(0x0, n);
+ SetFlag(event_id);
+ WaitFlag(event_id);
+ float mul_val = 1.f / sqrt(sum[0] + ubufConfig.epsilon);
+ PipeBarrier();
+ SetFlag(event_id);
+ WaitFlag(event_id);
+ Vmuls(((__ubuf__ float *)fp32), ((__ubuf__ float *)fp32), mul_val, 1, 1, 1, 8, 8);
+ PipeBarrier();
+
+ Vconv(((__ubuf__ half *)fp16), ((__ubuf__ float *)fp32), 1, 1, 1, 4, 8);
+ PipeBarrier();
+
+ Vmul(((__ubuf__ half *)fp16), ((__ubuf__ half *)fp16), ((__ubuf__ half *)ubufConfig.gamma), 1, 1, 1, 1, 8,
+ 8, 8);
+ PipeBarrier();
+ SetFlag(event_id);
+ WaitFlag(event_id);
+
+ CopyUbufToGmAlign((__gm__ half *)rmsnorm_out + global_row_id * n, fp16, 1, n, 0, 0);
+ SetFlag(event_id);
+ ubufConfig.ping = !ubufConfig.ping;
+ }
+ WaitFlag(EVENT_ID0);
+ WaitFlag(EVENT_ID1);
+ }
+
+private:
+ __gm__ T_out *rmsnorm_gamma;
+ __gm__ T_in *rmsnorm_in;
+ __gm__ T_out *rmsnorm_out;
+ int32_t m;
+ int32_t n;
+ int32_t core_used;
+};
+
+template <typename T>
+class Postprocessor {
+public:
+ __aicore__ explicit Postprocessor() = default;
+
+ FORCE_INLINE_AICORE void SetArgs(PP_MATMUL_AIV_POST_ARGS_FUN())
+ {
+ auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm);
+ auto cocTilingData = &para->cocTilingData;
+ this->with_rms_norm = para->postInfo.withRmsNorm;
+ if (this->with_rms_norm) {
+ uint32_t m = cocTilingData->m;
+ uint32_t n = cocTilingData->n;
+ rmsnormprocessor.SetArgs(gm_out, gm_out, gm_gamma, m, n);
+ }
+ }
+
+ FORCE_INLINE_AICORE void Run()
+ {
+ FFTSCrossCoreSync(0, 0);
+ WaitEvent(0);
+ if (this->with_rms_norm) {
+ rmsnormprocessor.RMSNormRun();
+ }
+ }
+
+private:
+ int32_t with_rms_norm;
+ RMSNormprocessor<T, T> rmsnormprocessor;
+};
+
+#endif
+#endif
\ No newline at end of file
diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce
new file mode 100644
index 00000000..f344efee
--- /dev/null
+++ b/comm/lcal/src/kernels/coc_ppmatmul.cce
@@ -0,0
+1,1143 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifndef __PP_MATMUL__
+#define __PP_MATMUL__
+#include "coc_internal.cce"
+#include "lcoc_workspace.h"
+template <typename Dtype>
+struct GetAccumType {
+ using T = float;
+};
+
+template <>
+struct GetAccumType<int8_t> {
+ using T = int32_t;
+};
+
+#ifdef __DAV_C220_CUBE__
+
+constexpr int32_t L0AB_PINGPONG_BUFFER_SIZE = 32768;
+constexpr int32_t CUBE_MATRIX_SIZE_B16 = 256;
+constexpr int32_t CUBE_MATRIX_SIZE_B8 = 16 * 32;
+constexpr int64_t ND2NZ_STRIDE_LIMIT = 65536;
+constexpr int32_t SCALE_L1_SIZE = 256 * 8;
+
+// The wrappers below adapt raw CCE buffer addresses into AscendC tensor objects so the
+// DataCopy/LoadData APIs can be driven from hand-managed L1/L0 offsets.
+template <typename T>
+inline __aicore__ void CopyCubfToBt(uint64_t dst, __cbuf__ T *src, uint16_t convControl, uint16_t nBurst, uint16_t lenBurst, uint16_t sourceGap, uint16_t dstGap)
+{
+ DataCopyParams intriParams(nBurst, lenBurst, sourceGap, dstGap);
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ uint32_t dst_buffer_offset = static_cast<uint32_t>(dst);
+ uint8_t src_logicpos = static_cast<uint8_t>(TPosition::C1);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::C2);
+ LocalTensor<T> srcTensor;
+ LocalTensor<T> dstTensor;
+ srcTensor = CreateLocalTensor<T>(src_buffer_offset, src_logicpos);
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ DataCopy(dstTensor, srcTensor, intriParams);
+}
+
+template <typename T>
+inline __aicore__ void CopyGmToCbuf(__cbuf__ T *dst, __gm__ T *src, uint8_t sid, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride, pad_t padMode)
+{
+ DataCopyParams intriParams(nBurst, lenBurst, srcStride, dstStride);
+ GlobalTensor<T> srcTensor;
+ srcTensor.SetGlobalBuffer(src);
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t logicpos = static_cast<uint8_t>(TPosition::C1);
+ LocalTensor<T> dstTensor;
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, logicpos);
+ DataCopy(dstTensor, srcTensor, intriParams);
+}
+
+template <typename T>
+inline __aicore__ void SetFpc(__fbuf__ T *src)
+{
+ LocalTensor<T> tensor;
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ tensor = CreateLocalTensor<T>(src_buffer_offset);
+ SetFixPipeConfig(tensor);
+}
+
+template <typename T>
+inline __aicore__ void LoadCbufToCaTranspose(__ca__ T *dst, __cbuf__ T *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, bool addrmode, uint16_t dstFracStride)
+{
+ LoadData2dTransposeParams params(
+ indexID,
+ repeat,
+ srcStride,
+ dstStride,
+ dstFracStride,
+ addrmode
+ );
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t src_logicpos = static_cast<uint8_t>(TPosition::C1);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::A2);
+ LocalTensor<T> srcTensor;
+ LocalTensor<T> dstTensor;
+ srcTensor = CreateLocalTensor<T>(src_buffer_offset, src_logicpos);
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ LoadDataWithTranspose(dstTensor, srcTensor, params);
+}
+
+template <typename T>
+inline __aicore__ void LoadCbufToCbTranspose(__cb__ T *dst, __cbuf__ T *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, bool addrmode, uint16_t dstFracStride)
+{
+ LoadData2dTransposeParams params(
+ indexID,
+ repeat,
+ srcStride,
+ dstStride,
+ dstFracStride,
+ addrmode
+ );
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t src_logicpos = static_cast<uint8_t>(TPosition::C1);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::B2);
+ LocalTensor<T> srcTensor;
+ LocalTensor<T> dstTensor;
+ srcTensor = CreateLocalTensor<T>(src_buffer_offset, src_logicpos);
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ LoadDataWithTranspose(dstTensor, srcTensor, params);
+}
+
+template <typename T>
+inline __aicore__ void LoadCbufToCa(__ca__ T *dst, __cbuf__ T *src, uint16_t baseIdx, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, uint8_t sid, bool transpose, uint8_t addr_cal_mode)
+{
+ LoadData2dParams params(
+ baseIdx,
+ repeat,
+ srcStride,
+ sid,
+ dstStride,
+ transpose,
+ addr_cal_mode
+ );
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t src_logicpos = static_cast<uint8_t>(TPosition::C1);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::A2);
+ LocalTensor<T> srcTensor;
+ LocalTensor<T> dstTensor;
+ srcTensor = CreateLocalTensor<T>(src_buffer_offset, src_logicpos);
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ LoadData(dstTensor, srcTensor, params);
+}
+
+template <typename T>
+inline __aicore__ void LoadCbufToCb(__cb__ T *dst, __cbuf__ T *src, uint16_t baseIdx, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, uint8_t sid, bool transpose, uint8_t addr_cal_mode)
+{
+ LoadData2dParams params(
+ baseIdx,
+ repeat,
+ srcStride,
+ sid,
+ dstStride,
+ transpose,
+ addr_cal_mode
+ );
+ uint32_t src_buffer_offset = reinterpret_cast<uint32_t>(src);
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t src_logicpos = static_cast<uint8_t>(TPosition::C1);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::B2);
+ LocalTensor<T> srcTensor;
+ LocalTensor<T> dstTensor;
+ srcTensor = CreateLocalTensor<T>(src_buffer_offset, src_logicpos);
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ LoadData(dstTensor, srcTensor, params);
+}
+
+template <typename T>
+struct IntrinsicCopyGmToL1Nd2Nz {
+ static inline __aicore__ void move(
+ __cbuf__ T *dst, __gm__ T *src,
+ uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue,
+ uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride,
+ uint16_t dstNzNStride, uint16_t dstNzMatrixStride){
+ Nd2NzParams nd2nzParams(
+ ndNum, nValue, dValue,
+ srcNdMatrixStride, srcDValue, dstNzC0Stride,
+ dstNzNStride, dstNzMatrixStride
+ );
+ uint32_t dst_buffer_offset = reinterpret_cast<uint32_t>(dst);
+ uint8_t dst_logicpos = static_cast<uint8_t>(TPosition::C1);
+ LocalTensor<T> dstTensor;
+ dstTensor = CreateLocalTensor<T>(dst_buffer_offset, dst_logicpos);
+ GlobalTensor<T> srcTensor;
+ srcTensor.SetGlobalBuffer(src);
+ DataCopy(dstTensor, srcTensor, nd2nzParams);
+ }
+};
+
+// ND->NZ copy: the hardware path handles a whole [nValue, dValue] block when the row
+// stride fits in the intrinsic's limit; otherwise it falls back to a row-by-row copy.
+template <typename T>
+struct CopyGmToL1Nd2zN {
+ static inline __aicore__ void move(
+ __cbuf__ T *dst, __gm__ T *src,
+ uint16_t nValue, uint16_t dValue, uint16_t srcDValue, uint16_t dstNzC0Stride) {
+ constexpr int BLOCK_LEN = 32 / sizeof(T);
+ if (srcDValue < ND2NZ_STRIDE_LIMIT) {
+ IntrinsicCopyGmToL1Nd2Nz<T>::move(
+ dst,
+ src,
+ 0,
+ 1,
+ nValue,
+ dValue,
+ 0,
+ srcDValue,
+ dstNzC0Stride,
+ 1,
+ 0
+ );
+ } else {
+ for (int i = 0; i < nValue; i++) {
+ IntrinsicCopyGmToL1Nd2Nz<T>::move(
+ dst + i * BLOCK_LEN,
+ src + i * srcDValue,
+ 0,
+ 1,
+ 1,
+ dValue,
+ 0,
+ 0,
+ dstNzC0Stride,
+ 0,
+ 0
+ );
+ }
+ }
+ }
+};
+
+template <typename MmadDtype, typename OutDtype, bool TA, bool TB>
+class PpMatmul {
+ using T_ACCUM = typename
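+ // The accumulator type follows the matmul input type: float for fp16/bf16 inputs,
+ // int32_t for int8 (see the GetAccumType specialization above).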
GetAccumType::T; + static constexpr bool IS_INT8 = std::is_same::value; +public: + __aicore__ explicit PpMatmul() {}; + + inline __aicore__ void SetArgs(PP_MATMUL_AIC_ARGS_FUN(MmadDtype, OutDtype)) + { + this->gm_c = reinterpret_cast<__gm__ OutDtype *>(gm_c); + this->gm_peer_mem = reinterpret_cast<__gm__ OutDtype *>(gm_peer_mem); + this->gm_dequant_scale = reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale); + has_offset = gm_dequant_offset != nullptr; + + this->batch_size = batch_size; + this->m = m; + this->k = k; + this->n = n; + this->weight_nz = weight_nz; + + cube_matrix_size = IS_INT8 ? CUBE_MATRIX_SIZE_B8 : CUBE_MATRIX_SIZE_B16; + + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + + this->m0 = m0; + this->k0 = k0; + this->n0 = n0; + + this->dequant_granularity = dequant_granularity; + + AlignJudge(TA, TB, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); + bool has_a_align = IsQuant(quant_granularity) || aligned_a; + bool has_b_align = IsQuant(dequant_granularity) && !IS_INT8 || aligned_b; + if (weight_nz) { + k_align16 = (k + 16 - 1) / 16 * 16; + n_align16 = Block32B::AlignUp(n); + aligned_b = 0; + has_b_align = false; + } + bool has_accum = IsQuant(dequant_granularity) && IS_INT8 && std::is_same::value; + bool has_format_dequant_offset = (dequant_granularity == QuantGranularity::PER_TENSOR) && IS_INT8 && has_offset; + int32_t accum_rank_size = 1; + + bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR); + bool hasFormatDequantScale = (dequant_granularity == QuantGranularity::PER_CHANNEL); + + workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, + TA, TB, sizeof(MmadDtype), has_a_align, has_b_align, accum_rank_size, has_accum, 0, has_dequant_param, + hasFormatDequantScale, is_deterministic, false, false, 0, 0, 0); + + gm_a_src = reinterpret_cast<__gm__ MmadDtype *>(has_a_align ? workspace_info.gm_a_align : gm_a); + gm_b_src = reinterpret_cast<__gm__ MmadDtype *>(has_b_align ? workspace_info.gm_b_align : gm_b); + gm_accum = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_accum); + gm_format_dequant_offset = reinterpret_cast<__gm__ int32_t *>(has_format_dequant_offset ? + workspace_info.gm_dequant_param : gm_dequant_offset); + + block_size = 32 / sizeof(MmadDtype); + + L1_PINGPONG_BUFFER_LEN = ((m0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size + + (n0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size * (IS_INT8 ? 2 : 1)); + L0AB_PINGPONG_BUFFER_LEN = L0AB_PINGPONG_BUFFER_LEN / sizeof(MmadDtype); + + int32_t a_l1_size = m0 * k0 * sizeof(MmadDtype); + int32_t a_l1_size_round = DivCeil(a_l1_size, 512) * 512; + int32_t b_l1_size = n0 * k0 * sizeof(MmadDtype); + int32_t b_l1_size_round = DivCeil(b_l1_size, 512) * 512; + l1_base_a = reinterpret_cast<__cbuf__ MmadDtype *>((uintptr_t)(IS_INT8 ? SCALE_L1_SIZE : 0)); + l1_base_b = reinterpret_cast<__cbuf__ MmadDtype *>(a_l1_size_round * (IS_INT8 ? 
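+ // L1 layout: for int8 the dequant scale table occupies the first SCALE_L1_SIZE bytes
+ // and the A region is doubled, so l1_base_b starts after one (fp16/bf16) or two (int8)
+ // rounded A tiles.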
2 : 1) + (uintptr_t)l1_base_a); + + core_num = get_block_num(); + core_idx = get_block_idx(); + + this->m_loop = m_loop; + this->k_loop = k_loop; + this->n_loop = n_loop; + this->core_loop = core_loop; + this->swizzl_count = swizzl_count; + this->swizzl_direct = swizzl_direct; + this->is_91093 = is_91093; + ping_flag = 1; + this->rank = rank; + this->rank_size = rank_size; + this->p_value = p_value; + this->withSerialMode = withSerialMode; + loop_num_per_comm = p_value * core_num; + this->buffer_size = buffer_size; + + this->ag_dim = ag_dim; + this->rs_dim = rs_dim; + this->inner_dim_is_Ag = inner_dim_is_Ag; + if (inner_dim_is_Ag) { + this->ag_rank_idx = rank % ag_dim; + this->rs_rank_idx = rank / ag_dim; + } else { + this->ag_rank_idx = rank / rs_dim; + this->rs_rank_idx = rank % rs_dim; + } + } + + inline __aicore__ void CalLoop(int64_t batch_idx, int64_t m_idx, int64_t n_idx, int32_t m_actual, int32_t n_actual, + __gm__ MmadDtype *gm_a_src_tmp) { + int64_t offset_a, offset_b, offset_a_next, offset_b_next; + int32_t m_round, n_round; + if (IS_INT8) { + if (TA) { + m_round = DivCeil(m_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; + } else { + m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } + if (TB) { + n_round = DivCeil(n_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; + } else { + n_round = DivCeil(n_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } + } else { + m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + n_round = DivCeil(n_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } + + int32_t mn_max = m_round > n_round ? m_round : n_round; + int32_t k_part_len = L0AB_PINGPONG_BUFFER_LEN / mn_max / block_size * block_size; + if (TA) { + if (aligned_a == 1) { + offset_a = batch_idx * k * m_align + m_idx * m0; + } else { + offset_a = batch_idx * k * m + m_idx * m0; + } + } else { + if (aligned_a == 1) { + offset_a = batch_idx * m * k_align + m_idx * m0 * k_align; + } else { + offset_a = batch_idx * m * k + m_idx * m0 * k; + } + } + if (TB) { + if (aligned_b == 1) { + offset_b = n_idx * n0 * k_align; + } else { + if (weight_nz) { + offset_b = n_idx * n0 * block_size; + } else { + offset_b = n_idx * n0 * k; + } + } + } else { + if (aligned_b == 1) { + offset_b = n_idx * n0; + } else { + if (weight_nz) { + offset_b = n_idx * n0 * k_align16; + } else { + offset_b = n_idx * n0; + } + } + } + int64_t dequant_param_offset = n_idx * n0; + + int32_t k_actual = (k_loop == 1) ? k : k0; + int32_t k_round = DivCeil(k_actual, block_size) * block_size; + + auto l1_buf_a = ping_flag ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + auto l1_buf_b = ping_flag ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto l0a_buf = ping_flag ? l0a_base : l0a_base + L0AB_PINGPONG_BUFFER_LEN; + auto l0b_buf = ping_flag ? l0b_base : l0b_base + L0AB_PINGPONG_BUFFER_LEN; + auto event_id = ping_flag ? 
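+ // ping_flag selects between the two halves of the L1/L0 staging buffers and their
+ // event IDs, so the MTE2 load of the next tile overlaps the MMAD on the current one.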
EVENT_ID0 : EVENT_ID1; + + if (IS_INT8 && has_offset) { + PipeBarrier(); + IntrinsicCopyGmToL1Nd2Nz::move( + ((__cbuf__ int32_t *) bias_l1), + ((__gm__ int32_t *)gm_format_dequant_offset) + dequant_param_offset, + 0, + 1, + 1, + n_actual, + 0, + n, + 1, + 1, + 0 + ); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + CopyCubfToBt(((uint64_t)bias_bt), ((__cbuf__ int32_t *)bias_l1), + (uint16_t)0ULL, 1, (n_actual * 4 + 63) / 64, 0, 0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); + } + + auto gm_src_a = gm_a_src_tmp + offset_a; + auto gm_src_b = gm_b_src + offset_b; + + WaitFlag(event_id); + if (m == 1 || m_actual == 1 && !TA) { + CopyGmToCbuf( + l1_buf_a, + gm_src_a, + 0, + 1, + k_round, + 0, + 0, + PAD_NONE + ); + } else { + if (TA) { + auto src_len = m; + if (aligned_a == 1) { + src_len = m_align; + } + CopyGmToL1Nd2zN::move(l1_buf_a, gm_src_a, k_actual, m_actual, src_len, k_round); + } else { + auto src_len = k; + if (aligned_a == 1) { + src_len = k_align; + } + CopyGmToL1Nd2zN::move(l1_buf_a, gm_src_a, m_actual, k_actual, src_len, m_round); + } + } + SetFlag(event_id); + WaitFlag(event_id + 2); + if (TB) { + auto src_len = k; + if (aligned_b == 1) { + src_len = k_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(k_actual, block_size); + CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, n_actual, k_actual, src_len, n_round); + } + } else { + auto src_len = n; + if (aligned_b == 1) { + src_len = n_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(n_actual, block_size); + CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, k_actual, k_align16 - k_actual, k_round - k_actual, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, k_actual, n_actual, src_len, k_round); + } + } + SetFlag(event_id + 2); + + int mte1_mad_ping_flag = 1; + + for (int64_t k_idx = 0; k_idx < k_loop; k_idx++) { + int32_t k_actual = (k_idx == (k_loop - 1)) ? (k - k_idx * k0) : k0; + int32_t k_round = DivCeil(k_actual, block_size) * block_size; + int32_t k_part_loop = DivCeil(k_actual, k_part_len); + + __cbuf__ MmadDtype *l1_buf_a = ping_flag ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + __cbuf__ MmadDtype *l1_buf_b = ping_flag ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto event_id = ping_flag ? 
EVENT_ID0 : EVENT_ID1; + + if (k_idx < k_loop - 1) { + if (TA) { + if (aligned_a == 1){ + offset_a_next = batch_idx * k * m_align + (k_idx + 1) * k0 * m_align + m_idx * m0; + } else { + offset_a_next = batch_idx * k * m + (k_idx + 1) * k0 * m + m_idx * m0; + } + } else { + if (aligned_a == 1){ + offset_a_next = batch_idx * m * k_align + m_idx * m0 * k_align + (k_idx + 1) * k0; + } else { + offset_a_next = batch_idx * m * k + m_idx * m0 * k + (k_idx + 1) * k0; + } + } + if (TB) { + if (aligned_b == 1) { + offset_b_next = batch_idx * n * k_align + n_idx * n0 * k_align + (k_idx + 1) * k0; + } else { + if (weight_nz) { + offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 * n_align16 + n_idx * n0 * block_size; + } else { + offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 + n_idx * n0; + } + } + } else { + if (aligned_b == 1) { + offset_b_next = batch_idx * k * n_align + n_idx * n0 + (k_idx + 1) * k0 * n_align; + } else { + if (weight_nz) { + offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * block_size + n_idx * n0 * k_align16; + } else { + offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * n + n_idx * n0; + } + } + } + + int32_t k_actual_next = ((k_idx + 1) == (k_loop -1)) ? (k - (k_idx + 1) * k0) : k0; + int32_t k_round_next = DivCeil(k_actual_next, block_size) * block_size; + + __cbuf__ MmadDtype *l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + __cbuf__ MmadDtype *l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto event_id_next = (1 - ping_flag) ? EVENT_ID0 : EVENT_ID1; + + auto gm_src_a = gm_a_src_tmp + offset_a_next; + auto gm_src_b = gm_b_src + offset_b_next; + + WaitFlag(event_id_next); + if (m == 1 || m_actual == 1 && !TA) { + CopyGmToCbuf( + l1_buf_a_next, + gm_src_a, + 0, + 1, + k_round_next, + 0, + 0, + PAD_NONE + ); + } else { + if (TA) { + auto src_len = m; + if (aligned_a == 1) { + src_len = m_align; + } + CopyGmToL1Nd2zN::move( + l1_buf_a_next, gm_src_a, k_actual_next, m_actual, src_len, k_round_next); + } else { + auto src_len = k; + if (aligned_a == 1) { + src_len = k_align; + } + CopyGmToL1Nd2zN::move( + l1_buf_a_next, gm_src_a, m_actual, k_actual_next, src_len, m_round); + } + } + SetFlag(event_id_next); + + WaitFlag(event_id_next + 2); + if (TB) { + auto src_len = k; + if (aligned_b == 1) { + src_len = k_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(k_actual_next, block_size); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, n_actual, k_actual_next, src_len, n_round); + } + } else { + auto src_len = n; + if (aligned_b == 1) { + src_len = n_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(n_actual, block_size); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, k_actual_next, k_align16 - k_actual_next, k_round_next - k_actual_next, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, k_actual_next, n_actual, src_len, k_round_next); + } + } + SetFlag(event_id_next + 2); + } + + for (int k_part_idx = 0; k_part_idx < k_part_loop; k_part_idx++) { + int32_t k0_round = (k_part_idx < k_part_loop - 1) ? + k_part_len : k_round - k_part_idx * k_part_len; + int32_t k0_actual = (k_part_idx < k_part_loop - 1) ? + k_part_len : k_actual - k_part_idx * k_part_len; + + auto mte1_mad_event_id = mte1_mad_ping_flag ? 
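+ // Inner pipeline: L0A/L0B are double buffered separately from the L1 ping-pong;
+ // mte1_mad_ping_flag picks the buffer half and the event used to hand each
+ // k_part slice from the MTE1 load stage to the mad stage.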
EVENT_ID0 : EVENT_ID1; + auto l0a_buf = l0a_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; + auto l0b_buf = l0b_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; + + if (k_part_idx == 0) { + WaitFlag(event_id); + } + WaitFlag(mte1_mad_event_id); + if (m == 1 || m_actual == 1 && !TA) { + LoadCbufToCa( + l0a_buf, + l1_buf_a + k_part_idx * k_part_len, + 0, + DivCeil(k0_round, cube_matrix_size), + 1, + 0, + 0, + false, + inc + ); + } else { + if (TA) { + if (IS_INT8) { + for (int i = 0; i < m_round / BLOCK_SIZE_32; i++) { + LoadCbufToCaTranspose( + l0a_buf + i * k0_round * BLOCK_SIZE_32, + l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_32 + + i * k_round * BLOCK_SIZE_32, + 0, + k0_round, + 1, + 0, + 0, + k0_round + ); + } + } else { + for (int i = 0; i < m_round / BLOCK_SIZE_16; i++) { + LoadCbufToCa( + l0a_buf + i * k0_round * BLOCK_SIZE_16, + l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_16 + + i * k_round * BLOCK_SIZE_16, + 0, + k0_round, + 1, + 0, + 0, + true, + inc + ); + } + } + } else { + for (int i = 0; i < m_round / BLOCK_SIZE_16; i++) { + LoadCbufToCa( + l0a_buf + i * cube_matrix_size, + l1_buf_a + k_part_idx * k_part_len * m_round + + i * m_round * block_size, + 0, + m_round / BLOCK_SIZE_16, + 1, + k0_round / block_size - 1, + 0, + false, + inc + ); + } + } + } + if (k_part_idx == k_part_loop -1) { + SetFlag(event_id); + } + + if (k_part_idx == 0) { + WaitFlag(event_id + 2); + } + if (TB) { + LoadCbufToCb( + l0b_buf, + l1_buf_b + k_part_idx * k_part_len * n_round, + 0, + k0_round * n_round / cube_matrix_size, + 1, + 0, + 0, + false, + inc + ); + } else { + if (IS_INT8) { + for (int32_t i = 0; i < k0_round / BLOCK_SIZE_32; i++) { + LoadCbufToCbTranspose( + l0b_buf + i * ((n_actual + 15) / 16 * 16) * BLOCK_SIZE_32, + l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_32) * BLOCK_SIZE_32, + 0, + n_round, + k_round, + 1, + 0, + 0 + ); + } + } else { + for (int32_t i = 0; i < k0_round / BLOCK_SIZE_16; i++) { + LoadCbufToCb( + l0b_buf + i * n_round * BLOCK_SIZE_16, + l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_16) * BLOCK_SIZE_16, + 0, + n_round, + k_round, + 0, + 0, + true, + inc + ); + } + } + } + if (k_part_idx == k_part_loop - 1) { + SetFlag(event_id + 2); + } + + SetFlag(mte1_mad_event_id); + WaitFlag(mte1_mad_event_id); + + bool init_c = (k_idx == 0 && k_part_idx == 0); + if (init_c) { + WaitFlag(EVENT_ID0); + } + + if (IS_INT8 && has_offset) { + if (init_c) { + WaitFlag(EVENT_ID1); + } + PipeBarrier(); + if (m != 1 && m_actual == 1 && TA) { + mad((__cc__ int32_t *)l0c_buf, + (__ca__ int8_t *)l0a_buf, + (__cb__ int8_t *)l0b_buf, + ((uint64_t)bias_bt), + 16, + k0_actual, + n_actual, + 0, + 0, + init_c, + 0 + ); + } else { + mad((__cc__ int32_t *)l0c_buf, + (__ca__ int8_t *)l0a_buf, + (__cb__ int8_t *)l0b_buf, + ((uint64_t)bias_bt), + m_actual, + k0_actual, + n_actual, + 0, + 0, + init_c, + 0 + ); + } + } else { + PipeBarrier(); + if (m != 1 && m_actual == 1 && TA) { + mad(l0c_buf, + l0a_buf, + l0b_buf, + 16, + k0_actual, + n_actual, + 0, + 0, + 0, + init_c + ); + } else { + mad(l0c_buf, + l0a_buf, + l0b_buf, + m_actual, + k0_actual, + n_actual, + 0, + 0, + 0, + init_c + ); + } + } + PipeBarrier(); + SetFlag(mte1_mad_event_id); + + mte1_mad_ping_flag = 1 - mte1_mad_ping_flag; + } + ping_flag = 1 - ping_flag; + } + + if (IS_INT8 && std::is_same::value && (dequant_granularity == QuantGranularity::PER_CHANNEL || + dequant_granularity == QuantGranularity::PER_TOKEN)) { + WaitFlag(EVENT_ID0); + PipeBarrier(); + CopyGmToCbuf( + scale_l1, + 
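+ // Per-channel / per-token dequant epilogue: the int64 scale words for this
+ // n-block are staged GM -> L1 here and then copied into the fixpipe buffer
+ // (scale_FB) below, where Fixpipe's VDEQF16 mode consumes them.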
gm_dequant_scale + dequant_param_offset, + 0, + 1, + (n_actual * sizeof(int64_t) + 31) / 32, + 0, + 0, + PAD_NONE + ); + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID0); + + copy_cbuf_to_fbuf( + scale_FB, + scale_l1, + 1, + (n_actual * sizeof(int64_t) + 127) / 128, + 0, + 0 + ); + PipeBarrier(); + } + } + + inline __aicore__ void MoveL0CToGM(__gm__ OutDtype *gm_dst, int64_t offset_c, int32_t m_actual, int32_t n_actual, int32_t src_stride, int32_t dst_stride) { + #if (__CCE_AICORE__ == 220) + FixpipeParamsV220 FixpipeParams( + n_actual, + m_actual, + src_stride, + dst_stride, + false + ); + #elif (defined(__DAV_C310__)) + FixpipeParamsV310 FixpipeParams( + n_actual, + m_actual, + src_stride, + dst_stride + ); + #endif + uint64_t src_addr = reinterpret_cast(l0c_buf); + LocalTensor srcTensor = CreateLocalTensor + (reinterpret_cast(l0c_buf), static_cast(TPosition::CO1)); + GlobalTensor dstTensor = CreateGlobalTensor(gm_dst + offset_c); + + if (IS_INT8) { + if constexpr (std::is_same::value) { + if (dequant_granularity == QuantGranularity::PER_CHANNEL || dequant_granularity == QuantGranularity::PER_TOKEN) { + SetFpc(scale_FB); + FixpipeParams.quantPre = VDEQF16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + SetFlag(EVENT_ID0); + } else if (dequant_granularity == QuantGranularity::PER_TENSOR) { + FixpipeParams.quantPre = DEQF16; + FixpipeParams.deqScalar = gm_dequant_scale[0]; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } + } else if constexpr (std::is_same::value) { + GlobalTensor dstAccum = CreateGlobalTensor(gm_accum + offset_c); + Fixpipe(dstAccum, srcTensor, FixpipeParams); + } + } else { + if constexpr (std::is_same::value) { + FixpipeParams.quantPre = F322BF16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } else { + FixpipeParams.quantPre = F322F16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } + } + SetFlag(EVENT_ID0); + if (IS_INT8 && has_offset) { + SetFlag(EVENT_ID1); + } + } + + inline __aicore__ void InitFlags() { + WaitEvent(AIC_WAIT_AIV_FINISH_ALIGN_FLAG_ID); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID3); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID1); + } + + inline __aicore__ void Endflags() { + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID3); + } + + inline __aicore__ void RunPureMatmul() { + + InitFlags(); + for (int32_t loop_idx = 0; loop_idx < core_loop; loop_idx++) { + if (loop_idx % core_num != core_idx) { + continue; + } + + int64_t batch_idx = loop_idx / (m_loop * n_loop); + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop -1)) ? (m - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop -1)) ? 
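+ // Tail blocks: the last m/n index only covers the remainder of the matrix, so
+ // m_actual/n_actual shrink there while all other blocks use the full m0/n0.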
(n - n_idx * n0) : n0; + CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_src); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; + MoveL0CToGM(gm_c, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n); + } + Endflags(); + PipeBarrier(); + + FFTSCrossCoreSync(0, AIC_FINISH_MATMUL_FLAG_ID); + WaitEvent(AIC_FINISH_MATMUL_FLAG_ID); + + FFTSCrossCoreSync(2, AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID); + PipeBarrier(); + } + + inline __aicore__ void RunMatmulAllReduce() { + InitFlags(); + int32_t comm_count = DivCeil(core_loop, loop_num_per_comm); + int32_t pipe_depth = is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT; + for (int32_t cal_idx = 0; cal_idx < comm_count; cal_idx++) { + int32_t loop_idx = cal_idx * core_num + core_idx; + int32_t flag_idx = cal_idx % pipe_depth; + if (cal_idx >= pipe_depth) { + WaitEvent(flag_idx); + } + int32_t actual_loop_num = loop_num_per_comm; + if (cal_idx == comm_count - 1){ + actual_loop_num = core_loop - cal_idx * loop_num_per_comm; + } + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + int64_t batch_idx = loop_idx / (m_loop * n_loop); + int64_t m_idx , n_idx; + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop -1)) ? (m - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop -1)) ? (n - n_idx * n0) : n0; + CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_src); + + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int64_t offset_c; + int32_t n_stride; + offset_c = flag_idx * m0 * loop_num_per_comm * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; + n_stride = n0; + MoveL0CToGM(gm_peer_mem, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n_stride); + } + FFTSCrossCoreSync(2, flag_idx); + } + Endflags(); + PipeBarrier(); + } + + inline __aicore__ void RunAllGatherMatmulReduceScatter() { + + InitFlags(); + int32_t twod_big_dim = ag_dim > rs_dim ? 
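+ // 2D tensor-parallel split: the tile grid is laid out over the larger of the
+ // all-gather and reduce-scatter dimensions; ag_part_dim/rs_part_dim below map a
+ // big-dim index back to a peer rank and a slice inside that rank's data.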
ag_dim : rs_dim; + int64_t gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim; + int64_t gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; + int32_t m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); + int64_t m_per_bigdim = m * ag_dim / twod_big_dim; + int32_t comm_count = DivCeil(batch_size * m_loop_per_bigdim, p_value); + int32_t loop_num_per_cal = p_value * n_loop * twod_big_dim; + int32_t ag_part_dim = twod_big_dim / ag_dim; + int32_t rs_part_dim = twod_big_dim / rs_dim; + for (int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++) { + uint64_t flag_id = comm_idx % MAX_BLOCK_COUNT; + int32_t actual_p_value = p_value; + if (comm_idx == comm_count - 1) { + actual_p_value = m_loop_per_bigdim - comm_idx * p_value; + } + WaitEvent(flag_id); + + int32_t actual_loop_num = actual_p_value * twod_big_dim * n_loop; + int32_t core_loop_num = DivCeil(actual_p_value * twod_big_dim * n_loop, core_num); + for (int32_t core_loop_idx = 0; core_loop_idx < core_loop_num; core_loop_idx++) { + int32_t loop_offset = core_loop_idx * core_num + core_idx; + if (loop_offset >= actual_loop_num) { + continue; + } + int32_t loop_idx = comm_idx * loop_num_per_cal + loop_offset; + int64_t batch_idx = loop_idx / (m_loop * n_loop * twod_big_dim); + + int64_t m_idx, n_idx; + GetBlockIdx(loop_offset, actual_p_value * twod_big_dim, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + + int32_t m_idx_in_rank = m_idx % actual_p_value; + int64_t m_idx_in_c = comm_idx * p_value + m_idx_in_rank; + int32_t m_actual = (m_idx_in_c == (m_loop_per_bigdim - 1)) ? (m_per_bigdim - m_idx_in_c * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + int64_t bigdim_idx = m_idx / actual_p_value; + + int32_t ag_src_idx = bigdim_idx / ag_part_dim; + int32_t ag_part_idx = bigdim_idx % ag_part_dim; + int32_t rs_dst_idx = bigdim_idx / rs_part_dim; + int32_t rs_part_idx = bigdim_idx % rs_part_dim; + + __gm__ MmadDtype *gm_mem_st; + if (ag_src_idx != ag_rank_idx) { + gm_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) + + (comm_idx % MAX_BLOCK_COUNT) * gm_a_pingpong_size + + bigdim_idx * p_value * m0 * k_align; + } else { + gm_mem_st = gm_a_src + (comm_idx * p_value) * m0 * k_align + ag_part_idx * m_per_bigdim * k_align; + } + + CalLoop(batch_idx, m_idx_in_rank, n_idx, m_actual, n_actual, gm_mem_st); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int64_t offset_c; + int32_t dst_stride; + __gm__ OutDtype *gm_dst = nullptr; + + if (rs_dst_idx != rs_rank_idx) { + offset_c = gm_c_pingpong_size * (comm_idx % MAX_BLOCK_COUNT) + + (m_idx * n_loop + n_idx) * m0 * n0 + + LCAL_2DTP_C_OFFSET; + gm_dst = gm_peer_mem; + dst_stride = n0; + } else { + offset_c = rs_part_idx * m_per_bigdim * n + + m_idx_in_c * m0 * n + + n_idx * n0; + gm_dst = gm_c; + dst_stride = n; + } + MoveL0CToGM(gm_dst, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, dst_stride); + } + FFTSCrossCoreSync(2, flag_id); + } + + Endflags(); + PipeBarrier(); + } + + inline __aicore__ void Run() { + if (RUN_TYPE == PPMATMUL_RUN_MATMUL_ALLREDUCE) { + if (withSerialMode) { + gm_c = gm_peer_mem; + RunPureMatmul(); + } else { + RunMatmulAllReduce(); + } + } else if (RUN_TYPE == PPMATMUL_RUN_ALL_GATHER_MATMUL_REDUCE_SCATTER) { + RunAllGatherMatmulReduceScatter(); + } + } + +protected: + __gm__ MmadDtype *gm_a_src{nullptr}; + __gm__ MmadDtype *gm_b_src{nullptr}; + + __gm__ OutDtype *gm_c{nullptr}; + __gm__ OutDtype *gm_peer_mem{nullptr}; + __gm__ int64_t *gm_dequant_scale{nullptr}; + __gm__ int32_t 
*gm_format_dequant_offset{nullptr}; + __gm__ int32_t *gm_accum{nullptr}; + + __cbuf__ MmadDtype *l1_base_a = reinterpret_cast<__cbuf__ MmadDtype *>((uintptr_t) SCALE_L1_SIZE); + __cbuf__ MmadDtype *l1_base_b = reinterpret_cast<__cbuf__ MmadDtype *>((uintptr_t) (128 * 1024)); + + __ca__ MmadDtype *l0a_base = reinterpret_cast<__ca__ MmadDtype *>((uintptr_t) 0); + __cb__ MmadDtype *l0b_base = reinterpret_cast<__cb__ MmadDtype *>((uintptr_t) 0); + + __cc__ T_ACCUM *l0c_buf = reinterpret_cast<__cc__ T_ACCUM *>((uintptr_t) 0); + + __cbuf__ int64_t *scale_l1 = reinterpret_cast<__cbuf__ int64_t *>((uintptr_t) 0); + __fbuf__ int64_t *scale_FB = (__fbuf__ int64_t *)(0); + + __cbuf__ int32_t * bias_l1 = reinterpret_cast<__cbuf__ int32_t *>((uintptr_t)0); + uint16_t bias_bt = 0; + bool has_offset{false}; + LcalWorkspaceInfo workspace_info; + + int32_t core_num; + + int32_t batch_size; + int32_t m; + int32_t k; + int32_t n; + int32_t m_align; + int32_t k_align; + int32_t n_align; + int32_t k_align16; + int32_t n_align16; + int32_t m0; + int32_t k0; + int32_t n0; + + int32_t m_loop; + int32_t n_loop; + int32_t k_loop; + int32_t core_loop; + int32_t core_idx; + int32_t ping_flag; + int32_t block_size; + int32_t cube_matrix_size; + + int32_t aligned_a; + int32_t aligned_b; + + int32_t swizzl_count; + int32_t swizzl_direct; + + int32_t L1_PINGPONG_BUFFER_LEN; + int32_t L0AB_PINGPONG_BUFFER_LEN; + int32_t rank; + int32_t rank_size; + int32_t p_value; + int32_t loop_num_per_comm; + + int32_t withSerialMode; + int32_t buffer_size; + + int32_t ag_dim; + int32_t rs_dim; + bool inner_dim_is_Ag{false}; + int32_t ag_rank_idx; + int32_t rs_rank_idx; + bool weight_nz{false}; + + bool is_91093{false}; + QuantGranularity dequant_granularity; +}; + +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce new file mode 100644 index 00000000..7bca03ca --- /dev/null +++ b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "coc_internal.cce" +#include "coc_ppmatmul.cce" +#include "tiling_args.h" + +#ifdef __DAV_C220_CUBE__ + +template +FORCE_INLINE_AICORE void RunPpMatmul(int32_t tiling_key, PP_MATMUL_AIC_ARGS_FUN(TData, TData)) { + PpMatmul matmul_z; + PpMatmul matmul_tb_z; + PpMatmul matmul_z_int8; + PpMatmul matmul_tb_z_int8; + int32_t tiling_key_sel = tiling_key & 0b011101; + switch (tiling_key_sel) { + case 0b000000 : + matmul_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL()); + matmul_z.Run(); + break; + case 0b001000 : + matmul_tb_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL()); + matmul_tb_z.Run(); + break; + case 0b000100 : + matmul_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL()); + matmul_z_int8.Run(); + break; + case 0b001100 : + matmul_tb_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL()); + matmul_tb_z_int8.Run(); + break; + default : + break; + } +} + +template +inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) { + CoCBuffAddrAndArgs coc_buff_and_args(COC_ARGS_CALL()); + __gm__ TData* buff[LCAL_MAX_RANK_SIZE]; + for (int i = 0; i < coc_buff_and_args.rankSize; ++i) { + buff[i] = coc_buff_and_args.buff[i]; + } + bool is_deterministic = coc_buff_and_args.DETERMINISTIC; + set_padding(0); + SetAtomicNone(); + uint64_t config = 0x1; + set_nd_para(config); + SetSyncBaseAddr((uint64_t)ffts_addr); + + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto cocTilingData = &para->cocTilingData; + auto quantInfo = &para->quantInfo; + auto twoDimTPInfo = &para->twoDimTPInfo; + + bool weight_nz = para->weightNz; + int32_t batch_size = cocTilingData->batchSize; + int32_t m = cocTilingData->m; + int32_t k = cocTilingData->k; + int32_t n = cocTilingData->n; + + int32_t m0 = cocTilingData->m0; + int32_t k0 = cocTilingData->k0; + int32_t n0 = cocTilingData->n0; + + int32_t m_loop = cocTilingData->mLoop; + int32_t k_loop = cocTilingData->kLoop; + int32_t n_loop = cocTilingData->nLoop; + + int32_t core_loop = cocTilingData->coreLoop; + int32_t swizzl_count = cocTilingData->swizzlCount; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t rank = cocTilingData->rank; + int32_t rank_size = cocTilingData->rankSize; + int32_t p_value = cocTilingData->pValue; + int32_t withSerialMode = cocTilingData->withSerialMode; + bool is_91093 = cocTilingData->is91093; + int32_t buffer_size = cocTilingData->bufferSize; + + int32_t swizzl_direct = (tiling_key & SWIZZL_MASK) ? 1 : 0; + bool is_int8 = (tiling_key & INT8_MASK) != 0; + QuantGranularity dequant_granularity = static_cast(quantInfo->dequantGranularity); + int32_t dequant_group_size = quantInfo->dequantGroupSize; + QuantGranularity quant_granularity = static_cast(quantInfo->quantGranularity); + int32_t quant_group_size = quantInfo->quantGroupSize; + __gm__ TData* gm_peer_mem = buff[rank]; + __gm__ TData* gm_c = gm_out; + int32_t ag_dim = twoDimTPInfo->agDim; + int32_t rs_dim = twoDimTPInfo->rsDim; + bool inner_dim_is_Ag = twoDimTPInfo->innerDimIsAg; + + RunPpMatmul(tiling_key, PP_MATMUL_AIC_ARGS_CALL()); + PipeBarrier(); +} + +#endif \ No newline at end of file diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp new file mode 100644 index 00000000..eda5326e --- /dev/null +++ b/comm/lcal/src/lcoc_func.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + #include "lcoc_func.h" + #include "lcoc_args.h" + #include "mki/utils/log/log.h" + + using namespace std; + namespace Lcal { + + bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max) + { + if (value < min || (max != PARA_CHECK_MAX_VALUE && value > max)) { + if (max == PARA_CHECK_MAX_VALUE) { + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be equal to or greater than " << min << "!"; + } else { + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be in [" << min << "," << max << "]!"; + } + return false; + } + return true; + } + + bool CheckParamScopeList(std::vector> paramCheckList) + { + for (auto &param : paramCheckList) { + auto name = std::get<0>(param); + auto value = std::get<1>(param); + auto min = std::get<2>(param); + auto max = std::get<3>(param); + if (value == INPUT_PARAM_DEFAULT_VALUE) { + continue; + } + if (!CheckParamScope(name, value, min, max)) { + return false; + } + } + return true; + } + + bool CheckParamAlign(const std::string &name, const int &value, const int &align) + { + if (value % align != 0) { + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be aligned by " << align << "!"; + return false; + } + return true; + } + + void PrintErrorLog(LcalType lcalType, const std::string &log) + { + MKI_LOG(ERROR) << "[" + LCAL_TYPE2NAME.at(lcalType) + "]: " << log; + } + + bool CheckParamPowerOfTwo(const std::string &name, int value) + { + if (value <= 0) { + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be greater than zero!"; + return false; + } + if ((static_cast(value) & (static_cast(value) - 1)) != 0) { + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be a power of two!"; + return false; + } + return true; + } + + int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, + int nElemAlign) + { + int64_t nRow = transpose ? n : m; + int64_t nCol = transpose ? m : n; + int64_t nColAlign = (nCol + nElemAlign - 1) / nElemAlign * nElemAlign; + return batchSize * nRow * nColAlign; + } + } \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp new file mode 100644 index 00000000..0204881d --- /dev/null +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include <map> +#include <vector> +#include "tiling.h" +#include "tiling_func.h" +#include "lcoc_func.h" + +#define TILING_MAP std::map<int32_t, std::vector<std::vector<int64_t>>> +namespace Lcal { +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT = 11; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_UBMOVENUM_DEFAULT = 40; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_LENPERLOOPMULT_DEFAULT = 400; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMNPUSPLIT_DEFAULT = 8; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT = 1; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 12; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 4; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 1; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 8; + +// 821 +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_SWIZZLECOUNT_DEFAULT = 5; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_UBMOVENUM_DEFAULT = 60; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_LENPERLOOPMULT_DEFAULT = 400; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMNPUSPLIT_DEFAULT = 8; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDATASPLIT_DEFAULT = 1; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDIRECT_DEFAULT = 1; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT = 20; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap = {}; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 1; + +constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 8; + +// 281 +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAULT = 11; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_UBMOVENUM_DEFAULT = 10; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_LENPERLOOPMULT_DEFAULT = 400; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap = {}; + +constexpr 
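+// Tuned defaults per (agDim, rsDim, innerDimIsAg) configuration. Each g_*Map,
+// where non-empty, overrides the matching default for specific (m, k, n) ranges
+// via GetValueFromMKNConditionMap (see SetTilingParam in tiling_func.cpp).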
int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT = 1; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDATASPLIT_DEFAULT = 8; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT = 20; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 8; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 2; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap = {}; + +// 280 +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFAULT = 9; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_UBMOVENUM_DEFAULT = 40; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_LENPERLOOPMULT_DEFAULT = 400; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMNPUSPLIT_DEFAULT = 1; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDATASPLIT_DEFAULT = 8; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAULT = 0; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16CommdirectMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 60; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtralenperloopmultMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 8; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap = {}; + +constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 1; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap = {}; + +const int PVALE_ONE = 1; +const int M0_DEFAULT = 128; +const int K0_DEFAULT = 256; +const int N0_DEFAULT = 256; +const int SWIZZLEDIRECT_ONE = 1; + +void AG8RS2FalseFP16Tiling(CoCTilingData &cocTilingData) +{ + std::map tilingParamMap = { + {&cocTilingData.swizzlCount, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_UBMOVENUM_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap}}, + {&cocTilingData.lenPerLoop, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_LENPERLOOPMULT_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap}}, + {&cocTilingData.commNpuSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMNPUSPLIT_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap}}, + {&cocTilingData.extraUbMoveNum, + 
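+ // Each map entry pairs a tiling field with {default value, optional (m,k,n)
+ // condition map}; entries without a map always take the default.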
{ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap}}, + {&cocTilingData.extraLenPerLoop, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT, + g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap}}, + {&cocTilingData.extraCommNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, + {&cocTilingData.extraCommDataSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; + SetTilingParam2D(cocTilingData, tilingParamMap); + return; +} + +void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) +{ + std::map tilingParamMap = { + {&cocTilingData.swizzlCount, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_SWIZZLECOUNT_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_UBMOVENUM_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap}}, + {&cocTilingData.lenPerLoop, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_LENPERLOOPMULT_DEFAULT}}, + {&cocTilingData.commNpuSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMNPUSPLIT_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDATASPLIT_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap}}, + {&cocTilingData.commDirect, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDIRECT_DEFAULT}}, + {&cocTilingData.extraUbMoveNum, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap}}, + {&cocTilingData.extraLenPerLoop, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, + g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap}}, + {&cocTilingData.extraCommNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, + {&cocTilingData.extraCommDataSplit, + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; + SetTilingParam2D(cocTilingData, tilingParamMap); + return; +} + +void AG2RS8TrueFP16Tiling(CoCTilingData &cocTilingData) +{ + std::map tilingParamMap = { + {&cocTilingData.swizzlCount, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAULT, + g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_UBMOVENUM_DEFAULT, + g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap}}, + {&cocTilingData.lenPerLoop, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_LENPERLOOPMULT_DEFAULT, + g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap}}, + {&cocTilingData.commNpuSplit, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDATASPLIT_DEFAULT}}, + {&cocTilingData.commDirect, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDIRECT_DEFAULT}}, + {&cocTilingData.extraUbMoveNum, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT, 
+ g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap}}, + {&cocTilingData.extraLenPerLoop, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, + g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap}}, + {&cocTilingData.extraCommNpuSplit, + {DIM_EIGHT, g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap}}, + {&cocTilingData.extraCommDataSplit, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT, + g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap}}}; + SetTilingParam2D(cocTilingData, tilingParamMap); + return; +} + +void AG2RS8FalseFP16Tiling(CoCTilingData &cocTilingData) +{ + std::map tilingParamMap = { + {&cocTilingData.swizzlCount, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_UBMOVENUM_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap}}, + {&cocTilingData.lenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_LENPERLOOPMULT_DEFAULT}}, + {&cocTilingData.commNpuSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMNPUSPLIT_DEFAULT}}, + {&cocTilingData.commDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDATASPLIT_DEFAULT}}, + {&cocTilingData.commDirect, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16CommdirectMap}}, + {&cocTilingData.extraUbMoveNum, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap}}, + {&cocTilingData.extraLenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT}}, + {&cocTilingData.extraCommNpuSplit, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap}}, + {&cocTilingData.extraCommDataSplit, + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT, + g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap}}}; + SetTilingParam2D(cocTilingData, tilingParamMap); + return; +} + +void CoCAllgatherMatmulReduceScatterTilingFunc::GetDefaultTiling(const TaskParam &taskParam) +{ + CoCTilingFunc::GetDefaultTiling(taskParam); + + cocTilingData.swizzlDirect = SWIZZLEDIRECT_ONE; + + cocTilingData.m0 = M0_DEFAULT; + cocTilingData.k0 = K0_DEFAULT; + cocTilingData.n0 = N0_DEFAULT; + + cocTilingData.withSerialMode = 0; + cocTilingData.is91093 = 0; + cocTilingData.pValue = PVALE_ONE; + cocTilingData.commDirect = 0; + + auto rsDim = taskParam.cocParamDesc.twoDimTPInfo.rsDim; + auto agDim = taskParam.cocParamDesc.twoDimTPInfo.agDim; + auto innerDimIsAg = taskParam.cocParamDesc.twoDimTPInfo.innerDimIsAg; + if (agDim == DIM_EIGHT && rsDim == DIM_TWO && !innerDimIsAg) { + AG8RS2FalseFP16Tiling(cocTilingData); + } else if (agDim == DIM_EIGHT && rsDim == DIM_TWO && innerDimIsAg) { + AG8RS2TrueFP16Tiling(cocTilingData); + } else if (agDim == DIM_TWO && rsDim == DIM_EIGHT && innerDimIsAg) { + AG2RS8TrueFP16Tiling(cocTilingData); + } else { + AG2RS8FalseFP16Tiling(cocTilingData); + } + cocTilingData.commNpuSplit = std::min(cocTilingData.commNpuSplit, agDim); + cocTilingData.extraCommNpuSplit = std::min(cocTilingData.extraCommNpuSplit, rsDim); +} + +bool CoCAllgatherMatmulReduceScatterTilingFunc::CheckTiling(const TaskParam &taskParam) +{ + if (!CoCTilingFunc::CheckTiling(taskParam)) { + return false; + } + + auto commNpuSplit = 
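+ // Core budget: the cores claimed by the two communication stages,
+ // commNpuSplit * commDataSplit + extraCommNpuSplit * extraCommDataSplit,
+ // must fit within blockDim; this is validated via CheckParamScopeList below.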
cocTilingData.commNpuSplit; + auto commDataSplit = cocTilingData.commDataSplit; + auto extraCommNpuSplit = cocTilingData.extraCommNpuSplit; + auto extraCommDataSplit = cocTilingData.extraCommDataSplit; + auto coreNum = cocTilingData.blockDim; + auto useCoreCount = commNpuSplit * commDataSplit + extraCommNpuSplit * extraCommDataSplit; + + const int maxMValue = 200000; + const int maxNValue = 32768; + const int maxKValue = 32768; + std::vector> paramCheckList = { + {"m", cocTilingData.m, PARAM_CHECK_MIN_VALUE_ONE, maxMValue}, + {"k", cocTilingData.k, PARAM_CHECK_MIN_VALUE_ONE, maxKValue}, + {"n", cocTilingData.n, PARAM_CHECK_MIN_VALUE_ONE, maxNValue}, + {"commNpuSplit * commDataSplit + extraCommNpuSplit * extraCommDataSplit", + useCoreCount, PARAM_CHECK_MIN_VALUE_ONE, coreNum}, + }; + return CheckParamScopeList(paramCheckList); +} +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/comm/lcal/src/tiling/tiling_func.cpp new file mode 100644 index 00000000..75623fcb --- /dev/null +++ b/comm/lcal/src/tiling/tiling_func.cpp @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + #include "lcoc_func.h" + #include "lcoc_args.h" + #include "tiling_args.h" + #include "tiling_func.h" + + namespace Lcal { + int32_t CeilDev(int32_t num, int32_t div) + { + if (div == 0) { + return 0; + } + return (num + div - 1) / div; + } + + int32_t RoundNum(int32_t num, int32_t rnd) + { + if (rnd == 0) { + return 0; + } + return (num + rnd - 1) / rnd * rnd; + } + + void UpdateTilingValue(const int32_t &tilingParam, int32_t &tilingDataParam) + { + if (tilingParam != INPUT_PARAM_DEFAULT_VALUE) { + tilingDataParam = tilingParam; + } + } + + double GetMTETime(double mknGB, int32_t m0, int32_t n0, double aBandWidth, double bBandWidth) + { + return DOUBLE * mknGB * (SECOND_TO_MS / ONE_K) * (1.0 / (n0 * aBandWidth) + 1.0 / (m0 * bBandWidth)); + } + + int32_t GetValueFromMKNConditionMap(int32_t m, int32_t k, int32_t n, + int32_t defaultValue, + std::map>> conditionMap) + { + int32_t value = defaultValue; + for (auto iter = conditionMap.cbegin(); iter != conditionMap.cend(); ++iter) { + for (auto &condition : iter->second) { + bool inRange = + m > condition[CONDITION_M_ST] && m <= condition[CONDITION_M_END] && + k > condition[CONDITION_K_ST] && k <= condition[CONDITION_K_END] && + n > condition[CONDITION_N_ST] && n <= condition[CONDITION_N_END]; + if (inRange) { + return iter->first; + } + } + } + return value; + } + + bool Is910B(const ChipName &chipName) + { + return chipName >= ChipName::CHIP_910B1 && chipName <= ChipName::CHIP_910B41; + } + + bool Is91093(const ChipName &chipName) + { + return chipName >= ChipName::CHIP_910_9391 && chipName <= ChipName::CHIP_910_9362; + } + + uint32_t GetTilingKey(const MatMulInfo &mmInfo, CoCTilingData &tilingData) + { + uint32_t tilingKey = static_cast(tilingData.swizzlDirect); + tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.transA); + tilingKey = (static_cast(tilingKey) << 
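+ // Key bit layout, LSB to MSB: splitk | withBias | isInt8 | transB | transA |
+ // swizzlDirect. The kernel-side dispatch masks the key with 0b011101, i.e. it
+ // selects on the transpose/int8 bits only; e.g. transB + int8 maps to 0b001100.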
1) + static_cast(mmInfo.transB); + tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.isInt8); + tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.withBias); + tilingKey = (static_cast(tilingKey) << 1) + static_cast(tilingData.splitk); + return tilingKey; + } + + void DealTilingParamByBuffSize(CoCTilingData &cocTilingData) + { + auto blockCount = (cocTilingData.is91093 != 0) ? BLOCK_COUNT_3 : MAX_BLOCK_COUNT; + int maxPeerMemPerRank = + (cocTilingData.bufferSize * 1024 * 1024) / INPUT_DTYPE / cocTilingData.rankSize / blockCount; + int maxPValue = maxPeerMemPerRank / cocTilingData.m0 / cocTilingData.k0 / cocTilingData.kLoop; + cocTilingData.pValue = ClampValue(cocTilingData.pValue, MIN_P_VALUE, maxPValue); + + if (cocTilingData.m0 == DEFAULT_COL + && cocTilingData.pValue * cocTilingData.m0 * cocTilingData.k0 * cocTilingData.kLoop >= maxPeerMemPerRank) { + cocTilingData.m0 = DEFAULT_ROW; + cocTilingData.n0 = DEFAULT_COL; + cocTilingData.mLoop = CeilDev(cocTilingData.m, cocTilingData.m0); + cocTilingData.nLoop = CeilDev(cocTilingData.n, cocTilingData.n0); + } + } + + int ClampValue(int32_t value, int32_t min, int32_t max) + { + return std::max(min, std::min(value, max)); + } + + void SetTilingParam(CoCTilingData &cocTilingData, const std::map& tilingParamMap) + { + int32_t m = cocTilingData.m; + int32_t k = cocTilingData.k; + int32_t n = cocTilingData.n; + + for (auto &item : tilingParamMap) { + auto value = item.second.value; + auto conditionMap = item.second.conditionMap; + if (!conditionMap.empty()) { + *item.first = GetValueFromMKNConditionMap(m, k, n, value, conditionMap); + } else if (value != -1) { + *item.first = value; + } + } + + cocTilingData.ubMoveNum = cocTilingData.ubMoveNum * HALF_KBYTE; + if (cocTilingData.m0 >= DEFAULT_ROW) { + cocTilingData.k0 = DEFAULT_COL; + cocTilingData.n0 = cocTilingData.m0 == DEFAULT_ROW ? 
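+ // Large-m0 fallback: when m0 is at least DEFAULT_ROW, pick the complementary
+ // n0 and k0 and rederive the loop counts so the base block keeps roughly the
+ // same footprint.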
DEFAULT_COL : DEFAULT_ROW; + cocTilingData.mLoop = CeilDev(cocTilingData.m, cocTilingData.m0); + cocTilingData.nLoop = CeilDev(cocTilingData.n, cocTilingData.n0); + cocTilingData.kLoop = CeilDev(cocTilingData.k, cocTilingData.k0); + } + } + + void SetSecondCoreSplitTling(CoCTilingData &cocTilingData) + { + cocTilingData.extraCommDirect = cocTilingData.commDirect; + cocTilingData.extraCommNpuSplit = cocTilingData.commNpuSplit; + cocTilingData.extraCommDataSplit = cocTilingData.commDataSplit; + cocTilingData.extraLenPerLoop = cocTilingData.lenPerLoop; + cocTilingData.extraUbMoveNum = cocTilingData.ubMoveNum; + } + + void SetTilingParam2D(CoCTilingData &cocTilingData, const std::map& tilingParamMap) + { + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.extraUbMoveNum = cocTilingData.extraUbMoveNum * HALF_KBYTE; + cocTilingData.lenPerLoop = cocTilingData.lenPerLoop * cocTilingData.ubMoveNum / DIV_TWO; + cocTilingData.extraLenPerLoop = cocTilingData.extraLenPerLoop * cocTilingData.extraUbMoveNum / DIV_TWO; + } + + std::map GetCoCTilingPowerOfTwoParamMap() + { + std::map powerOfTwoParamMap = { + {"commDataSplit", true}, + {"extraCommDataSplit", true} + }; + return powerOfTwoParamMap; + } + + std::map GetCoCTilingAlignParamMap() + { + std::map alignParamMap = { + {"m0", BLOCK_SIZE}, + {"n0", BLOCK_SIZE}, + {"k0", BLOCK_SIZE}, + {"ubMoveNum", HALF_KBYTE}, + {"lenPerLoop", HALF_KBYTE}, + {"extraUbMoveNum", HALF_KBYTE}, + {"extraLenPerLoop", HALF_KBYTE}, + }; + return alignParamMap; + } + + std::vector> GetCoCTingparamCheckList(const CoCTiling &tiling) + { + std::vector> paramCheckList = { + {"m0", tiling.m0, BLOCK_SIZE, CUBE_BLOCK_SIZE}, + {"n0", tiling.n0, BLOCK_SIZE, CUBE_BLOCK_SIZE}, + {"k0", tiling.k0, CUBE_BLOCK_SIZE, AXES_ALIGN_SIZE}, + {"swizzlCount", tiling.swizzlCount, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"swizzleDirect", tiling.swizzlDirect, SWIZZLE_DIRECT_ZERO, SWIZZLE_DIRECT_ONE}, + {"ubMoveNum", tiling.ubMoveNum, HALF_KBYTE, MAX_UB_NUM}, + {"commNpuSplit", tiling.commNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"commDataSplit", tiling.commDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"commDirect", tiling.commDirect, COMM_DATA_DIRECT, COMM_NPU_DIRECT}, + {"lenPerLoop", tiling.lenPerLoop, HALF_KBYTE, PARA_CHECK_MAX_VALUE}, + {"extraUbMoveNum", tiling.extraUbMoveNum, HALF_KBYTE, MAX_UB_NUM}, + {"extraCommNpuSplit", tiling.extraCommNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"extraCommDataSplit", tiling.extraCommDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"extraCommDirect", tiling.extraCommDirect, COMM_DATA_DIRECT, COMM_NPU_DIRECT}, + {"extraLenPerLoop", tiling.extraLenPerLoop, HALF_KBYTE, PARA_CHECK_MAX_VALUE}, + {"splitK", tiling.splitK, PARAM_CHECK_MIN_VALUE_ZERO, PARA_CHECK_MAX_VALUE}, + {"write2OtherRank", tiling.write2OtherRank, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE}, + {"withSerialMode", tiling.withSerialMode, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE}, + {"is91093", tiling.is91093, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE} + }; + return paramCheckList; + } + + bool CheckCoCTiling(const CoCTiling &tiling) + { + auto powerOfTwoParamMap = GetCoCTilingPowerOfTwoParamMap(); + auto alignParamMap = GetCoCTilingAlignParamMap(); + auto paramCheckList = GetCoCTingparamCheckList(tiling); + for (auto ¶m : paramCheckList) { + auto name = std::get<0>(param); + auto value = std::get<1>(param); + auto min = std::get<2>(param); + auto max = 
std::get<3>(param); + if (value == INPUT_PARAM_DEFAULT_VALUE) { + continue; + } + if (!CheckParamScope(name, value, min, max)) { + return false; + } + if (alignParamMap.find(name) != alignParamMap.end() + && !CheckParamAlign(name, value, alignParamMap[name])) { + return false; + } + if (powerOfTwoParamMap.find(name) != powerOfTwoParamMap.end() + && !CheckParamPowerOfTwo(name, value)) { + return false; + } + } + return true; + } + + bool CheckCoCTilingData(const CoCTilingData &tilingData) + { + if (!CheckCoCTiling(tilingData)) { + return false; + } + std::vector> paramCheckList = { + {"mLoop", tilingData.mLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"kLoop", tilingData.kLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"nLoop", tilingData.nLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"coreLoop", tilingData.coreLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"tilingKey", tilingData.tilingKey, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + }; + return CheckParamScopeList(paramCheckList); + } + + void TransformCoCTiling(const CoCTiling &tiling, CoCTilingData &tilingData) + { + int* tilingPtr = reinterpret_cast(const_cast(&tiling)); + int* tilingDataPtr = reinterpret_cast(&tilingData); + int length = sizeof(tiling) / sizeof(int32_t); + for (int i = 0; i < length; i++) { + UpdateTilingValue(tilingPtr[i], tilingDataPtr[i]); + } + } + + void CalTilingParam(const MatMulInfo &mmInfo, CoCTilingData &tilingData) + { + tilingData.mLoop = CeilDev(tilingData.m, tilingData.m0); + tilingData.kLoop = CeilDev(tilingData.k, tilingData.k0); + tilingData.nLoop = CeilDev(tilingData.n, tilingData.n0); + tilingData.coreLoop = tilingData.batchSize * tilingData.mLoop * tilingData.nLoop; + tilingData.tilingKey = GetTilingKey(mmInfo, tilingData); + + tilingData.ubMoveNum = RoundNum(tilingData.ubMoveNum, HALF_KBYTE); + tilingData.lenPerLoop = RoundNum(tilingData.lenPerLoop, HALF_KBYTE); + tilingData.extraUbMoveNum = RoundNum(tilingData.extraUbMoveNum, HALF_KBYTE); + tilingData.extraLenPerLoop = RoundNum(tilingData.extraLenPerLoop, HALF_KBYTE); + } + + void SetTilingInputParam(const TaskParam &taskParam, CoCTilingData &tilingData) + { + tilingData.m = taskParam.cocParamDesc.mmInfo.m; + tilingData.n = taskParam.cocParamDesc.mmInfo.n; + tilingData.k = taskParam.cocParamDesc.mmInfo.k; + tilingData.batchSize = taskParam.cocParamDesc.mmInfo.batchSize; + tilingData.blockDim = taskParam.blockDim; + tilingData.rank = taskParam.rank; + tilingData.rankSize = taskParam.rankSize; + tilingData.bufferSize = taskParam.bufferSize; + } + + void SetTilingData(const TaskParam &taskParam, const CoCTiling &tiling, CoCTilingData &tilingData) + { + TransformCoCTiling(tiling, tilingData); + CalTilingParam(taskParam.cocParamDesc.mmInfo, tilingData); + } + } \ No newline at end of file -- Gitee From 7ad0fca247d3975da357540b23b3a37d0ffd8eaf Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 17:04:10 +0800 Subject: [PATCH 328/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index ed8f333c..e6912b34 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -40,6 +40,7 @@ namespace Lcal { constexpr int32_t ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::vector g_allreduceUbmovenumCoef = { + }; static std::vector g_allreducePvalueCoef = { -- Gitee From 
3906f9d4be8965a630bee3367281a866a45ea5f9 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 17:14:31 +0800 Subject: [PATCH 329/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index e6912b34..5801ee5a 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -40,22 +40,67 @@ namespace Lcal { constexpr int32_t ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::vector g_allreduceUbmovenumCoef = { - + {-1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, + 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, + -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, + 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484} }; static std::vector g_allreducePvalueCoef = { + {-4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, + 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, + -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, + 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263} }; static std::map>> g_allreduceFourRankInT8M0Map = { + {256, {{-1, 3072, -1, 2147483647, -1, 768}}} }; static std::map>> g_allreduceFourRankInT8DatasplitMap = { + {1, {{-1, 768, -1, 2147483647, -1, 768}}}, + {2, {{-1, 1536, -1, 2147483647, 768, 1536}}}, + {4, {{768, 10010, -1, 2147483647, -1, 768}}}, + {8, + {{1536, 10010, -1, 2147483647, 768, 1536}, + {10010, 2147483647, 3072, 2147483647, -1, 768}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, + {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {16, + {{10010, 2147483647, 3072, 2147483647, 768, 1536}, + {19680, 2147483647, 7170, 2147483647, 1536, 7424}, + {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}}}, + {32, {{10010, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} }; static std::map>> g_allreduceFourRankInT8PvalueMap = { + {1, {{-1, 10010, -1, 2147483647, -1, 768}, {-1, 5148, -1, 2147483647, 768, 1536}}}, + {2, + {{5148, 10010, -1, 2147483647, 768, 1536}, + {10010, 2147483647, 3072, 2147483647, -1, 768}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, + {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {4, + {{19680, 2147483647, 7170, 2147483647, 1536, 7424}, + {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}, + {10010, 2147483647, 3072, 2147483647, 768, 1536}}}, + {6, {{10010, 36980, -1, 3072, -1, 1536}}}, + {8, {{36980, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} }; static std::map>> g_allreduceFourRankInT8UbmovenumMap = { + {20, {{53340, 2147483647, 7170, 2147483647, -1, 3072}}}, + {30, + {{-1, 53340, -1, 2147483647, -1, 3072}, + {53340, 2147483647, -1, 4608, -1, 768}, + {53340, 2147483647, 4608, 7170, -1, 3072}, + {-1, 7196, -1, 2147483647, 3072, 2147483647}, + {10010, 15412, -1, 2147483647, 3072, 2147483647}, + {15412, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, + {40, + {{53340, 2147483647, -1, 4608, 768, 3072}, + {7196, 10010, -1, 2147483647, 3072, 2147483647}, + {15412, 2147483647, -1, 5634, 3072, 2147483647}}} }; static std::map>> g_allreduceFourRankFP16M0Map = { -- Gitee From 
05ca99929e9839d4f55264e4b5086509ddb4477b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 27 Aug 2025 17:28:49 +0800 Subject: [PATCH 330/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 5801ee5a..029a8ed0 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -40,67 +40,66 @@ namespace Lcal { constexpr int32_t ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::vector g_allreduceUbmovenumCoef = { - {-1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, - 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, - -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, - 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484} + { -1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, + 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, + -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, + 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } }; static std::vector g_allreducePvalueCoef = { - {-4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, - 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, - -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, - 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263} + { -4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, + 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, + -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, + 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263 } }; static std::map>> g_allreduceFourRankInT8M0Map = { - {256, {{-1, 3072, -1, 2147483647, -1, 768}}} + {256, + {{-1, 3072, -1, 2147483647, -1, 768}}} }; static std::map>> g_allreduceFourRankInT8DatasplitMap = { - {1, {{-1, 768, -1, 2147483647, -1, 768}}}, - {2, {{-1, 1536, -1, 2147483647, 768, 1536}}}, - {4, {{768, 10010, -1, 2147483647, -1, 768}}}, + {1, + {{-1, 768, -1, 2147483647, -1, 768}}}, + {2, + {{-1, 1536, -1, 2147483647, 768, 1536}}}, + {4, + {{768, 10010, -1, 2147483647, -1, 768}}}, {8, - {{1536, 10010, -1, 2147483647, 768, 1536}, - {10010, 2147483647, 3072, 2147483647, -1, 768}, - {-1, 19680, 7170, 2147483647, 1536, 7424}, - {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {{1536, 10010, -1, 2147483647, 768, 1536}, {10010, 2147483647, 3072, 2147483647, -1, 768}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, {16, - {{10010, 2147483647, 3072, 2147483647, 768, 1536}, - {19680, 2147483647, 7170, 2147483647, 1536, 7424}, - {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}}}, - {32, {{10010, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} + {{10010, 2147483647, 3072, 2147483647, 768, 1536}, {19680, 2147483647, 7170, 2147483647, 1536, 7424}, + {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}}}, + {32, + 
{{10010, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} }; static std::map>> g_allreduceFourRankInT8PvalueMap = { - {1, {{-1, 10010, -1, 2147483647, -1, 768}, {-1, 5148, -1, 2147483647, 768, 1536}}}, + {1, + {{-1, 10010, -1, 2147483647, -1, 768}, {-1, 5148, -1, 2147483647, 768, 1536}}}, {2, - {{5148, 10010, -1, 2147483647, 768, 1536}, - {10010, 2147483647, 3072, 2147483647, -1, 768}, - {-1, 19680, 7170, 2147483647, 1536, 7424}, - {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {{5148, 10010, -1, 2147483647, 768, 1536}, {10010, 2147483647, 3072, 2147483647, -1, 768}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, {4, - {{19680, 2147483647, 7170, 2147483647, 1536, 7424}, - {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}, - {10010, 2147483647, 3072, 2147483647, 768, 1536}}}, - {6, {{10010, 36980, -1, 3072, -1, 1536}}}, - {8, {{36980, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} + {{19680, 2147483647, 7170, 2147483647, 1536, 7424}, {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}, + {10010, 2147483647, 3072, 2147483647, 768, 1536}}}, + {6, + {{10010, 36980, -1, 3072, -1, 1536}}}, + {8, + {{36980, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} }; static std::map>> g_allreduceFourRankInT8UbmovenumMap = { - {20, {{53340, 2147483647, 7170, 2147483647, -1, 3072}}}, + {20, + {{53340, 2147483647, 7170, 2147483647, -1, 3072}}}, {30, - {{-1, 53340, -1, 2147483647, -1, 3072}, - {53340, 2147483647, -1, 4608, -1, 768}, - {53340, 2147483647, 4608, 7170, -1, 3072}, - {-1, 7196, -1, 2147483647, 3072, 2147483647}, - {10010, 15412, -1, 2147483647, 3072, 2147483647}, - {15412, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, + {{-1, 53340, -1, 2147483647, -1, 3072}, {53340, 2147483647, -1, 4608, -1, 768}, + {53340, 2147483647, 4608, 7170, -1, 3072}, {-1, 7196, -1, 2147483647, 3072, 2147483647}, + {10010, 15412, -1, 2147483647, 3072, 2147483647}, {15412, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, {40, - {{53340, 2147483647, -1, 4608, 768, 3072}, - {7196, 10010, -1, 2147483647, 3072, 2147483647}, - {15412, 2147483647, -1, 5634, 3072, 2147483647}}} + {{53340, 2147483647, -1, 4608, 768, 3072}, {7196, 10010, -1, 2147483647, 3072, 2147483647}, + {15412, 2147483647, -1, 5634, 3072, 2147483647}}} }; static std::map>> g_allreduceFourRankFP16M0Map = { -- Gitee From c27fd6a742538c0b09127cd2e5aa9f839188124d Mon Sep 17 00:00:00 2001 From: guanguan Date: Wed, 27 Aug 2025 17:48:09 +0800 Subject: [PATCH 331/414] fix --- .../tiling/allgather_reducescatter_tiling.cpp | 156 +++++++++++++++--- 1 file changed, 129 insertions(+), 27 deletions(-) diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp index 0204881d..ae92b66a 100644 --- a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -16,25 +16,54 @@ #define TILING_MAP std::>> namespace Lcal { constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT = 11; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap = { + {9, + {{768, 1536, -1, 2147483647, -1, 7168}, + {1536, 3072, -1, 5120, -1, 14848}, + {1536, 2147483647, 5120, 2147483647, -1, 10752}}}, + {14, {{768, 1536, -1, 5120, 10752, 2147483647}, {1536, 2147483647, -1, 5120, 14848, 
2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_UBMOVENUM_DEFAULT = 40; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap = { + {24, + {{768, 1536, -1, 2147483647, 3072, 10752}, + {1536, 3072, -1, 7168, 3072, 2147483647}, + {3072, 2147483647, -1, 7168, 3072, 2147483647}}}, + {30, {{3072, 2147483647, 7168, 2147483647, 3072, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_LENPERLOOPMULT_DEFAULT = 400; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap = { + {2, {{768, 1536, -1, 5120, -1, 3072}}}, + {4, {{3072, 2147483647, -1, 3072, -1, 2147483647}, {3072, 2147483647, 14848, 2147483647, 3072, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMNPUSPLIT_DEFAULT = 8; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap = { + {1, + {{768, 1536, 5120, 2147483647, -1, 3072}, + {1536, 3072, 14848, 2147483647, -1, 7168}, + {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT = 1; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap = { + {8, + {{768, 1536, 5120, 2147483647, -1, 3072}, + {1536, 3072, 14848, 2147483647, -1, 7168}, + {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 12; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap = { + {10, + {{-1, 768, -1, 2147483647, 5120, 10752}, + {768, 1536, -1, 2147483647, 5120, 2147483647}, + {1536, 2147483647, -1, 10752, 5120, 2147483647}, + {1536, 2147483647, 10752, 14848, -1, 10752}}}, + {20, {{1536, 2147483647, 14848, 2147483647, 10752, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 4; -static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap = { + {1, {{3072, 2147483647, -1, 3072, 10752, 2147483647}, {1536, 2147483647, 3072, 7168, -1, 2147483647}}}, + {2, {{768, 1536, 5120, 2147483647, 5120, 2147483647}, {1536, 2147483647, 7168, 10752, -1, 2147483647}}}, + {400, {{3072, 2147483647, 10752, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 1; @@ -42,26 +71,42 @@ constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMDATASPLI // 821 constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_SWIZZLECOUNT_DEFAULT = 5; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap = { + {9, {{192, 768, -1, 2147483647, -1, 12800}}}, + {17, {{768, 2147483647, 7168, 2147483647, -1, 7936}}}, + {13, + {{-1, 192, 5120, 2147483647, -1, 12800}, + {-1, 192, -1, 2147483647, 15360, 2147483647}, + {768, 2147483647, -1, 7168, -1, 9088}}}}; constexpr int32_t 
ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_UBMOVENUM_DEFAULT = 60; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap = { + {30, {{384, 2147483647, -1, 5120, 3968, 2147483647}, {768, 2147483647, 7168, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_LENPERLOOPMULT_DEFAULT = 400; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMNPUSPLIT_DEFAULT = 8; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap = { + {2, {{-1, 192, 7168, 2147483647, -1, 4608}, {192, 2147483647, 5120, 7168, -1, 4544}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDATASPLIT_DEFAULT = 1; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap = { + {4, {{-1, 192, 7168, 2147483647, -1, 4608}, {192, 2147483647, 5120, 7168, -1, 4544}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDIRECT_DEFAULT = 1; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT = 20; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap = { + {60, {{-1, 192, -1, 5120, -1, 6912}, {-1, 192, -1, 2147483647, 10368, 2147483647}}}, + {40, + {{-1, 192, 5120, 2147483647, -1, 6912}, + {-1, 192, -1, 2147483647, 6912, 10368}, + {192, 384, -1, 2147483647, 1600, 4608}}}, + {30, {{192, 384, -1, 2147483647, 4608, 2147483647}, {768, 2147483647, -1, 5120, -1, 3968}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; -static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap = {}; +static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap = { + {4, {{384, 2147483647, -1, 5120, -1, 3968}, {384, 2147483647, 7168, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 1; @@ -69,36 +114,79 @@ constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMDATASPLIT // 281 constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAULT = 11; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap = { + {9, + {{3072, 6144, -1, 2147483647, -1, 10752}, + {12288, 2147483647, -1, 7168, -1, 10752}, + {12288, 2147483647, 10752, 2147483647, -1, 5120}}}, + {14, + {{-1, 3072, -1, 7168, -1, 14848}, + {-1, 3072, -1, 10752, 14848, 2147483647}, + {12288, 2147483647, 7168, 10752, -1, 5120}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_UBMOVENUM_DEFAULT = 10; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap = { + {14, {{3072, 6144, 14848, 2147483647, -1, 3072}, {6144, 12288, 14848, 2147483647, 3072, 2147483647}}}, + {24, {{12288, 2147483647, 10752, 14848, -1, 2147483647}}}, + {32, {{-1, 6144, -1, 2147483647, 10752, 2147483647}, {12288, 2147483647, -1, 10752, -1, 2147483647}}}, + {40, {{3072, 6144, -1, 2147483647, 10752, 2147483647}, {6144, 12288, -1, 14848, 3072, 2147483647}}}}; constexpr int32_t 
ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_LENPERLOOPMULT_DEFAULT = 400; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap = { + {4, + {{3072, 6144, -1, 2147483647, 3072, 10752}, + {12288, 2147483647, -1, 10752, -1, 3072}, + {6144, 2147483647, -1, 2147483647, 3072, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT = 1; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDATASPLIT_DEFAULT = 8; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT = 20; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap = { + {8, {{6144, 2147483647, 3072, 5120, 7168, 2147483647}}}, + {10, {{6144, 2147483647, 3072, 5120, -1, 7168}, {6144, 2147483647, 5120, 14848, -1, 2147483647}}}, + {12, {{3072, 6144, 3072, 2147483647, 14848, 2147483647}}}, + {15, {{-1, 3072, 3072, 2147483647, -1, 10752}, {3072, 6144, -1, 2147483647, -1, 5120}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap = { + {4, + {{-1, 3072, -1, 10752, 14848, 2147483647}, + {12288, 2147483647, 3072, 5120, -1, 2147483647}, + {6144, 2147483647, 5120, 7168, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 8; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap = { + {1, {{12288, 2147483647, 14848, 2147483647, -1, 3072}, {12288, 2147483647, 14848, 2147483647, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 2; -static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap = { + {8, {{12288, 2147483647, 14848, 2147483647, -1, 3072}, {12288, 2147483647, 14848, 2147483647, 5120, 2147483647}}}}; // 280 constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFAULT = 9; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap = { + {13, + {{-1, 768, 1280, 2147483647, -1, 7168}, + {1536, 3072, -1, 2147483647, -1, 7168}, + {3072, 2147483647, 5184, 2147483647, -1, 2147483647}}}, + {17, + {{-1, 768, -1, 2147483647, 7168, 2147483647}, + {3072, 2147483647, -1, 4544, 7168, 2147483647}, + {3072, 2147483647, 4544, 5184, -1, 2147483647}}}, + {5, + {{768, 1536, -1, 2147483647, 5120, 2147483647}, + {3072, 2147483647, -1, 4544, -1, 7168}, + {3072, 2147483647, 7680, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_UBMOVENUM_DEFAULT = 40; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap = { + {30, + {{-1, 768, 2176, 3840, -1, 5120}, {-1, 768, 2560, 2147483647, 5120, 7168}, + {-1, 768, -1, 7680, 7168, 2147483647}, {1536, 3072, -1, 6400, -1, 2147483647}}}, + {60, {{-1, 768, 7680, 2147483647, 7168, 2147483647}, {768, 1536, -1, 1280, -1, 7168}}}, + {20, 
{{768, 1536, -1, 4352, 7168, 2147483647}, {3072, 2147483647, -1, 6400, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_LENPERLOOPMULT_DEFAULT = 400; @@ -107,19 +195,33 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMNPUSPLIT_DEFA constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDATASPLIT_DEFAULT = 8; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAULT = 0; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16CommdirectMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16CommdirectMap = { + {1, + {{-1, 768, 3456, 2147483647, -1, 5120}, + {-1, 768, 2560, 2147483647, 5120, 7168}, + {-1, 768, 4352, 7680, 7168, 2147483647}, + {768, 1536, -1, 2147483647, -1, 7168}, + {1536, 3072, 1280, 2147483647, -1, 2147483647}, + {3072, 2147483647, -1, 7680, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 60; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap = { + {40, {{768, 2147483647, -1, 2176, -1, 5120}}}, + {30, + {{768, 1536, 2176, 2147483647, -1, 5120}, + {768, 1536, -1, 2147483647, 5120, 2147483647}, + {1536, 2147483647, -1, 1792, 5120, 2147483647}}}, + {20, {{1536, 2147483647, 2176, 2147483647, -1, 5120}, {1536, 2147483647, 1792, 2147483647, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtralenperloopmultMap = {}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 8; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap = { + {1, {{3072, 2147483647, 2176, 2147483647, -1, 5120}, {768, 2147483647, -1, 2147483647, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT = 1; -static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap = {}; +static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap = { + {8, {{3072, 2147483647, 2176, 2147483647, -1, 5120}, {768, 2147483647, -1, 2147483647, 5120, 2147483647}}}}; const int PVALE_ONE = 1; const int M0_DEFAULT = 128; -- Gitee From 3e7917b86aabf84160704b1f19591de65ff93135 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:35:22 +0800 Subject: [PATCH 332/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 119 ++++++++++++++++-- 1 file changed, 112 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 029a8ed0..1f909157 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -67,10 +67,10 @@ namespace Lcal { {{768, 10010, -1, 2147483647, -1, 768}}}, {8, {{1536, 10010, -1, 2147483647, 768, 1536}, {10010, 2147483647, 3072, 2147483647, -1, 768}, - {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, {16, {{10010, 2147483647, 3072, 2147483647, 768, 1536}, {19680, 2147483647, 7170, 2147483647, 1536, 7424}, - {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}}}, + {7350, 
2147483647, 7170, 2147483647, 7424, 2147483647}}}, {32, {{10010, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 7170, 1536, 2147483647}}} }; @@ -80,10 +80,10 @@ namespace Lcal { {{-1, 10010, -1, 2147483647, -1, 768}, {-1, 5148, -1, 2147483647, 768, 1536}}}, {2, {{5148, 10010, -1, 2147483647, 768, 1536}, {10010, 2147483647, 3072, 2147483647, -1, 768}, - {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, + {-1, 19680, 7170, 2147483647, 1536, 7424}, {-1, 7350, 7170, 2147483647, 7424, 2147483647}}}, {4, {{19680, 2147483647, 7170, 2147483647, 1536, 7424}, {7350, 2147483647, 7170, 2147483647, 7424, 2147483647}, - {10010, 2147483647, 3072, 2147483647, 768, 1536}}}, + {10010, 2147483647, 3072, 2147483647, 768, 1536}}}, {6, {{10010, 36980, -1, 3072, -1, 1536}}}, {8, @@ -95,35 +95,140 @@ namespace Lcal { {{53340, 2147483647, 7170, 2147483647, -1, 3072}}}, {30, {{-1, 53340, -1, 2147483647, -1, 3072}, {53340, 2147483647, -1, 4608, -1, 768}, - {53340, 2147483647, 4608, 7170, -1, 3072}, {-1, 7196, -1, 2147483647, 3072, 2147483647}, - {10010, 15412, -1, 2147483647, 3072, 2147483647}, {15412, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, + {53340, 2147483647, 4608, 7170, -1, 3072}, {-1, 7196, -1, 2147483647, 3072, 2147483647}, + {10010, 15412, -1, 2147483647, 3072, 2147483647}, {15412, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, {40, {{53340, 2147483647, -1, 4608, 768, 3072}, {7196, 10010, -1, 2147483647, 3072, 2147483647}, - {15412, 2147483647, -1, 5634, 3072, 2147483647}}} + {15412, 2147483647, -1, 5634, 3072, 2147483647}}} }; static std::map>> g_allreduceFourRankFP16M0Map = { + {256, + {{-1, 12980, -1, 2147483647, -1, 768}, {12980, 2147483647, -1, 5634, -1, 768}, + {-1, 63360, -1, 4608, 768, 2147483647}, {63360, 2147483647, -1, 4000, 768, 2147483647}, + {63360, 2147483647, 4000, 4608, 1536, 2147483647}, {-1, 5148, 4608, 11264, 768, 2147483647}, + {-1, 2560, 11264, 2147483647, 768, 2147483647}, {5148, 19680, 4608, 2147483647, 13312, 2147483647}}}, + {128, + {{12980, 2147483647, 5634, 2147483647, -1, 768}, {63360, 2147483647, 4000, 4608, 768, 1536}, + {2560, 5148, 11264, 2147483647, 768, 2147483647}, {5148, 19680, 4608, 2147483647, 768, 13312}, + {19680, 2147483647, 4608, 2147483647, 768, 2147483647}}} }; static std::map>> g_allreduceFourRankFP16UbmovenumMap = { + {20, + {{-1, 1536, -1, 2147483647, -1, 1536}, {-1, 1536, 7170, 2147483647, 1536, 19456}, + {1536, 2147483647, 5634, 2147483647, -1, 19456}, {-1, 2147483647, -1, 2147483647, 19456, 2147483647}}}, + {30.0, + {{-1, 1536, -1, 7170, 1536, 19456}, {1536, 2147483647, -1, 5634, -1, 19456}}} }; static std::map>> g_allreduceFourRankFP16PvalueMap = { + {2, + {{-1, 5148, -1, 1536, -1, 1536}, {-1, 5148, 1152, 4608, 3072, 5376}, + {5148, 31220, 3072, 2147483647, -1, 1536}, {-1, 3072, -1, 2147483647, 10240, 2147483647}, + {3072, 5148, 7170, 2147483647, 5376, 2147483647}, {13364, 142040, 7170, 2147483647, 5376, 7424}, + {142040, 2147483647, 7170, 2147483647, 5376, 2147483647}}}, + {1, + {{-1, 5148, 1536, 2147483647, -1, 3072}, {-1, 5148, 4608, 2147483647, 3072, 5376}, + {-1, 3072, -1, 2147483647, 5376, 10240}}}, + {8, + {{-1, 5148, -1, 1536, 1536, 3072}, {68160, 2147483647, -1, 3072, -1, 768}, + {16340, 2147483647, -1, 3072, 768, 5376}, {5148, 13364, -1, 2976, 5376, 2147483647}, + {13364, 2147483647, -1, 5634, 5376, 2147483647}, {13364, 2147483647, 5634, 7170, 10240, 2147483647}}}, + {4, + {{-1, 5148, -1, 1152, 3072, 5376}, {5148, 68160, -1, 3072, -1, 768}, + {5148, 16340, -1, 3072, 768, 5376}, 
{5148, 31220, 3072, 2147483647, 1536, 5376}, + {31220, 2147483647, 3072, 2147483647, -1, 5376}, {3072, 5148, -1, 7170, 5376, 2147483647}, + {5148, 13364, 2976, 2147483647, 5376, 2147483647}, {13364, 2147483647, 5634, 7170, 5376, 10240}, + {13364, 142040, 7170, 2147483647, 7424, 2147483647}}} }; static std::map>> g_allreduceFourRankFP16DatasplitMap = { + {8, + {{-1, 5148, -1, 3072, -1, 1536}, {-1, 5148, 1152, 4608, 3072, 5376}, + {5148, 68160, 3072, 2147483647, -1, 1536}, {-1, 3072, -1, 2147483647, 10240, 2147483647}, + {3072, 5148, 7170, 2147483647, 5376, 2147483647}, {13364, 142040, 7170, 2147483647, 5376, 7424}, + {142040, 2147483647, 7170, 2147483647, 5376, 2147483647}}}, + {4, + {{-1, 5148, 3072, 2147483647, -1, 1536}, {-1, 5148, 1536, 2147483647, 1536, 3072}, + {-1, 5148, 4608, 2147483647, 3072, 5376}, {-1, 3072, -1, 2147483647, 5376, 10240}}}, + {32, + {{-1, 5148, -1, 1536, 1536, 3072}, {68160, 2147483647, -1, 3072, -1, 768}, + {16340, 2147483647, -1, 3072, 768, 5376}, {5148, 13364, -1, 2976, 5376, 2147483647}, + {13364, 2147483647, -1, 5634, 5376, 2147483647}, {13364, 2147483647, 5634, 7170, 10240, 2147483647}}}, + {16, + {{-1, 5148, -1, 1152, 3072, 5376}, {5148, 68160, -1, 3072, -1, 768}, + {5148, 16340, -1, 3072, 768, 5376}, {5148, 68160, 3072, 2147483647, 1536, 5376}, + {68160, 2147483647, 3072, 2147483647, -1, 5376}, {3072, 5148, -1, 7170, 5376, 2147483647}, + {5148, 13364, 2976, 2147483647, 5376, 2147483647}, {13364, 2147483647, 5634, 7170, 5376, 10240}, + {13364, 142040, 7170, 2147483647, 7424, 2147483647}}} }; static std::map>> g_allreduceEightRankFP16M0Map = { + {128, + {{-1, 31220, -1, 2147483647, -1, 768}, {31220, 36980, 1280, 2147483647, -1, 768}, + {36980, 2147483647, -1, 2147483647, -1, 768}, {-1, 2147483647, -1, 2147483647, 768, 2147483647}}}, + {256, + {{31220, 36980, -1, 1280, -1, 768}}} }; static std::map>> g_allreduceEightRankFP16DatasplitMap = { + {1, + {{-1, 3072, -1, 2147483647, -1, 768}, {3072, 26880, 3072, 2147483647, -1, 768}, + {-1, 1536, -1, 2147483647, 768, 1536}, {1536, 26880, 4608, 2147483647, 768, 1536}, + {26880, 53340, 4608, 2147483647, -1, 768}, {26880, 53340, 3072, 2147483647, 768, 1536}, + {53340, 2147483647, 3072, 2147483647, -1, 1536}, {-1, 768, 4608, 2147483647, 1536, 2147483647}, + {768, 5148, 4608, 2147483647, 1536, 7424}}}, + {4, + {{3072, 26880, -1, 3072, -1, 768}, {-1, 22848, 2976, 4608, 1536, 2147483647}, + {23040, 2147483647, 4608, 7170, 1536, 2147483647}}}, + {8, + {{1536, 26880, -1, 4608, 768, 1536}, {26880, 53340, -1, 3072, 768, 1536}, + {53340, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 384, 3072, 10240}, + {3072, 2147483647, 384, 2976, 1536, 2147483647}, {22848, 2147483647, 2976, 4608, 1536, 2147483647}}}, + {2, + {{26880, 53340, -1, 4608, -1, 768}, {-1, 3072, 384, 2976, 1536, 2147483647}, + {768, 5148, 4608, 2147483647, 7424, 2147483647}, {5148, 23040, 4608, 7170, 1536, 2147483647}, + {5148, 2147483647, 7170, 2147483647, 1536, 2147483647}}}, + {16, + {{-1, 2147483647, -1, 384, 1536, 3072}, {-1, 2147483647, -1, 384, 10240, 2147483647}}} }; static std::map>> g_allreduceEightRankFP16UbmovenumMap = { + {100, + {{-1, 3072, -1, 2147483647, -1, 768}, {3072, 19680, -1, 3072, -1, 768}, + {-1, 3072, -1, 2147483647, 768, 1536}, {3072, 19680, -1, 3072, 768, 1536}, + {-1, 2147483647, 1792, 2976, 1536, 13312}}}, + {30, + {{3072, 19680, 3072, 2147483647, -1, 768}, {19680, 2147483647, -1, 3072, -1, 1536}, + {-1, 2147483647, -1, 1792, 1536, 13312}, {-1, 768, 2976, 2147483647, 5376, 13312}, + {-1, 768, -1, 2147483647, 13312, 2147483647}, 
{26880, 2147483647, -1, 3072, 13312, 2147483647}}}, + {20, + {{3072, 19680, 3072, 2147483647, 768, 1536}, {19680, 2147483647, 3072, 2147483647, -1, 1536}, + {-1, 2147483647, 2976, 2147483647, 1536, 5376}, {768, 2147483647, 2976, 2147483647, 5376, 13312}, + {768, 26880, -1, 2147483647, 13312, 2147483647}, {26880, 2147483647, 3072, 2147483647, 13312, 2147483647}}} }; static std::map>> g_allreduceEightRankFP16PvalueMap = { + {4, + {{-1, 768, -1, 2147483647, -1, 768}, {12980, 26880, -1, 3072, -1, 768}, + {-1, 15412, 2976, 4608, 1536, 2147483647}, {23040, 2147483647, 4608, 7170, 1536, 2147483647}}}, + {1, + {{768, 12980, -1, 2147483647, -1, 768}, {12980, 26880, 3072, 2147483647, -1, 768}, + {-1, 1536, -1, 2147483647, 768, 1536}, {1536, 26880, 4608, 2147483647, 768, 1536}, + {26880, 53340, 4608, 2147483647, -1, 768}, {26880, 53340, 3072, 2147483647, 768, 1536}, + {53340, 2147483647, 3072, 2147483647, -1, 1536}, {-1, 768, 4608, 2147483647, 1536, 2147483647}, + {768, 5148, 4608, 2147483647, 1536, 7424}}}, + {8, + {{1536, 26880, -1, 4608, 768, 1536}, {26880, 53340, -1, 3072, 768, 1536}, + {53340, 2147483647, -1, 3072, -1, 1536}, {-1, 2147483647, -1, 384, 3072, 10240}, + {3072, 2147483647, 384, 2976, 1536, 2147483647}, {15412, 2147483647, 2976, 4608, 1536, 2147483647}}}, + {2, + {{26880, 53340, -1, 4608, -1, 768}, {-1, 3072, 384, 2976, 1536, 2147483647}, + {768, 5148, 4608, 2147483647, 7424, 2147483647}, {5148, 23040, 4608, 7170, 1536, 2147483647}, + {5148, 2147483647, 7170, 2147483647, 1536, 2147483647}}}, + {14, + {{-1, 2147483647, -1, 384, 1536, 3072}, {-1, 2147483647, -1, 384, 10240, 2147483647}}} }; static std::map>> g_allreduceEightRankInT8M0Map = { -- Gitee From c002c354d670b0f9268b34d615b90e676f9cd8f0 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:37:30 +0800 Subject: [PATCH 333/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 1f909157..31a13c15 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -40,17 +40,17 @@ namespace Lcal { constexpr int32_t ALLREDUCE_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::vector g_allreduceUbmovenumCoef = { - { -1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, - 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, - -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, - 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } + { -1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, + 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, + -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, + 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } }; static std::vector g_allreducePvalueCoef = { - { -4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, - 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, - -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, - 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263 } + { 
-4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, + 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, + -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, + 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263 } }; static std::map>> g_allreduceFourRankInT8M0Map = { -- Gitee From ed6bcb8b74055e4f1fc54c9a9c85d691381379e8 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:38:55 +0800 Subject: [PATCH 334/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 31a13c15..de2c74d9 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -232,9 +232,32 @@ namespace Lcal { }; static std::map>> g_allreduceEightRankInT8M0Map = { + {128, + {{-1, 31220, -1, 2147483647, -1, 768}, {31220, 36980, 1280, 2147483647, -1, 768}, + {-1, 36980, -1, 2147483647, 768, 3072}, {36980, 2147483647, -1, 2147483647, -1, 3072}, + {-1, 2147483647, -1, 2147483647, 3072, 13312}, {-1, 1536, -1, 384, 13312, 2147483647}, + {5274, 2147483647, -1, 384, 13312, 2147483647}, {-1, 2147483647, 384, 2147483647, 13312, 2147483647}}}, + {256, + {{31220, 36980, -1, 1280, -1, 768}, {1536, 5274, -1, 384, 13312, 2147483647}}} }; static std::map>> g_allreduceEightRankInT8DatasplitMap = { + {1, + {{-1, 3072, -1, 2147483647, -1, 768}, {3072, 5148, 4608, 2147483647, -1, 768}, + {-1, 1536, -1, 2147483647, 768, 1536}, {3072, 5148, -1, 2147483647, 768, 1536}, + {5148, 2147483647, 5634, 2147483647, -1, 1536}, {-1, 2147483647, 11264, 2147483647, 1536, 5376}}}, + {4, + {{3072, 5148, -1, 4608, -1, 768}, {5148, 31220, -1, 3072, -1, 768}, + {5148, 2147483647, 3072, 4608, -1, 1536}, {-1, 2147483647, 5634, 11264, 1536, 5376}, + {34560, 2147483647, 5634, 2147483647, 5376, 7424}, {7196, 2147483647, 5634, 2147483647, 7424, 13312}}}, + {2, + {{1536, 3072, -1, 2147483647, 768, 1536}, {5148, 2147483647, 4608, 5634, -1, 1536}, + {-1, 34560, 5634, 2147483647, 5376, 7424}, {-1, 3072, -1, 2147483647, 7424, 2147483647}, + {3072, 7196, 5634, 2147483647, 7424, 2147483647}}}, + {8, + {{5148, 31220, -1, 3072, 768, 1536}, {31220, 2147483647, -1, 3072, -1, 1536}, + {-1, 2147483647, -1, 5634, 1536, 7424}, {3072, 7196, -1, 5634, 7424, 2147483647}, + {7196, 2147483647, -1, 5634, 7424, 13312}, {7196, 2147483647, -1, 2147483647, 13312, 2147483647}}} }; static std::map>> g_allreduceEightRankInT8PvalueMap = { -- Gitee From 73c4d7697dd1b1e26eda42f616940bcea041f95e Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:43:57 +0800 Subject: [PATCH 335/414] draft --- .../lcal/src/tiling/allreduce_tiling_910B.cpp | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index de2c74d9..a4ab17e7 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -261,27 +261,136 @@ namespace Lcal { }; static std::map>> g_allreduceEightRankInT8PvalueMap = { + {14, + {{-1, 1536, -1, 2147483647, -1, 768}, {10010, 12980, -1, 2147483647, -1, 768}, + {-1, 7350, -1, 1536, 768, 1536}, {-1, 768, 1536, 2147483647, 768, 1536}, + {-1, 768, -1, 1536, 7424, 2147483647}}}, + {1, + {{1536, 10010, -1, 2147483647, -1, 768}, 
{12980, 2147483647, 5634, 2147483647, -1, 1536}, + {-1, 2147483647, 11264, 2147483647, 1536, 5376}}}, + {10, + {{7350, 12980, -1, 1536, 768, 1536}}}, + {2, + {{768, 12980, 1536, 2147483647, 768, 1536}, {12980, 2147483647, 4608, 5634, -1, 1536}, + {-1, 34560, 5634, 2147483647, 5376, 7424}, {-1, 768, 1536, 2147483647, 7424, 2147483647}, + {768, 3072, -1, 2147483647, 7424, 19456}}}, + {4, + {{12980, 36980, -1, 3072, -1, 768}, {12980, 2147483647, 3072, 4608, -1, 1536}, + {-1, 2147483647, 5634, 11264, 1536, 5376}, {34560, 2147483647, 5634, 2147483647, 5376, 7424}, + {768, 3072, -1, 2147483647, 19456, 2147483647}, {3072, 2147483647, 5634, 2147483647, 7424, 2147483647}}}, + {8, + {{12980, 36980, -1, 3072, 768, 1536}, {36980, 2147483647, -1, 3072, -1, 1536}, + {-1, 2147483647, -1, 5634, 1536, 7424}, {3072, 2147483647, -1, 5634, 7424, 2147483647}}} }; static std::map>> g_allreduceEightRankInT8UbmovenumMap = { + {80, + {{-1, 7350, -1, 3072, -1, 768}}}, + {100, + {{-1, 7350, 3072, 2147483647, -1, 768}, {-1, 7350, -1, 7170, 768, 3072}, + {-1, 7350, -1, 4608, 3072, 5376}, {7350, 2147483647, -1, 3072, -1, 5376}, + {-1, 768, -1, 2147483647, 5376, 10240}, {768, 1536, -1, 4608, 5376, 2147483647}, + {3072, 2147483647, -1, 2976, 5376, 2147483647}}}, + {30, + {{-1, 7350, 7170, 2147483647, 768, 3072}, {-1, 3072, 7170, 2147483647, 3072, 5376}, + {7350, 23040, 3072, 2147483647, 768, 5376}, {23040, 2147483647, 3072, 2147483647, -1, 5376}, + {-1, 768, -1, 2147483647, 13312, 2147483647}, {768, 1536, 4608, 2147483647, 5376, 2147483647}, + {3072, 120832, 2976, 2147483647, 5376, 13312}, {3072, 2147483647, 2976, 4608, 13312, 2147483647}}}, + {50, + {{-1, 7350, 4608, 7170, 3072, 5376}, {-1, 768, -1, 2147483647, 10240, 13312}}}, + {20, + {{3072, 7350, 7170, 2147483647, 3072, 5376}, {1536, 3072, 7170, 2147483647, 5376, 2147483647}, + {120832, 2147483647, 2976, 2147483647, 5376, 13312}, + {3072, 2147483647, 4608, 2147483647, 13312, 2147483647}}}, + {40, + {{7350, 23040, 3072, 2147483647, -1, 768}, {1536, 3072, -1, 7170, 5376, 2147483647}}} }; static std::map>> g_allreduceTwoRankFP16CommdatasplitMap = { + {16, + {{-1, 6656, -1, 2147483647, -1, 1536}, {6656, 2147483647, -1, 19456, -1, 1536}, + {7680, 2147483647, 19456, 2147483647, -1, 1536}, {-1, 2147483647, -1, 2147483647, 1536, 2147483647}}}, + {4, + {{6656, 7680, 19456, 2147483647, -1, 1536}}} }; static std::map>> g_allreduceTwoRankFP16UbmovenumMap = { + {2, + {{-1, 1536, -1, 3072, -1, 1536}, {-1, 1536, 15360, 2147483647, -1, 1536}, + {1536, 6656, -1, 2147483647, -1, 1536}, {6656, 2147483647, -1, 19456, -1, 1536}, + {7680, 2147483647, 19456, 2147483647, -1, 1536}, {-1, 2147483647, -1, 2147483647, 1536, 2147483647}}}, + {3, + {{-1, 1536, 3072, 15360, -1, 1536}}}, + {6, + {{6656, 7680, 19456, 2147483647, -1, 1536}}} }; static std::map>> g_allreduceTwoRankFP16SwizzldirectMap = { + {1, + {{-1, 6656, -1, 2147483647, -1, 7680}, {6656, 35840, -1, 13312, -1, 7680}, + {35840, 2147483647, -1, 2147483647, -1, 7680}, {-1, 25600, -1, 2147483647, 7680, 2147483647}, + {25600, 2147483647, -1, 2147483647, 7680, 9216}, {25600, 2147483647, -1, 15360, 9216, 11264}, + {25600, 2147483647, -1, 2147483647, 11264, 2147483647}}}, + {0, + {{6656, 35840, 13312, 2147483647, -1, 7680}, {25600, 2147483647, 15360, 2147483647, 9216, 11264}}} }; static std::map>> g_allreduceTwoRankFP16SwizzlcountMap = { + {4, + {{-1, 5632, -1, 2147483647, -1, 1536}, {5632, 7680, -1, 17408, -1, 1536}, + {7680, 9216, -1, 11264, -1, 1536}, {9216, 2147483647, -1, 19456, -1, 1536}, + {19456, 2147483647, 19456, 2147483647, -1, 
1536}, {-1, 2147483647, -1, 11264, 1536, 13312}, + {-1, 2147483647, 11264, 15360, 4608, 13312}, {-1, 2147483647, 17408, 2147483647, 1536, 13312}, + {-1, 9216, -1, 15360, 13312, 2147483647}, {-1, 9216, 17408, 2147483647, 13312, 2147483647}, + {9216, 25600, -1, 11264, 13312, 2147483647}, {25600, 35840, -1, 13312, 13312, 2147483647}, + {35840, 2147483647, -1, 2147483647, 13312, 2147483647}}}, + {8, + {{5632, 7680, 17408, 19456, -1, 1536}, {5632, 19456, 19456, 2147483647, -1, 1536}, + {-1, 2147483647, 11264, 17408, 1536, 4608}, {-1, 2147483647, 15360, 17408, 4608, 13312}, + {-1, 9216, 15360, 17408, 13312, 2147483647}, {9216, 25600, 11264, 2147483647, 13312, 2147483647}}}, + {16, + {{7680, 9216, 11264, 19456, -1, 1536}, {25600, 35840, 13312, 2147483647, 13312, 2147483647}}} }; static std::map>> g_allreduceTwoRankFP16M0Map = { + {128, + {{-1, 6656, -1, 2147483647, -1, 7680}, {6656, 2147483647, -1, 13312, -1, 7680}, + {-1, 1536, -1, 7680, 7680, 11264}, {-1, 1536, -1, 6656, 11264, 2147483647}, + {1536, 2147483647, -1, 2147483647, 7680, 2147483647}}}, + {256, + {{6656, 2147483647, 13312, 2147483647, -1, 7680}, {-1, 1536, 7680, 2147483647, 7680, 11264}, + {-1, 1536, 6656, 2147483647, 11264, 2147483647}}} }; static std::map>> g_allreduceTwoRankFP16PvalueMap = { + {2, + {{-1, 2560, -1, 3584, -1, 1536}, {4608, 7680, -1, 7680, -1, 1536}, + {7680, 9216, -1, 2147483647, -1, 1536}, {-1, 15360, 4608, 13312, 1536, 2560}, + {-1, 7680, -1, 13312, 2560, 3584}, {6656, 15360, 13312, 2147483647, 2560, 3584}, + {15360, 25600, 4608, 15360, -1, 2560}, {25600, 2147483647, 19456, 2147483647, -1, 2560}, + {15360, 25600, 11264, 2147483647, 2560, 3584}, {-1, 15360, 9216, 17408, 3584, 9216}, + {-1, 6656, 13312, 2147483647, 11264, 2147483647}, {15360, 35840, 13312, 2147483647, 11264, 2147483647}}}, + {1, + {{-1, 2560, 3584, 2147483647, -1, 1536}, {2560, 4608, -1, 2147483647, -1, 1536}, + {4608, 7680, 7680, 2147483647, -1, 1536}, {9216, 15360, -1, 2147483647, -1, 1536}, + {-1, 15360, 13312, 2147483647, 1536, 2560}, {-1, 6656, 13312, 2147483647, 2560, 3584}, + {15360, 25600, 15360, 2147483647, -1, 2560}, {-1, 15360, 17408, 2147483647, 3584, 9216}, + {-1, 6656, 13312, 2147483647, 9216, 11264}, {9216, 15360, 13312, 2147483647, 9216, 2147483647}}}, + {3, + {{-1, 15360, -1, 4608, 1536, 2560}, {7680, 15360, -1, 13312, 2560, 3584}, + {15360, 2147483647, 2560, 4608, -1, 1536}, {25600, 2147483647, 4608, 19456, -1, 2560}, + {15360, 25600, 4608, 11264, 2560, 3584}, {-1, 15360, 1536, 9216, 3584, 9216}, + {-1, 6656, -1, 13312, 9216, 2147483647}, {15360, 25600, 11264, 2147483647, 3584, 7680}}}, + {4, + {{15360, 30720, -1, 1536, -1, 3584}, {15360, 2147483647, 1536, 2560, -1, 2560}, + {15360, 2147483647, 2560, 4608, 1536, 3584}, {25600, 2147483647, 4608, 2147483647, 2560, 3584}, + {-1, 15360, -1, 1536, 3584, 9216}, {6656, 9216, -1, 2147483647, 9216, 2147483647}, + {9216, 15360, -1, 13312, 9216, 2147483647}, {15360, 25600, -1, 11264, 3584, 7680}, + {25600, 35840, -1, 2147483647, 3584, 6656}, {15360, 35840, 5632, 2147483647, 7680, 11264}}}, + {6, + {{30720, 2147483647, -1, 1536, -1, 3584}, {15360, 2147483647, 1536, 2560, 2560, 3584}, + {25600, 35840, -1, 2147483647, 6656, 7680}, {15360, 35840, -1, 5632, 7680, 11264}, + {15360, 35840, -1, 13312, 11264, 2147483647}, {35840, 2147483647, -1, 2147483647, 3584, 2147483647}}} }; int32_t AllReduceUbMoveNum(int m, int k, int n) -- Gitee From b03248c88ae03b6e6cb0cf8c0a6480149a1ae9e1 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:45:10 +0800 Subject: [PATCH 336/414] draft --- 
comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index a4ab17e7..3e3519f3 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -41,16 +41,16 @@ namespace Lcal { static std::vector g_allreduceUbmovenumCoef = { { -1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 8.70965589e+01, -3.63853858e-01, 1.27789264e+01, - 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, - -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, - 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } + 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, + -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, + 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } }; static std::vector g_allreducePvalueCoef = { { -4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, - 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, - -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, - 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263 } + 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, + -2.66903417e-01, -3.68521920e-05, -6.40666333e-09, 6.77406054e-06, -9.92992099e-04, 5.60658043e-02, + 2.69372863e-04, 2.17222337e+01, -1.17749660e-10, 6.100544547671263 } }; static std::map>> g_allreduceFourRankInT8M0Map = { -- Gitee From 7e67fc65d01bdd5f212e0edb276a38a4489d50f6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:45:31 +0800 Subject: [PATCH 337/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 3e3519f3..5330dbcb 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -151,7 +151,7 @@ namespace Lcal { {142040, 2147483647, 7170, 2147483647, 5376, 2147483647}}}, {4, {{-1, 5148, 3072, 2147483647, -1, 1536}, {-1, 5148, 1536, 2147483647, 1536, 3072}, - {-1, 5148, 4608, 2147483647, 3072, 5376}, {-1, 3072, -1, 2147483647, 5376, 10240}}}, + {-1, 5148, 4608, 2147483647, 3072, 5376}, {-1, 3072, -1, 2147483647, 5376, 10240}}}, {32, {{-1, 5148, -1, 1536, 1536, 3072}, {68160, 2147483647, -1, 3072, -1, 768}, {16340, 2147483647, -1, 3072, 768, 5376}, {5148, 13364, -1, 2976, 5376, 2147483647}, -- Gitee From 5d610fd01abb2e832295683f847b1283008c0399 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:46:27 +0800 Subject: [PATCH 338/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 5330dbcb..1f3b32f9 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -41,11 +41,10 @@ namespace Lcal { static std::vector g_allreduceUbmovenumCoef = { { -1.72352427e+01, 2.56887672e-03, -8.21819480e+00, 
8.70965589e+01, -3.63853858e-01, 1.27789264e+01, - 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, + 1.29782183e+02, 1.90250023e-02, -3.48175441e+00, 6.18921914e+03, 3.77072171e+03, -5.86895290e+01, -8.70740991e-01, -1.40262280e-04, -2.81910331e-08, 3.22795486e-05, -4.84522320e-03, 2.94839177e-01, 2.97260958e-03, 9.08844709e+01, -5.80426209e-10, 38.183465184603484 } }; - static std::vector g_allreducePvalueCoef = { { -4.23166350e+00, 6.71137487e-04, -1.33434156e+00, 1.12915884e+01, -7.85892737e-02, 2.59059897e+00, 3.22129881e+01, -5.15776887e-02, 9.15542742e-01, 1.56322201e+03, 3.61977421e+01, -5.49544589e-01, -- Gitee From 3a6316177e7fb670a1ba3b69d4682a8e48561e17 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:46:57 +0800 Subject: [PATCH 339/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 1f3b32f9..844a8e64 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -659,7 +659,6 @@ namespace Lcal { SetTilingParam(cocTilingData, tilingParamMap); cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; - AllReduceSetWithSerialMode(cocTilingData); } } \ No newline at end of file -- Gitee From 7156ea0ff7aba8092767b0b900254ee45d697e98 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:47:45 +0800 Subject: [PATCH 340/414] draft --- comm/lcal/src/tiling/allreduce_tiling_910B.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp index 844a8e64..fee13a28 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_910B.cpp @@ -11,7 +11,6 @@ #include "tiling_910B.h" #include "tiling_func.h" #include "lcal_types.h" - namespace Lcal { const int32_t ALLREDUCE_SERIAL_MODE_K_SIZE = 8192; const int64_t ALLREDUCE_SERIAL_MODE_MN_SIZE = 256 * 256 * 12; -- Gitee From cc85dfe9c347e4477ad91eabfa598ccac9308859 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:53:19 +0800 Subject: [PATCH 341/414] draft --- .../src/tiling/allreduce_tiling_91093.cpp | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp index 9ab49c7b..a3532fc5 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -22,27 +22,187 @@ namespace Lcal { constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; static std::map>> g_allreduce91093EightRankFP16CommdatasplitMap = { + {1, + {{-1, 3072, -1, 2147483647, -1, 768}, {-1, 768, -1, 2147483647, 768, 1536}, + {768, 1536, -1, 7170, 768, 1536}, {1536, 3072, 3072, 6144, 768, 1536}, + {3072, 5148, 3072, 5634, -1, 768}, {-1, 1280, 384, 768, 1536, 3072}, + {-1, 2304, 768, 4000, 1536, 3072}, {-1, 768, 4000, 5634, 1536, 5376}, + {-1, 768, 5634, 2147483647, 1536, 3072}, {768, 1536, 4000, 5634, 1536, 3072}}}, + {16, + {{768, 1536, 7170, 2147483647, 768, 1536}, {1536, 3072, -1, 3072, 768, 1536}, + {1536, 3072, 6144, 2147483647, 768, 1536}, {3072, 5148, -1, 3072, -1, 768}, + {3072, 5148, -1, 5634, 768, 1536}, {3072, 5148, 5634, 2147483647, -1, 1536}, + {5148, 2147483647, -1, 2147483647, -1, 1536}, {-1, 2147483647, -1, 384, 1536, 3072}, + {1280, 
2147483647, 384, 768, 1536, 3072}, {2304, 2147483647, 768, 4000, 1536, 3072}, + {-1, 2147483647, -1, 4000, 3072, 2147483647}, {-1, 768, 4000, 5634, 5376, 2147483647}, + {-1, 768, 5634, 2147483647, 3072, 2147483647}, {1536, 2147483647, 4000, 5634, 1536, 3072}, + {768, 2147483647, 5634, 2147483647, 1536, 3072}, {768, 2147483647, 4000, 2147483647, 3072, 2147483647}}} }; static std::map>> g_allreduce91093EightRankFP16PvalueMap = { + {4, + {{-1, 3072, -1, 3072, -1, 768}, {5148, 31220, 3072, 4608, -1, 768}, + {-1, 3072, 768, 4608, 768, 1536}, {31220, 43740, 3072, 2147483647, -1, 1536}, + {43740, 53340, -1, 2147483647, -1, 1536}, {62400, 68160, -1, 2147483647, -1, 768}, + {68160, 2147483647, 3072, 7170, -1, 768}, {68160, 2147483647, 4608, 2147483647, 768, 1536}, + {3072, 19680, 1280, 7170, 1536, 7424}, {7350, 2147483647, 7170, 11264, 1536, 7424}, + {-1, 11904, 2976, 2147483647, 7424, 19456}, {11904, 2147483647, 7170, 2147483647, 7424, 2147483647}}}, + {1, + {{-1, 5148, 3072, 2147483647, -1, 768}, {-1, 3072, 4608, 2147483647, 768, 1536}, + {-1, 2147483647, 11264, 2147483647, 1536, 7424}}}, + {6, + {{3072, 31220, -1, 3072, -1, 768}, {3072, 31220, 768, 4608, 768, 1536}, + {68160, 2147483647, 3072, 4608, 768, 1536}, {19680, 2147483647, 5634, 7170, 1536, 7424}}}, + {2, + {{5148, 31220, 4608, 2147483647, -1, 768}, {3072, 31220, 4608, 2147483647, 768, 1536}, + {68160, 2147483647, 7170, 2147483647, -1, 768}, {-1, 3072, 1280, 7170, 1536, 7424}, + {-1, 7350, 7170, 11264, 1536, 7424}, {-1, 11904, 2976, 2147483647, 19456, 2147483647}}}, + {10, + {{-1, 31220, -1, 768, 768, 1536}, {31220, 43740, -1, 3072, -1, 1536}, + {53340, 62400, -1, 2147483647, -1, 1536}, {62400, 68160, -1, 2147483647, 768, 1536}, + {112280, 2147483647, -1, 3072, -1, 768}, {68160, 2147483647, 1536, 3072, 768, 1536}, + {-1, 38592, 768, 1280, 1536, 3072}, {-1, 5148, -1, 1280, 7424, 13312}, + {38592, 68160, 768, 1280, 1536, 3072}, {19680, 2147483647, 1280, 5634, 1536, 7424}, + {-1, 14336, 1280, 1792, 7424, 2147483647}, {11904, 2147483647, 2976, 7170, 7424, 2147483647}}}, + {12, + {{68160, 112280, -1, 3072, -1, 768}, {-1, 38592, -1, 768, 1536, 7424}, + {-1, 38592, 768, 1280, 3072, 7424}, {-1, 38592, 768, 1280, 13312, 2147483647}, + {68160, 2147483647, 768, 1280, 1536, 3072}, {14336, 2147483647, 1280, 1792, 7424, 2147483647}, + {-1, 2147483647, 1792, 2976, 7424, 2147483647}}}, + {14, + {{68160, 2147483647, -1, 1536, 768, 1536}, {5148, 38592, -1, 1280, 7424, 13312}, + {-1, 38592, -1, 768, 13312, 2147483647}, {38592, 2147483647, -1, 768, 1536, 2147483647}, + {38592, 2147483647, 768, 1280, 3072, 2147483647}}} }; static std::map>> g_allreduce91093EightRankFP16M0Map = { + {128, + {{-1, 3072, -1, 2147483647, -1, 10240}, {-1, 3072, -1, 3072, 10240, 19456}, + {3072, 2147483647, -1, 2147483647, -1, 19456}, {1536, 2147483647, -1, 2147483647, 19456, 2147483647}}}, + {256, + {{-1, 3072, 3072, 2147483647, 10240, 19456}, {-1, 1536, -1, 2147483647, 19456, 2147483647}}} }; static std::map>> g_allreduce91093EightRankFP16UbmovenumMap = { + {80, + {{-1, 768, -1, 7170, -1, 768}, {31220, 36980, -1, 2147483647, -1, 768}, + {-1, 10010, -1, 3072, 1536, 3072}, {-1, 768, 3072, 2147483647, 1536, 3072}}}, + {100, + {{-1, 768, 7170, 2147483647, -1, 768}}}, + {140, + {{768, 3072, -1, 2147483647, -1, 768}}}, + {60, + {{3072, 23040, -1, 3072, -1, 768}, {-1, 36980, -1, 1536, 768, 1536}, + {10010, 36980, -1, 3072, 1536, 3072}, {36980, 2147483647, -1, 1536, -1, 3072}, + {-1, 2147483647, -1, 1280, 3072, 2147483647}}}, + {20, + {{3072, 23040, 3072, 2147483647, -1, 768}, {-1, 
36980, 3072, 2147483647, 768, 1536}, + {768, 36980, 3072, 2147483647, 1536, 3072}, {36980, 142040, 3072, 4608, -1, 768}, + {36980, 2147483647, 3072, 4608, 768, 3072}, {768, 2147483647, 2976, 4608, 3072, 2147483647}, + {768, 5148, 4608, 5634, 3072, 2147483647}, {5148, 10010, 4608, 5634, 3072, 9472}, + {-1, 768, 5634, 2147483647, 3072, 2147483647}}}, + {10, + {{23040, 31220, -1, 2147483647, -1, 768}, {142040, 2147483647, 3072, 4608, -1, 768}, + {36980, 2147483647, 4608, 2147483647, -1, 3072}, {5148, 10010, 4608, 5634, 9472, 2147483647}, + {10010, 2147483647, 4608, 5634, 3072, 2147483647}, {768, 2147483647, 5634, 2147483647, 3072, 2147483647}}}, + {30, + {{-1, 36980, 1536, 3072, 768, 1536}, {36980, 2147483647, 1536, 3072, -1, 3072}, + {-1, 2147483647, 1280, 2976, 3072, 2147483647}, {-1, 768, 2976, 4608, 3072, 2147483647}}}, + {160, + {{-1, 768, 4608, 5634, 3072, 2147483647}}} }; static std::map>> g_allreduce91093SixteenRankFP16CommdatasplitMap = { + {1, + {{-1, 36980, -1, 2147483647, -1, 768}, {36980, 74380, -1, 7170, -1, 768}, + {74380, 82060, -1, 3072, -1, 768}, {-1, 82060, -1, 1536, 768, 1536}, + {-1, 23040, 1536, 2147483647, 768, 1536}, {23040, 82060, 5634, 2147483647, 768, 1536}, + {82060, 2147483647, -1, 1536, -1, 1536}, {82060, 112280, 1536, 3072, -1, 1536}, + {129600, 2147483647, 1536, 3072, -1, 1536}, {176600, 222720, 3072, 2147483647, 768, 1536}, + {-1, 2147483647, -1, 2976, 1536, 10240}, {-1, 107968, -1, 2976, 10240, 13312}, + {107968, 2147483647, -1, 1792, 10240, 13312}, {-1, 2147483647, -1, 1536, 13312, 2147483647}, + {-1, 75840, 1536, 2976, 13312, 2147483647}, {-1, 11904, 2976, 2147483647, 1536, 3072}, + {-1, 3072, 2976, 2147483647, 3072, 2147483647}, {3072, 11904, 2976, 2147483647, 3072, 13312}, + {11904, 2147483647, 5634, 2147483647, 5376, 7424}}}, + {16, + {{36980, 74380, 7170, 2147483647, -1, 768}, {74380, 82060, 3072, 2147483647, -1, 768}, + {23040, 82060, 1536, 5634, 768, 1536}, {112280, 129600, 1536, 3072, -1, 1536}, + {82060, 2147483647, 3072, 2147483647, -1, 768}, {82060, 176600, 3072, 2147483647, 768, 1536}, + {222720, 2147483647, 3072, 2147483647, 768, 1536}, {107968, 2147483647, 1792, 2976, 10240, 13312}, + {75840, 2147483647, 1536, 2976, 13312, 2147483647}, {3072, 11904, 2976, 2147483647, 13312, 2147483647}, + {11904, 2147483647, 2976, 5634, 1536, 2147483647}, {11904, 2147483647, 5634, 2147483647, 1536, 5376}, + {11904, 2147483647, 5634, 2147483647, 7424, 2147483647}}} }; static std::map>> g_allreduce91093SixteenRankFP16M0Map = { + {128, + {{-1, 2147483647, -1, 2147483647, -1, 3072}, {-1, 2147483647, -1, 2976, 3072, 2147483647}}}, + {256, + {{-1, 2147483647, 2976, 2147483647, 3072, 2147483647}}} }; static std::map>> g_allreduce91093SixteenRankFP16UbmovenumMap = { + {60, + {{-1, 768, -1, 5634, -1, 768}, {3072, 2147483647, -1, 1536, -1, 1536}, + {3072, 36980, 1536, 3072, -1, 1536}, {-1, 15412, -1, 2976, 5376, 2147483647}, + {15412, 2147483647, -1, 2976, 1536, 13312}, {15412, 2147483647, -1, 1536, 13312, 2147483647}}}, + {20, + {{-1, 768, 5634, 2147483647, -1, 768}, {10320, 2147483647, 3072, 4608, -1, 1536}, + {3072, 2147483647, 4608, 2147483647, -1, 1536}, {-1, 15412, 3072, 2147483647, 1536, 5376}, + {-1, 15412, 2976, 2147483647, 5376, 2147483647}, {15412, 2147483647, 2976, 2147483647, 1536, 13312}, + {15412, 2147483647, 3072, 2147483647, 13312, 2147483647}}}, + {160, + {{768, 3072, -1, 384, -1, 768}}}, + {80, + {{768, 3072, 384, 2147483647, -1, 768}}}, + {120, + {{-1, 1536, -1, 4608, 768, 1536}, {1536, 3072, 640, 2147483647, 768, 1536}}}, + {40, + {{-1, 1536, 4608, 
2147483647, 768, 1536}}}, + {140, + {{1536, 3072, -1, 640, 768, 1536}}}, + {30, + {{36980, 2147483647, 1536, 3072, -1, 1536}, {3072, 10320, 3072, 4608, -1, 1536}, + {-1, 15412, 1536, 3072, 1536, 5376}, {15412, 2147483647, 1536, 3072, 13312, 2147483647}}}, + {100, + {{-1, 15412, -1, 1536, 1536, 5376}}} }; static std::map>> g_allreduce91093SixteenRankFP16PvalueMap = { + {4, + {{-1, 3072, -1, 4608, -1, 768}, {5148, 31220, -1, 4608, -1, 768}, + {10010, 36980, 4608, 2147483647, 768, 1536}, {36980, 53340, 1536, 2147483647, -1, 1536}, + {53340, 68160, -1, 2147483647, -1, 768}, {68160, 74380, 3586, 2147483647, -1, 768}, + {1536, 3072, 2976, 2147483647, 3072, 2147483647}, {3072, 16340, 4608, 7170, 1536, 2147483647}, + {3072, 2147483647, 7170, 2147483647, 1536, 2147483647}}}, + {1, + {{-1, 5148, 4608, 2147483647, -1, 768}, {-1, 1536, 2976, 7170, 1536, 5376}, + {-1, 1536, 7170, 2147483647, 1536, 2147483647}}}, + {8, + {{3072, 5148, -1, 4608, -1, 768}, {-1, 19680, 768, 3072, 768, 1536}, + {-1, 1536, -1, 384, 10240, 2147483647}, {-1, 2560, 384, 1280, 7424, 2147483647}}}, + {12, + {{31220, 36980, -1, 4608, -1, 768}, {-1, 36980, -1, 768, 768, 1536}, + {68160, 74380, -1, 3586, -1, 768}, {68160, 74380, 1536, 2147483647, 768, 1536}, + {-1, 19680, -1, 384, 1536, 7424}, {-1, 6298, -1, 384, 7424, 10240}}}, + {2, + {{5148, 36980, 4608, 2147483647, -1, 768}, {-1, 10010, 3072, 2147483647, 768, 1536}, + {-1, 1536, 2976, 7170, 5376, 2147483647}, {1536, 3072, 2976, 2147483647, 1536, 3072}}}, + {10, + {{19680, 36980, 768, 3072, 768, 1536}, {142040, 2147483647, 1536, 2147483647, -1, 768}, + {74380, 189080, 3072, 2147483647, 768, 1536}, {19680, 2147483647, 2976, 4608, 1536, 5376}, + {3072, 2147483647, 2976, 4608, 5376, 2147483647}}}, + {6, + {{10010, 36980, 3072, 4608, 768, 1536}, {74380, 142040, 1536, 2147483647, -1, 768}, + {189080, 2147483647, 5634, 2147483647, 768, 1536}, {-1, 19680, 1536, 2976, 1536, 7424}, + {3072, 19680, 2976, 4608, 1536, 5376}, {16340, 2147483647, 4608, 7170, 1536, 2147483647}}}, + {14, + {{36980, 53340, -1, 1536, -1, 1536}, {53340, 68160, -1, 2147483647, 768, 1536}, + {68160, 74380, -1, 1536, 768, 1536}, {74380, 2147483647, -1, 1536, -1, 768}, + {74380, 189080, -1, 3072, 768, 1536}, {189080, 2147483647, -1, 5634, 768, 1536}, + {-1, 19680, 384, 1536, 1536, 7424}, {19680, 2147483647, -1, 2976, 1536, 7424}, + {6298, 2147483647, -1, 384, 7424, 10240}, {1536, 2147483647, -1, 384, 10240, 2147483647}, + {2560, 2147483647, 384, 1280, 7424, 2147483647}, {-1, 2147483647, 1280, 2976, 7424, 2147483647}}} }; void AllReduceNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) -- Gitee From 705f00212c71ee10c1087790ef5f0496a3a17ff0 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 28 Aug 2025 14:53:55 +0800 Subject: [PATCH 342/414] draft --- comm/lcal/src/tiling/allreduce_tiling_91093.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp index a3532fc5..4c40fc6b 100644 --- a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp +++ b/comm/lcal/src/tiling/allreduce_tiling_91093.cpp @@ -20,7 +20,7 @@ namespace Lcal { constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT = 160; constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_M0_DEFAULT = 128; constexpr int32_t ALLREDUCE_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; - + static std::map>> g_allreduce91093EightRankFP16CommdatasplitMap = { {1, {{-1, 3072, -1, 2147483647, -1, 768}, {-1, 768, -1, 2147483647, 768, 1536}, @@ 
-167,7 +167,7 @@ namespace Lcal { {100, {{-1, 15412, -1, 1536, 1536, 5376}}} }; - + static std::map>> g_allreduce91093SixteenRankFP16PvalueMap = { {4, {{-1, 3072, -1, 4608, -1, 768}, {5148, 31220, -1, 4608, -1, 768}, -- Gitee From ccb66b1f705018ceb08c9b803c64a507d0a68938 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Wed, 27 Aug 2025 15:37:19 +0800 Subject: [PATCH 343/414] 7 --- comm/lcal/src/kernels/coc_dequant_runner.cpp | 215 +++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 comm/lcal/src/kernels/coc_dequant_runner.cpp diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cpp b/comm/lcal/src/kernels/coc_dequant_runner.cpp new file mode 100644 index 00000000..e645684a --- /dev/null +++ b/comm/lcal/src/kernels/coc_dequant_runner.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef __COC_DEQUANTER__ +#define __COC_DEQUANTER__ + +#ifdef __DAV_C220_VEC__ + +#include +#include "coc_internal.cce" + +template +class LoopDequanter { +}; + +template <> +class LoopDequanter { +public: + static constexpr int32_t max_len = 9792; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } + + inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, int32_t offset, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_out = is_ping ? ub_out0 : ub_out1; + auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; + + int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); + int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); + SetFlag(event_id); + WaitFlag(event_id); + Vadds(ub_adds, ub_in, offset, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + PipeBarrier(); + Vconv(ub_adds_f32, ub_adds, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + WaitFlag(event_id); + CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); + SetFlag(event_id); + } + +private: + static constexpr uint8_t repeat = 153; + __ubuf__ bfloat16_t *ub_out0 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + __ubuf__ bfloat16_t *ub_out1 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)19584); + __ubuf__ float32_t *ub_adds_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)39936); + __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)79104); + __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)118272); + __ubuf__ int32_t *ub_adds = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); + __ubuf__ float32_t *ub_muls = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); + + bool is_ping = false; +}; + +template <> +class LoopDequanter { +public: + static constexpr int32_t max_len = 8192; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID0); + } + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); + } + inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_out = is_ping ? 
ub_out0 : ub_out1; + int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); + int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); + + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); + SetFlag(event_id); + WaitFlag(event_id); + Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + WaitFlag(EVENT_ID2); + if (scale_rows == 0 || scale_source != scale) { + scale_rows = 1; + scale_source = scale; + CopyGmToUbufAlign(ub_scale, scale, 1, n_cols_this_loop, 0); + } + SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + for (; scale_rows < n_rows_this_loop; ++scale_rows) { + CopyUB2UB(ub_scale + scale_rows * n_blocks * Block32B::size, ub_scale, + 0, 1, n_blocks, 0, 0); + } + PipeBarrier(); + Vmul(ub_mul, ub_in_f32, ub_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); + Vconv(ub_out, ub_mul, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); + SetFlag(EVENT_ID0); + } + +private: + static constexpr uint8_t repeat = 128; + __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); + __ubuf__ float32_t *ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)32768); + __ubuf__ float32_t *ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); + __ubuf__ float32_t *ub_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98650); + __ubuf__ bfloat16_t *ub_out = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)131328); + __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)163840); + + __gm__ float32_t *scale_source = nullptr; + int32_t scale_rows = 0; + bool is_ping = false; +}; + +template +class LoopPerTokenDequanter { +public: + static constexpr int32_t max_len = 8 * 32 / 4 * 128; + + inline __aicore__ LoopPerTokenDequanter(int32_t n0) + { + n_round = (n0 + 127) / 128 * 128; + ub_in0 = reinterpret_cast<__ubuf__ T *>((uintptr_t)0); + ub_in1 = reinterpret_cast<__ubuf__ T *>(ub_in0 + max_len); + ub_out = reinterpret_cast<__ubuf__ T *>(ub_in1 + max_len); + ub_scales = reinterpret_cast<__ubuf__ float32_t *>(ub_out + max_len); + ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_scales + max_len); + ub_out_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_in_f32 + max_len); + } + + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + } + + inline __aicore__ void WaitForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + } + + inline __aicore__ void Loop(__gm__ T *buff, __gm__ float32_t *scale, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; + int32_t ubufGap = Block32B::Count(n_round) - Block32B::Count(n_cols_this_loop); + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, buff, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); + SetFlag(event_id); + WaitFlag(event_id); + WaitFlag(event_id); + Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 4); + SetFlag(event_id); + + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + if (scale_source != scale) { + scale_source = scale; + CopyGmToUbufAlign(ub_scales, scale, 1, n_cols_this_loop, 0); + } + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + WaitFlag + } + +} -- Gitee From a8bab35f64bea49dd33660112e6670e18b9907de Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:30:45 +0800 Subject: [PATCH 344/414] 5 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 999 +++++++++++++++++++ comm/lcal/src/kernels/coc_dequant_runner.cpp | 215 ---- 2 files changed, 999 insertions(+), 215 deletions(-) delete mode 100644 comm/lcal/src/kernels/coc_dequant_runner.cpp diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index e69de29b..17c29357 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -0,0 +1,999 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef __COC_DEQUANTER__ +#define __COC_DEQUANTER__ + +#ifdef __DAV_C220_VEC__ + +#include +#include "coc_internal.cce" + +template +class LoopDequanter { +}; + +template <> +class LoopDequanter { +public: + static constexpr int32_t max_len = 9792; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } + + inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, int32_t offset, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_out = is_ping ? ub_out0 : ub_out1; + auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; + + int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); + int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); + SetFlag(event_id); + WaitFlag(event_id); + Vadds(ub_adds, ub_in, offset, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + PipeBarrier(); + Vconv(ub_adds_f32, ub_adds, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + WaitFlag(event_id); + CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); + SetFlag(event_id); + } + +private: + static constexpr uint8_t repeat = 153; + __ubuf__ bfloat16_t *ub_out0 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); + __ubuf__ bfloat16_t *ub_out1 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)19584); + __ubuf__ float32_t *ub_adds_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)39936); + __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)79104); + __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)118272); + __ubuf__ int32_t *ub_adds = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); + __ubuf__ float32_t *ub_muls = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); + + bool is_ping = false; +}; + +template <> +class LoopDequanter { +public: + static constexpr int32_t max_len = 8192; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID0); + } + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); + } + inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_out = is_ping ? 
ub_out0 : ub_out1; + auto event_id = is_ping ? EVENT_ID0 : EVENT_ID1; + int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); + int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); + + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); + SetFlag(event_id); + WaitFlag(event_id); + Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 8); + SetFlag(event_id); + + WaitFlag(EVENT_ID2); + if (scale_rows == 0 || scale_source != scale) { + scale_rows = 1; + scale_source = scale; + CopyGmToUbufAlign(ub_scale, scale, 1, n_cols_this_loop, 0); + } + SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + for (; scale_rows < n_rows_this_loop; ++scale_rows) { + CopyUB2UB(ub_scale + scale_rows * n_blocks * Block32B::size, ub_scale, + 0, 1, n_blocks, 0, 0); + } + PipeBarrier(); + Vmul(ub_mul, ub_in_f32, ub_scale, repeat, 1, 1, 1, 8, 8, 8); + SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); + Vconv(ub_out, ub_mul, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); + SetFlag(EVENT_ID0); + } + +private: + static constexpr uint8_t repeat = 128; + __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); + __ubuf__ float32_t *ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)32768); + __ubuf__ float32_t *ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); + __ubuf__ float32_t *ub_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98650); + __ubuf__ bfloat16_t *ub_out = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)131328); + __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)163840); + + __gm__ float32_t *scale_source = nullptr; + int32_t scale_rows = 0; + bool is_ping = false; +}; + +template +class LoopPerTokenDequanter { +public: + static constexpr int32_t max_len = 8 * 32 / 4 * 128; + + inline __aicore__ LoopPerTokenDequanter(int32_t n0) + { + n_round = (n0 + 127) / 128 * 128; + ub_in0 = reinterpret_cast<__ubuf__ T *>((uintptr_t)0); + ub_in1 = reinterpret_cast<__ubuf__ T *>(ub_in0 + max_len); + ub_out = reinterpret_cast<__ubuf__ T *>(ub_in1 + max_len); + ub_scales = reinterpret_cast<__ubuf__ float32_t *>(ub_out + max_len); + ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_scales + max_len); + ub_out_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_in_f32 + max_len); + } + + inline __aicore__ void SetForLoop() + { + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + } + + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + } + + inline __aicore__ void Loop(__gm__ T *buff, __gm__ float32_t *scale, + int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t stride) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; + int32_t ubufGap = Block32B::Count(n_round) - Block32B::Count(n_cols_this_loop); + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, buff, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); + SetFlag(event_id); + WaitFlag(event_id); + WaitFlag(event_id); + Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 4); + SetFlag(event_id); + + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + if (scale_source != scale) { + scale_source = scale; + CopyGmToUbufAlign(ub_scales, scale, 1, n_cols_this_loop, 0); + } + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + PipeBarrier(); + for (int32_t row = 0; row < n_rows_this_loop; ++row) { + float32_t scale = ub_scales[row]; + Vmuls(ub_out_f32 + n_round * row, ub_in_f32 + n_round * row, scale, (n_cols_this_loop + 127) / 128 * 2, 1, 1, 8, 8); + } + PipeBarrier(); + Vconv(ub_out, ub_out_f32, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); + CopyUbufToGmAlign(buff, ub_out, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); + SetFlag(EVENT_ID2); + } +private: + static constexpr uint8_t repeat = 128; + __ubuf__ T *ub_in0 = nullptr; + __ubuf__ T *ub_in1 = nullptr; + __ubuf__ T *ub_out = nullptr; + __ubuf__ float32_t *ub_scales = nullptr; + __gm__ float32_t *scale_source = nullptr; + __ubuf__ float32_t *ub_in_f32 = nullptr; + __ubuf__ float32_t *ub_out_f32 = nullptr; + int32_t n_round = 0; + bool is_ping = false; +}; + +class LoopScaleFormater { +public: + static constexpr int32_t max_len = 8160; + inline __aicore__ LoopScaleFormater() = default; + inline __aicore__ void SetForLoop() + { + set_ctrl(sbitset1(get_ctrl(), 59)); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + } + + inline __aicore__ void WaitForLoop() + { + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + set_ctrl(sbitset1(get_ctrl(), 59)); + } + + inline __aicore__ void Loop(__gm__ float32_t *dst, __gm__ int64_t *src, int32_t len) + { + is_ping = !is_ping; + auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_vconv = is_ping ? ub_vconv0 : ub_vconv1; + auto ub_out = is_ping ? ub_out0 : ub_out1; + auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; + + WaitFlag(event_id); + CopyGmToUbufAlign(ub_in, src, 1, len, 0); + SetFlag(event_id); + WaitFlag(event_id); + WaitFlag(event_id); + Vconv(ub_vconv, ub_in, repeat, 1, 1, 4, 8); + SetFlag(event_id); + SetFlag(event_id); + WaitFlag(event_id); + CopyUbufToGmAlign(dst, ub_out, 1, len, 0); + SetFlag(event_id); + } + +private: + static constexpr uint8_t repeat = 255; + __ubuf__ int64_t *ub_in0 = reinterpret_cast<__ubuf__ int64_t *>((uintptr_t)0); + __ubuf__ int64_t *ub_in1 = reinterpret_cast<__ubuf__ int64_t *>((uintptr_t)131072); + __ubuf__ int32_t *ub_vconv0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)65536); + __ubuf__ int32_t *ub_vconv1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)98304); + __ubuf__ float32_t *ub_out0 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); + __ubuf__ float32_t *ub_out1 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98304); + bool is_ping = false; +}; + +class BaseDequantRunner { +public: + class TileLoopIter { + public: + inline __aicore__ TileLoopIter(int32_t m_this_tile, int32_t n_this_tile) + { + m_this_subcore = m_this_tile >> 1; + n_this_subcore = n_this_tile; + if (get_subblockid() == 1) { + m_offset_this_subcore = m_this_subcore; + m_this_subcore += m_this_tile & 1; + } else { + m_offset_this_subcore = 0; + } + } + + inline __aicore__ void Init(int32_t max_len) + { + int32_t max_m_per_loop = max_len / Block32B::AlignUp(n_this_subcore); + m_complete = 0; + m_this_loop = max_m_per_loop > m_this_subcore ? m_this_subcore : max_m_per_loop; + n_this_loop = n_this_subcore; + } + + inline __aicore__ bool End() + { + return m_complete >= m_this_subcore; + } + + inline __aicore__ void Next() + { + m_complete += m_this_loop; + if (End()) { + return; + } + if (m_complete + m_this_loop > m_this_subcore) { + m_this_loop = m_this_subcore - m_complete; + } + } + + inline __aicore__ int32_t m_offset_in_tile() const + { + return m_offset_this_subcore + m_complete; + } + int32_t m_this_subcore; + int32_t n_this_subcore; + int32_t m_this_loop; + int32_t n_this_loop; + int32_t m_offset_this_subcore; + int32_t m_complete; + }; + __aicore__ explicit BaseDequantRunner() = default; + + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, + int32_t batch_size, int32_t m, int32_t n) + { + this->gm_accum = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_accum); + this->gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_format_dequant_scale); + this->gm_out = gm_out; + this->gm_dequant_scale = gm_dequant_scale; + this->gm_dequant_offset = gm_dequant_offset; + this->dequant_granularity = dequant_granularity; + this->batch_size = batch_size; + this->m = m; + this->n = n; + if (dequant_granularity == QuantGranularity::PER_TENSOR) { + gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_format_dequant_scale); + } else if (dequant_granularity == QuantGranularity::PER_CHANNEL) { + FormatScale(); + } else { + gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale); + } + } + + inline __aicore__ void FormatScale() + { + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); + int32_t align_core_num = get_block_num() * get_subblockdim(); + int32_t len = LoopScaleFormater::max_len; + int32_t loop_num = DivCeil(n, len); + LoopScaleFormater loop_scale_formater; + 
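// Each core converts its slice of the int64 per-channel scales to float32 in + // chunks of LoopScaleFormater::max_len, writing the formatted values into + // gm_format_dequant_scale before any dequantization starts. + 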
loop_scale_formater.SetForLoop(); + for (int32_t i = align_core_idx; i < loop_num; i += align_core_num) { + int32_t offset = i * len; + if (offset + len > n) { + len = n - offset; + } + loop_scale_formater.Loop(gm_format_dequant_scale + offset, gm_dequant_scale + offset, len); + } + loop_scale_formater.WaitForLoop(); + Barrier(); +} + +protected: + inline __aicore__ void Barrier() + { + FFTSCrossCoreSync(0, AIV_FINISH_DEQUANT_FLAG_ID); + WaitEvent(AIV_FINISH_DEQUANT_FLAG_ID); + } + + __gm__ int32_t *gm_accum; + __gm__ bfloat16_t *gm_out; + __gm__ int64_t *gm_dequant_scale; + __gm__ int32_t *gm_dequant_offset; + QuantGranularity dequant_granularity; + + __gm__ float32_t *gm_format_dequant_scale; + int32_t batch_size; + int32_t m; + int32_t k; + int32_t n; +}; + +class SerialDequantRunner : public BaseDequantRunner { +public: + class LoopIter { + public: + inline __aicore__ LoopIter(int32_t batch_size, int32_t n_rows, int32_t n_cols) : + batch_size(batch_size), n_rows(n_rows), n_cols(n_cols) + { + int32_t align_core_num = get_block_num() * get_subblockdim(); + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); + int32_t n_rows_per_core_base = n_rows / align_core_num; + int32_t n_rows_remainder = n_rows % align_core_num; + int32_t row_offset_base = align_core_idx * n_rows_per_core_base; + if (align_core_idx < n_rows_remainder) { + n_rows_this_core = n_rows_per_core_base + 1; + row_offset_this_core = row_offset_base + align_core_idx; + } else { + n_rows_this_core = n_rows_per_core_base; + row_offset_this_core = row_offset_base + n_rows_remainder; + } + n_cols_this_core = n_cols; + col_offset_this_core = 0; + core_offset = row_offset_this_core * n_cols; + } + + inline __aicore__ void InitBatchLoop() + { + batch_idx = 0; + batch_offset = 0; + } + + inline __aicore__ bool EndBatchLoop() const + { + return batch_idx == batch_size; + } + + inline __aicore__ void NextBatchLoop() + { + ++batch_idx; + if (EndBatchLoop()) { + return; + } + batch_offset += static_cast<int64_t>(batch_idx) * n_rows * n_cols; + } + + inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop) + { + n_rows_complete = 0; + n_rows_this_loop = (n_rows_this_core < max_rows_per_loop) ? n_rows_this_core : max_rows_per_loop; + row_offset = 0; + } + + inline __aicore__ bool EndRowLoop() const + { + return n_rows_complete == n_rows_this_core; + } + + inline __aicore__ void NextRowLoop() + { + n_rows_complete += n_rows_this_loop; + if (EndRowLoop()) { + return; + } + if (n_rows_complete + n_rows_this_loop > n_rows_this_core) { + n_rows_this_loop = n_rows_this_core - n_rows_complete; + } + row_offset = n_rows_complete; + } + + inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) + { + n_cols_complete = 0; + n_cols_this_loop = (n_cols_this_core < max_cols_per_loop) ? 
n_cols_this_core : max_cols_per_loop; + col_offset = 0; + } + + inline __aicore__ bool EndColLoop() const + { + return n_cols_complete == n_cols_this_core; + } + + inline __aicore__ void NextColLoop() + { + n_cols_complete += n_cols_this_loop; + if (EndColLoop()) { + return; + } + if (n_cols_complete + n_cols_this_loop > n_cols_this_core) { + n_cols_this_loop = n_cols_this_core - n_cols_complete; + } + col_offset = n_cols_complete; + } + + inline __aicore__ int64_t offset() const + { + return core_offset + row_offset * n_cols + col_offset; + } + + int32_t batch_size; + int32_t n_rows; + int32_t n_cols; + int32_t n_rows_this_core; + int32_t n_cols_this_core; + int64_t row_offset_this_core; + int64_t col_offset_this_core; + int32_t batch_idx; + int32_t n_rows_complete; + int32_t n_cols_complete; + int32_t n_rows_this_loop; + int32_t n_cols_this_loop; + int32_t core_offset; + int32_t batch_offset; + int32_t row_offset; + int32_t col_offset; + }; + + __aicore__ explicit SerialDequantRunner() = default; + + inline __aicore__ void Run() + { + switch (dequant_granularity) { + case QuantGranularity::PER_TENSOR: + DequantPerTensor(); + break; + case QuantGranularity::PER_CHANNEL: + DequantPerChannel(); + break; + case QuantGranularity::PER_TOKEN: + DequantPerChannel(); + break; + case QuantGranularity::FLOAT32_SCALE_PER_CHANNEL: + DequantPerChannel(); + break; + default: + break; + } + Barrier; + } + +private: + inline __aicore__ void DequantPerTensor() + { + float32_t scale = gm_format_dequant_scale[0]; + const auto max_len = LoopDequanter::max_len; + int32_t n_round = Block32B::AlignUp(n); + int32_t max_m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; + int32_t max_n_per_loop = (n <= max_len) ? n : max_len; + + LoopIter it(batch_size, m, n); + LoopDequanter loop_dequanter; + loop_dequanter.SetForLoop(); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_n_per_loop); !it.EndColLoop(); it.NextColLoop()) { + for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto dst = gm_out + it.offset(); + auto src = gm_accum + it.offset(); + loop_dequanter.Loop(dst, src, scale, 0, it.n_rows_this_loop, it.n_cols_this_loop, n, n); + } + } + } + loop_dequanter.WaitForLoop(); + } + + inline __aicore__ void DequantPerChannel() + { + const auto max_len = LoopDequanter::max_len; + int32_t n_round = Block32B::AlignUp(n); + int32_t max_m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; + int32_t max_n_per_loop = (n <= max_len) ? 
n : max_len; + + LoopIter it(batch_size, m, n); + LoopDequanter loop_dequanter; + loop_dequanter.SetForLoop(); + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitColLoop(max_n_per_loop); !it.EndColLoop(); it.NextColLoop()) { + for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + auto dst = gm_out + it.offset(); + auto src = gm_accum + it.offset(); + loop_dequanter.Loop(dst, src, 0, 0, it.n_rows_this_loop, it.n_cols_this_loop, n, n); + } + } + } + loop_dequanter.WaitForLoop(); + } + +private: + __gm__ T *gm_out; + __gm__ float32_t *gm_dequant_scale_pertoken; + int32_t m; + int32_t n; + int32_t m0; + int32_t n0; +}; + +class FusedDequantRunner : public BaseDequantRunner { +public: + __aicore__ explicit FusedDequantRunner() = default; + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, int32_t p_value, int32_t rank_size) + { + BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, + batch_size, m, n); + core_num = get_block_num(); + core_idx = get_block_idx(); + this-> m0 = m0; + this-> n0 = n0; + this-> m_loop = m_loop; + this-> n_loop = n_loop; + this-> core_loop = core_loop; + this-> swizzl_direct = swizzl_direct; + this-> swizzl_count = swizzl_count; + this-> p_value = p_value; + this-> rank_size = rank_size; + } + + inline __aicore__ void RunDequantAllreduce(int32_t cal_idx) + { + switch (dequant_granularity) { + case QuantGranularity::PER_TENSOR: + DequantAllReducePerTensor(cal_idx); + return; + case QuantGranularity::PER_CHANNEL: + DequantAllReducePerChannel(cal_idx); + return; + case QuantGranularity::PER_TOKEN: + DequantAllReducePerChannel(cal_idx); + return; + case QuantGranularity::FLOAT32_SCALE_PER_CHANNEL: + DequantAllReducePerChannel(cal_idx); + return; + default: + return; + } + } + + inline __aicore__ void DequantAllReducePerChannel(int32_t cal_idx) + { + LoopDequanter loop_dequanter; + loop_dequanter.SetForLoop(); + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t loop_idx = cal_idx * core_num + core_idx; + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? 
(n - n_idx * n0) : n0; + TileLoopIter tit(m_actual, n_actual); + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; + for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { + int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + auto accum = gm_accum + src_offset; + auto out = gm_out + dst_offset; + auto scale = gm_format_dequant_scale + n_idx * n0; + loop_dequanter.Loop(out, accum, scale, tit.m_this_loop, tit.n_this_loop, n0, n0); + } + } + loop_dequanter.WaitForLoop(); + } + + inline __aicore__ void DequantAllReducePerTensor(int32_t cal_idx) + { + LoopDequanter loop_dequanter; + float32_t scale = gm_format_dequant_scale[0]; + loop_dequanter.SetForLoop(); + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t loop_idx = cal_idx * core_num + core_idx; + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + TileLoopIter tit(m_actual, n_actual); + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; + for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { + int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + auto accum = gm_accum + src_offset; + auto out = gm_out + dst_offset; + loop_dequanter.Loop(out, accum, scale, tit.m_this_loop, tit.n_this_loop, n0, n0); + } + } + loop_dequanter.WaitForLoop(); + } + + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) + { + BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, + batch_size, m, n); + + core_num = get_block_num(); + core_idx = get_block_idx(); + loop_per_EP = p_value * core_num / (EP * TP); + out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_expert); + out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_EP); + sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_local_tokens_per_expert); + sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_global_tokens_per_local_expert); + this-> n_loop = n_loop; + this-> m_loop = m_loop; + this-> m0 = m0; + this-> n0 = n0; + this-> swizzl_direct = swizzlel_direct; + this-> swizzl_count = swizzle_count; + this-> p_value = p_value; + this-> rank_size = EP * TP; + this-> rank = rank; + this-> EP = EP; + this-> TP = TP; + this-> local_expert_nums = 
local_expert_nums; + this-> is_moe_averaged = is_moe_averaged; + this-> is_alltoallvc = is_alltoallvc; + this-> num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (num_local_tokens_per_expert); + this-> num_global_tokens_per_local_expert = + reinterpret_cast<__gm__ int32_t *> (num_global_tokens_per_local_expert); + } + +private: + int32_t core_num; + int32_t core_idx; + int32_t m0; + int32_t n0; + int32_t m_loop; + int32_t n_loop; + int32_t core_loop; + int32_t loop_num_per_comm; + int32_t swizzl_count; + int32_t swizzl_direct; + int32_t p_value; + int32_t rank_size; + int32_t loop_per_EP; + int32_t rank; + int32_t EP; + int32_t TP; + int32_t local_expert_nums; + int32_t is_moe_averaged; + int32_t is_alltoallvc; + + __gm__ int32_t *out_loop_per_expert; + __gm__ int32_t *out_loop_per_ep; + __gm__ int32_t *sum_num_local_tokens_per_expert; + __gm__ int32_t *sum_num_global_tokens_per_local_expert; + __gm__ int32_t *in_expert_comm_count_accum; + __gm__ int32_t *num_local_tokens_per_expert; + __gm__ int32_t *num_global_tokens_per_local_expert; + + int32_t sum_loop; +}; + +template +class FusedPerTokenDequantRunner : public BaseDequantRunner { +public: + __aicore__ explicit FusedPerTokenDequantRunner() = default; + inline __aicore__ void SetArgs(__gm__ T *gm_buff, + __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0, + int32_t m_loop, int32_t n_loop, int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, + int32_t p_value, int32_t rank_size) + { + this->gm_buff = gm_buff; + this->gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; + core_num = get_block_num(); + core_idx = get_block_idx(); + this -> m = m; + this -> n = n; + this -> m0 = m0; + this -> n0 = n0; + this -> m_loop = m_loop; + this -> n_loop = n_loop; + this -> core_loop = core_loop; + this -> swizzl_direct = swizzl_direct; + this -> swizzl_count = swizzl_count; + this -> loop_num_per_comm = p_value * core_num; + this -> p_value = p_value; + this -> rank_size = rank_size; + } + + inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo &workspace_info, + __gm__ float32_t *gm_dequant_scale_pertoken, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) + { + this -> gm_buff = gm_buff; + this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; + this -> m = m; + this -> n = n; + core_num = get_block_num(); + core_idx = get_block_idx(); + loop_per_EP = p_value * core_num / (EP * TP); + out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.out_loop_per_expert); + out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.out_loop_per_ep); + sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.sum_num_local_tokens_per_expert); + sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.sum_num_global_tokens_per_local_expert); + in_expert_comm_count_accum = reinterpret_cast<__gm__ int32_t *> (workspace_info.in_expert_comm_count_accum); + this -> n_loop = n_loop; + this -> m_loop = m_loop; + this -> m0 = m0; + this -> n0 = n0; + this -> swizzl_direct = swizzlel_direct; + this -> swizzl_count = swizzle_count; + this -> 
p_value = p_value; + this -> rank_size = EP * TP; + + this -> rank = rank; + this -> EP = EP; + this -> TP = TP; + this -> local_expert_nums = local_expert_nums; + this -> is_moe_averaged = is_moe_averaged; + this -> is_alltoallvc = is_alltoallvc; + this-> num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (num_local_tokens_per_expert); + this-> num_global_tokens_per_local_expert = + reinterpret_cast<__gm__ int32_t *> (num_global_tokens_per_local_expert); + } + +inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & workspace_info, + __gm__ float32_t *gm_dequant_scale_pertoken, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, int32_t max_output_size, int32_t buffer_size, + __gm__ int32_t* global_tokens_per_expert_matrix) + { + this -> gm_buff = gm_buff; + this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; + this -> m = m; + this -> k = k; + this -> n = n; + core_num = get_block_num(); + core_idx = get_block_idx(); + this -> n_loop = n_loop; + this -> m_loop = m_loop; + this -> m0 = m0; + this -> k0 = k0; + this -> n0 = n0; + this -> swizzl_direct = swizzlel_direct; + this -> swizzl_count = swizzle_count; + this -> p_value = p_value; + this -> rank_size = EP * TP; + this -> rank = rank; + this -> buffer_size = buffer_size; + this -> EP = EP; + this -> TP = TP; + this -> local_expert_nums = local_expert_nums; + this -> is_moe_averaged = is_moe_averaged; + this -> is_alltoallvc = is_alltoallvc; + this -> comm_n = p_value * n0; + + this -> global_tokens_per_expert_matrix = reinterpret_cast<__gm__ int32_t *> (global_tokens_per_expert_matrix); + this -> expert_nums = EP * local_expert_nums; + this -> maxOutputSize = max_output_size; + if (is_moe_averaged) { + sum_m_loop = DivCeil((m / expert_nums) * EP, m0) * local_expert_nums; + max_m = m; + } else { + if (maxOutputSize == -1) { + max_m = 0; + for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) { + int32_t sum_m_ep = 0; + for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { + int32_t expert_id = local_expert_id +ep_idx * local_expert_nums; + for (int32_t i = 0; i < EP; i++) { + sum_m_ep += global_tokens_per_expert_matrix[expert_nums + expert_id]; + } + } + max_m = max(max_m, sum_m_ep); + } + } else { + max_m = maxOutputSize; + } + + for (int32_t i = 0; i < local_expert_nms; i++) { + int32_t last_sum_m = (i == 0 ? 0 : sum_m[i - 1]); + for (int j = 0; j < EP; j++) { + sum_m[i] += global_tokens_per_expert_matrix[j * expert_nums + rank * local_expert_nums + i]; + } + if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize) { + sum_m[i] = maxOutputSize - last_sum_m; + } + sum_m_loop += DivCeil(sum_m[i], m0); + sum_m[i] += (i == 0 ? 
0 : sum_m[i - 1]); + } + } + sum_loop = 0; + } + + inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) + { + LoopPerTokenDequanter loop_dequanter(n0); + loop_dequanter.SetForLoop(); + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t loop_idx = cal_idx * core_num + core_idx; + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >- core_loop) + break; + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + TileLoopIter tit(m_actual, n_actual); + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; + for (tit.Init(LoopPerTokenDequanter::max_len); !tit.End(); tit.Next()) { + int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; + auto buff = gm_buff + offset; + auto scale = gm_dequant_scale_pertoken + m_idx * m0 + tit.m_offset_in_tile(); + loop_dequanter.Loop(buff, scale, tit.m_this_loop, tit.n_this_loop, n0); + } + } + loop_dequanter.WaitForLoop(); + } + +private: + int32_t core_num; + int32_t core_idx; + int32_t m0; + int32_t k0; + int32_t n0; + int32_t m_loop; + int32_t n_loop; + int32_t core_loop; + int32_t loop_num_per_comm; + int32_t swizzl_direct; + int32_t swizzl_count; + + int32_t p_value; + int32_t rank_size; + __gm__ T *gm_buff; + __gm__ float32_t *gm_dequant_scale_pertoken; + int32_t loop_per_EP; + int32_t rank; + int32_t EP; + int32_t TP; + int32_t local_expert_nums; + int32_t is_moe_averaged; + int32_t is_alltoallvc; + int32_t buffer_size; + __gm__ int32_t *out_loop_per_expert; + __gm__ int32_t *out_loop_per_ep; + __gm__ int32_t *sum_num_local_tokens_per_expert; + __gm__ int32_t *sum_num_global_tokens_per_local_expert; + __gm__ int32_t *in_expert_comm_count_accum; + __gm__ int32_t *num_local_tokens_per_expert; + __gm__ int32_t *num_global_tokens_per_local_expert; + int32_t sum_loop; + __gm__ int32_t* global_tokens_per_expert_matrix; + int32_t max_m; + int32_t sum_m[32] = {0}; + int32_t sum_m_loop = 0; + int32_t comm_n; + int32_t comm_k; + int64_t gm_a_pingpong_size; + int64_t gm_a_pingpong_num; + int32_t cal_count; + int32_t maxOutputSize; +}; +#endif +#endif + + + + + + + + + + + \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cpp b/comm/lcal/src/kernels/coc_dequant_runner.cpp deleted file mode 100644 index e645684a..00000000 --- a/comm/lcal/src/kernels/coc_dequant_runner.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ - -#ifndef __COC_DEQUANTER__ -#define __COC_DEQUANTER__ - -#ifdef __DAV_C220_VEC__ - -#include -#include "coc_internal.cce" - -template -class LoopDequanter { -}; - -template <> -class LoopDequanter { -public: - static constexpr int32_t max_len = 9792; - inline __aicore__ LoopDequanter() = default; - inline __aicore__ void SetForLoop() - { - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - } - - inline __aicore__ void WaitForLoop() - { - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); - } - - inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, int32_t offset, - int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) - { - is_ping = !is_ping; - auto ub_in = is_ping ? ub_in0 : ub_in1; - auto ub_out = is_ping ? ub_out0 : ub_out1; - auto event_id = is_ping ? EVENT_ID0 : EVENT_ID1; - - int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); - int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); - WaitFlag(event_id); - CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); - SetFlag(event_id); - WaitFlag(event_id); - Vadds(ub_adds, ub_in, offset, repeat, 1, 1, 8, 8); - SetFlag(event_id); - - PipeBarrier(); - Vconv(ub_adds_f32, ub_adds, repeat, 1, 1, 8, 8); - SetFlag(event_id); - - WaitFlag(event_id); - CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); - SetFlag(event_id); - } - -private: - static constexpr uint8_t repeat = 153; - __ubuf__ bfloat16_t *ub_out0 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); - __ubuf__ bfloat16_t *ub_out1 = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)19584); - __ubuf__ float32_t *ub_adds_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)39936); - __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)79104); - __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)118272); - __ubuf__ int32_t *ub_adds = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); - __ubuf__ float32_t *ub_muls = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440); - - bool is_ping = false; -}; - -template <> -class LoopDequanter { -public: - static constexpr int32_t max_len = 8192; - inline __aicore__ LoopDequanter() = default; - inline __aicore__ void SetForLoop() - { - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID0); - } - inline __aicore__ void WaitForLoop() - { - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); - WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID0); - } - inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale, - int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) - { - is_ping = !is_ping; - auto ub_in = is_ping ? ub_in0 : ub_in1; - auto ub_out = is_ping ? 
ub_out0 : ub_out1; - int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); - int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); - - WaitFlag(event_id); - CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); - SetFlag(event_id); - WaitFlag(event_id); - Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 8); - SetFlag(event_id); - - WaitFlag(EVENT_ID2); - if (scale_rows == 0 || scale_source != scale) { - scale_rows = 1; - scale_source = scale; - CopyGmToUbufAlign(ub_scale, scale, 1, n_cols_this_loop, 0); - } - SetFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); - for (; scale_rows < n_rows_this_loop; ++scale_rows) { - CopyUB2UB(ub_scale + scale_rows * n_blocks * Block32B::size, ub_scale, - 0, 1, n_blocks, 0, 0); - } - PipeBarrier(); - Vmul(ub_mul, ub_in_f32, ub_scale, repeat, 1, 1, 1, 8, 8, 8); - SetFlag(EVENT_ID2); - WaitFlag(EVENT_ID0); - Vconv(ub_out, ub_mul, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); - CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); - SetFlag(EVENT_ID0); - } - -private: - static constexpr uint8_t repeat = 128; - __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); - __ubuf__ float32_t *ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)32768); - __ubuf__ float32_t *ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); - __ubuf__ float32_t *ub_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98650); - __ubuf__ bfloat16_t *ub_out = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)131328); - __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)163840); - - __gm__ float32_t *scale_source = nullptr; - int32_t scale_rows = 0; - bool is_ping = false; -}; - -template -class LoopPerTokenDequanter { -public: - static constexpr int32_t max_len = 8 * 32 / 4 * 128; - - inline __aicore__ LoopPerTokenDequanter(int32_t n0) - { - n_round = (n0 + 127) / 128 * 128; - ub_in0 = reinterpret_cast<__ubuf__ T *>((uintptr_t)0); - ub_in1 = reinterpret_cast<__ubuf__ T *>(ub_in0 + max_len); - ub_out = reinterpret_cast<__ubuf__ T *>(ub_in1 + max_len); - ub_scales = reinterpret_cast<__ubuf__ float32_t *>(ub_out + max_len); - ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_scales + max_len); - ub_out_f32 = reinterpret_cast<__ubuf__ float32_t *>(ub_in_f32 + max_len); - } - - inline __aicore__ void SetForLoop() - { - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - } - - inline __aicore__ void WaitForLoop() - { - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - } - - inline __aicore__ void Loop(__gm__ T *buff, __gm__ float32_t *scale, - int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t stride) - { - is_ping = !is_ping; - auto ub_in = is_ping ? ub_in0 : ub_in1; - auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; - int32_t ubufGap = Block32B::Count(n_round) - Block32B::Count(n_cols_this_loop); - WaitFlag(event_id); - CopyGmToUbufAlign(ub_in, buff, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); - SetFlag(event_id); - WaitFlag(event_id); - WaitFlag(event_id); - Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 4); - SetFlag(event_id); - - WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); - if (scale_source != scale) { - scale_source = scale; - CopyGmToUbufAlign(ub_scales, scale, 1, n_cols_this_loop, 0); - } - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - WaitFlag - } - -} -- Gitee From 19cc32b53a06cfd5d17d782f2abfa5a1f6f64513 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:35:05 +0800 Subject: [PATCH 345/414] 5 --- comm/lcal/src/kernels/coc_internal.cce | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 706ddbf8..45444850 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -138,9 +138,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) __gm__ T_INPUT1 *gm_a, __gm__ T_INPUT2 *gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_gamma, \ __gm__ T_OUTPUT *gm_out, __gm__ T_OUTPUT *gm_allgather_out, GM_ADDR gm_workspace, \ GM_ADDR gm_dequant_scale, GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, \ - GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, \ - __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t *num_global_tokens_per_local_expert, \ - __gm__ int32_t *global_tokens_per_expert_matrix, GM_ADDR para_gm + GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, GM_ADDR_para_gm #define COC_ARGS_FUN_IO(T_INPUT, T_OUTPUT) COC_ARGS_FUN_IIO(T_INPUT, T_INPUT, T_OUTPUT) @@ -454,25 +452,5 @@ public: //int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE]; }; -FORCE_INLINE_AICORE void CommMatrixTrunc(__gm__ int32_t* global_tokens_per_expert_matrix, __gm__ int32_t* workspace, int32_t EP, int32_t local_expert_nums, int32_t maxOutputSize) -{ - int32_t expert_nums = local_expert_nums * EP; - for (int32_t i = 0; i < EP; i++) { - int32_t sum_tokens = 0; - for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { - int32_t expert_id = i * local_expert_nums + local_expert_id; - for (int32_t j = 0; j < EP; j++) { - if (sum_tokens + global_tokens_per_expert_matrix[j * expert_nums + expert_id] - >= maxOutputSize) { - workspace[j * expert_nums + expert_id] = maxOutputSize - sum_tokens; - sum_tokens = maxOutputSize; - } else { - workspace[j * expert_nums + expert_id] = global_tokens_per_expert_matrix[j * expert_nums + expert_id]; - sum_tokens += global_tokens_per_expert_matrix[j * expert_nums + expert_id]; - } - } - } - } -} #endif // LCAL_COC_INTERNAL_H \ No newline at end of file -- Gitee From 3a23f878dabd6c636ec6d16d1a877e6863f0dd4d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:36:51 +0800 Subject: [PATCH 346/414] 8 --- comm/lcal/src/kernels/coc_internal.cce | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 45444850..c5a5cab2 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -138,7 +138,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) __gm__ T_INPUT1 *gm_a, __gm__ T_INPUT2 *gm_b, __gm__ T_OUTPUT 
*gm_bias, __gm__ T_OUTPUT *gm_gamma, \ __gm__ T_OUTPUT *gm_out, __gm__ T_OUTPUT *gm_allgather_out, GM_ADDR gm_workspace, \ GM_ADDR gm_dequant_scale, GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, \ - GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, GM_ADDR_para_gm + GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, GM_ADDR para_gm #define COC_ARGS_FUN_IO(T_INPUT, T_OUTPUT) COC_ARGS_FUN_IIO(T_INPUT, T_INPUT, T_OUTPUT) @@ -146,17 +146,13 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define COC_ARGS_CALL() \ gm_a, gm_b, gm_bias, gm_gamma, gm_out, gm_allgather_out, gm_workspace, gm_dequant_scale, gm_dequant_offset, \ - gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ - num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - global_tokens_per_expert_matrix, para_gm + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, para_gm #define COC_ARGS_CALL_INT8() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), reinterpret_cast(gm_bias), \ reinterpret_cast(gm_gamma), reinterpret_cast(gm_out), \ reinterpret_cast(gm_allgather_out), gm_workspace, gm_dequant_scale, gm_dequant_offset, \ - gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ - num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - global_tokens_per_expert_matrix, para_gm + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, para_gm #define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ GM_ADDR gm_a, GM_ADDR gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_c, \ -- Gitee From 13b01aefbe162703159bcd17c3e10c14cfce9a77 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:38:11 +0800 Subject: [PATCH 347/414] 0 --- comm/lcal/src/kernels/coc_internal.cce | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index c5a5cab2..ae4cc37c 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -162,9 +162,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) int32_t swizzl_count, int32_t swizzl_direct, int32_t rank, int32_t rank_size, int32_t p_value, \ int32_t withSerialMode, QuantGranularity quant_granularity, QuantGranularity dequant_granularity, \ int32_t ag_dim, int32_t rs_dim, bool inner_dim_is_Ag, bool weight_nz, bool is_91093, \ - __gm__ int32_t *num_local_tokens_per_expert, __gm__ int32_t * num_global_tokens_per_local_expert, \ - __gm__ int32_t *global_tokens_per_expert_matrix, int32_t local_expert_nums, int32_t EP, int32_t TP, \ - int32_t maxOutputSize, int32_t is_moe, bool is_deterministic, int32_t buffer_size \ + bool is_deterministic, int32_t buffer_size \ #define PP_MATMUL_AIC_ARGS_CALL() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), gm_bias, gm_c, gm_peer_mem, \ @@ -180,8 +178,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ QuantGranularity dequant_granularity, int32_t dequant_group_size, QuantGranularity quant_granularity, \ - int32_t quant_group_size, int32_t weight_nz, int32_t is_moe, int32_t is_moe_averaged, int32_t is_alltoallvc, \ - int32_t EP, int32_t TP, int32_t local_expert_nums, bool is_deterministic + int32_t quant_group_size, bool weight_nz, bool is_deterministic #define PP_MATMUL_AIV_PADDING_ARGS_CALL() \ reinterpret_cast(gm_a), 
reinterpret_cast(gm_b), \ -- Gitee From 9d76898bf5412cc1a100b2272becd622cce21c44 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:39:21 +0800 Subject: [PATCH 348/414] 4 --- comm/lcal/src/kernels/coc_internal.cce | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index ae4cc37c..b8da243c 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -170,15 +170,14 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) reinterpret_cast(gm_dequant_offset), batch_size, m, k, n, m0, k0, n0, m_loop, k_loop, \ n_loop, core_loop, swizzl_count, swizzl_direct, rank, rank_size, p_value, withSerialMode, quant_granularity, \ dequant_granularity, ag_dim, rs_dim, inner_dim_is_Ag, weight_nz, is_91093, \ - num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ - global_tokens_per_expert_matrix, local_expert_nums, EP, TP, maxOutputSize, is_moe, is_deterministic, buffer_size \ + is_deterministic, buffer_size \ #define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ QuantGranularity dequant_granularity, int32_t dequant_group_size, QuantGranularity quant_granularity, \ - int32_t quant_group_size, bool weight_nz, bool is_deterministic + int32_t quant_group_size, int32_t weight_nz, bool is_deterministic #define PP_MATMUL_AIV_PADDING_ARGS_CALL() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), \ -- Gitee From 0ab24ce65808ab8cc3ba41b9d5d16fc8f0cb0f99 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:40:00 +0800 Subject: [PATCH 349/414] 7 --- comm/lcal/src/kernels/coc_internal.cce | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index b8da243c..18761af7 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -184,8 +184,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ reinterpret_cast(gm_dequant_offset), reinterpret_cast(gm_quant_scale), \ reinterpret_cast(gm_quant_offset), batch_size, m, k, n, trans_a, trans_b, is_int8, \ - dequant_granularity, dequant_group_size, quant_granularity, quant_group_size, weight_nz, is_moe, \ - is_moe_averaged, is_alltoallvc, EP, TP, local_expert_nums, is_deterministic + dequant_granularity, dequant_group_size, quant_granularity, quant_group_size, weight_nz, is_deterministic #define PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN() \ GM_ADDR gm_bias, GM_ADDR gm_out, int32_t batch_size, int32_t m, int32_t n, int32_t rank_size -- Gitee From 69ec3fa08b26fbec9e2f8f3aea3cfa284382de80 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:42:27 +0800 Subject: [PATCH 350/414] 9 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 17c29357..49657a5b 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -60,7 +60,12 @@ public: PipeBarrier(); Vconv(ub_adds_f32, ub_adds, repeat, 1, 1, 8, 8); - SetFlag(event_id); + 
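// Scale the f32 intermediate by the dequant scale (Vmuls) before rounding + // back to bfloat16; the removed SetFlag released the buffer before any + // scaling had actually been applied. + 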
PipeBarrier();
+ Vmuls(ub_muls, ub_adds_f32, scale, repeat, 1, 1, 8, 8);
+ PipeBarrier();
+ WaitFlag(event_id);
+ Vconv(ub_out, ub_muls, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT);
+ SetFlag(event_id);
 WaitFlag(event_id);
 CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop,
 dst_stride - n_cols_this_loop);
-- Gitee

From 9fe429bbb817837f06d654fc1653525f5a9c7032 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Thu, 28 Aug 2025 15:49:19 +0800
Subject: [PATCH 351/414] fix dequant runner: ub_muls cast, per-token scale pointer, int64 tile offsets, loop bound

---
 comm/lcal/src/kernels/coc_dequant_runner.cce | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce
index 49657a5b..85253877 100644
--- a/comm/lcal/src/kernels/coc_dequant_runner.cce
+++ b/comm/lcal/src/kernels/coc_dequant_runner.cce
@@ -80,7 +80,7 @@ private:
 __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)79104);
 __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)118272);
 __ubuf__ int32_t *ub_adds = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440);
- __ubuf__ float32_t *ub_muls = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)157440);
+ __ubuf__ float32_t *ub_muls = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)157440);
 bool is_ping = false;
 };

@@ -104,7 +104,7 @@ public:
 WaitFlag(EVENT_ID2);
 WaitFlag(EVENT_ID0);
 }
- inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, float32_t scale,
+ inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, __gm__ float32_t *scale,
 int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride)
 {
 is_ping = !is_ping;
@@ -524,10 +524,10 @@ public:
 int32_t n_cols_complete;
 int32_t n_rows_this_loop;
 int32_t n_cols_this_loop;
- int32_t core_offset;
- int32_t batch_offset;
- int32_t row_offset;
- int32_t col_offset;
+ int64_t core_offset;
+ int64_t batch_offset;
+ int64_t row_offset;
+ int64_t col_offset;
 };

 __aicore__ explicit SerialDequantRunner() = default;
@@ -550,7 +550,7 @@ public:
 default:
 break;
 }
- Barrier;
+ Barrier();
 }

 private:
@@ -924,7 +924,7 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work
 int32_t loop_idx = cal_idx * core_num + core_idx;
 for (int32_t p = 0; p < p_value; p++) {
 int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx;
- if (loop_idx >- core_loop)
+ if (loop_idx >= core_loop)
 break;
 int64_t m_idx, n_idx;
 GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
-- Gitee

From 198f7596804772ecfc72b175c58cfcb386260ba8 Mon Sep 17 00:00:00 2001
From: LiuHaoyu
Date: Thu, 28 Aug 2025 15:52:36 +0800
Subject: [PATCH 352/414] remove unused MoE token-distribution state from the dequant runner

---
 comm/lcal/src/kernels/coc_dequant_runner.cce | 88 ++------------------
 1 file changed, 6 insertions(+), 82 deletions(-)

diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce
index 85253877..ca65ed65 100644
--- a/comm/lcal/src/kernels/coc_dequant_runner.cce
+++ b/comm/lcal/src/kernels/coc_dequant_runner.cce
@@ -737,14 +737,6 @@ public:
 this-> p_value = p_value;
 this-> rank_size = EP * TP;
 this-> rank = rank;
- this-> EP = EP;
- this-> TP = TP;
- this-> local_expert_nums = local_expert_nums;
- this-> is_moe_averaged = is_moe_averaged;
- this-> is_alltoallvc = is_alltoallvc;
- this-> num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (num_local_tokens_per_expert);
- this-> num_global_tokens_per_local_expert =
- reinterpret_cast<__gm__ int32_t *> 
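// Editor's note: a minimal sketch (not part of the patch series) of why the
// @@ -524,10 hunk above widens core_offset/batch_offset/row_offset/col_offset
// from int32_t to int64_t. A flattened tile offset is batch_idx * m * n;
// already at m = n = 65536 that product is 2^32 and wraps in 32-bit
// arithmetic. The helper below is illustrative only.
#include <cstdint>
inline int64_t TileOffset(int32_t batch_idx, int32_t m, int32_t n)
{
    // Widen before multiplying; casting the finished product would be too late.
    return static_cast<int64_t>(batch_idx) * m * n;
}
// TileOffset(1, 65536, 65536) == 4294967296, beyond what int32_t can hold.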
(num_global_tokens_per_local_expert); } private: @@ -756,25 +748,14 @@ private: int32_t n_loop; int32_t core_loop; int32_t loop_num_per_comm; - int32_t swizzl_count; + int32_t swizzl_direct; int32_t p_value; int32_t rank_size; - int32_t loop_per_EP; + int32_t rank; - int32_t EP; - int32_t TP; - int32_t local_expert_nums; - int32_t is_moe_averaged; - int32_t is_alltoallvc; - - __gm__ int32_t *out_loop_per_expert; - __gm__ int32_t *out_loop_per_ep; - __gm__ int32_t *sum_num_local_tokens_per_expert; - __gm__ int32_t *sum_num_global_tokens_per_local_expert; - __gm__ int32_t *in_expert_comm_count_accum; - __gm__ int32_t *num_local_tokens_per_expert; - __gm__ int32_t *num_global_tokens_per_local_expert; + + int32_t sum_loop; }; @@ -870,50 +851,9 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work this -> rank_size = EP * TP; this -> rank = rank; this -> buffer_size = buffer_size; - this -> EP = EP; - this -> TP = TP; - this -> local_expert_nums = local_expert_nums; - this -> is_moe_averaged = is_moe_averaged; - this -> is_alltoallvc = is_alltoallvc; - this -> comm_n = p_value * n0; - - this -> global_tokens_per_expert_matrix = reinterpret_cast<__gm__ int32_t *> (global_tokens_per_expert_matrix); - this -> expert_nums = EP * local_expert_nums; - this -> maxOutputSize = max_output_size; - if (is_moe_averaged) { - sum_m_loop = DivCeil((m / expert_nums) * EP, m0) * local_expert_nums; - max_m = m; - } else { - if (maxOutputSize == -1) { - max_m = 0; - for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) { - int32_t sum_m_ep = 0; - for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { - int32_t expert_id = local_expert_id +ep_idx * local_expert_nums; - for (int32_t i = 0; i < EP; i++) { - sum_m_ep += global_tokens_per_expert_matrix[expert_nums + expert_id]; - } - } - max_m = max(max_m, sum_m_ep); - } - } else { - max_m = maxOutputSize; - } - for (int32_t i = 0; i < local_expert_nms; i++) { - int32_t last_sum_m = (i == 0 ? 0 : sum_m[i - 1]); - for (int j = 0; j < EP; j++) { - sum_m[i] += global_tokens_per_expert_matrix[j * expert_nums + rank * local_expert_nums + i]; - } - if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize) { - sum_m[i] = maxOutputSize - last_sum_m; - } - sum_m_loop += DivCeil(sum_m[i], m0); - sum_m[i] += (i == 0 ? 
0 : sum_m[i - 1]); - } - } - sum_loop = 0; - } + + inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) { @@ -962,23 +902,8 @@ private: __gm__ float32_t *gm_dequant_scale_pertoken; int32_t loop_per_EP; int32_t rank; - int32_t EP; - int32_t TP; - int32_t local_expert_nums; - int32_t is_moe_averaged; - int32_t is_alltoallvc; int32_t buffer_size; - __gm__ int32_t *out_loop_per_expert; - __gm__ int32_t *out_loop_per_ep; - __gm__ int32_t *sum_num_local_tokens_per_expert; - __gm__ int32_t *sum_num_global_tokens_per_local_expert; - __gm__ int32_t *in_expert_comm_count_accum; - __gm__ int32_t *num_local_tokens_per_expert; - __gm__ int32_t *num_global_tokens_per_local_expert; - __gm__ int32_t *num_local_tokens_per_expert; - __gm__ int32_t *num_global_tokens_per_local_expert; int32_t sum_loop; - __gm__ int32_t* global_tokens_per_expert_matrix; int32_t max_m; int32_t sum_m[32] = {0}; int32_t sum_m_loop = 0; @@ -987,7 +912,6 @@ private: int64_t gm_a_pingpong_size; int64_t gm_a_pingpong_num; int32_t cal_count; - int32_t maxOutputSize; }; #endif #endif -- Gitee From df4681c11dd677fce771cf724fadf9f0c2f979a7 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:54:06 +0800 Subject: [PATCH 353/414] 9 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index ca65ed65..00315680 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -792,7 +792,6 @@ public: int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, - __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) { this -> gm_buff = gm_buff; this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; @@ -800,12 +799,6 @@ public: this -> n = n; core_num = get_block_num(); core_idx = get_block_idx(); - loop_per_EP = p_value * core_num / (EP * TP); - out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.out_loop_per_expert); - out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.out_loop_per_ep); - sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.sum_num_local_tokens_per_expert); - sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.sum_num_global_tokens_per_local_expert); - in_expert_comm_count_accum = reinterpret_cast<__gm__ int32_t *> (workspace_info.in_expert_comm_count_accum); this -> n_loop = n_loop; this -> m_loop = m_loop; this -> m0 = m0; @@ -816,14 +809,6 @@ public: this -> rank_size = EP * TP; this -> rank = rank; - this -> EP = EP; - this -> TP = TP; - this -> local_expert_nums = local_expert_nums; - this -> is_moe_averaged = is_moe_averaged; - this -> is_alltoallvc = is_alltoallvc; - this-> num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (num_local_tokens_per_expert); - this-> num_global_tokens_per_local_expert = - reinterpret_cast<__gm__ int32_t *> (num_global_tokens_per_local_expert); } inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & workspace_info, -- Gitee From 183c3f029c9c54ab90590694bf2ea20c8c2871a4 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 15:58:42 +0800 
Subject: [PATCH 354/414] 5 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 43 +++++++------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 00315680..acdc7981 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -616,8 +616,8 @@ public: int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, int32_t p_value, int32_t rank_size) { - BaseDequantRunner::SetArgs(gm_out, worspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, - batch_size, m, n, m0, n0); + BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, + batch_size, m, n); core_num = get_block_num(); core_idx = get_block_idx(); this-> m0 = m0; @@ -634,16 +634,16 @@ public: inline __aicore__ void RunDequantAllreduce(int32_t cal_idx) { switch (dequant_granularity) { - case QuantAGranularity:: PER_TENSOR : + case QuantGanularity:: PER_TENSOR : DequantAllReducePerTensor(cal_idx); return; - case QuantAGranularity:: PER_CHANNEL : + case QuantGanularity:: PER_CHANNEL : DequantAllReducePerChannel(cal_idx); return; - case QuantAGranularity:: PER_TOKEN : + case QuantGanularity:: PER_TOKEN : DequantAllReducePerChannel(cal_idx); return; - case QuantAGranularity:: FLOAT32_SCALE_PER_CHANNEL : + case QuantGanularity:: FLOAT32_SCALE_PER_CHANNEL : DequantAllReducePerChannel(cal_idx); return; default: @@ -806,17 +806,15 @@ public: this -> swizzl_direct = swizzlel_direct; this -> swizzl_count = swizzle_count; this -> p_value = p_value; - this -> rank_size = EP * TP; + // this -> rank_size = EP * TP; this -> rank = rank; } inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & workspace_info, __gm__ float32_t *gm_dequant_scale_pertoken, - int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, - int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, int32_t max_output_size, int32_t buffer_size, - __gm__ int32_t* global_tokens_per_expert_matrix) + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, int32_t k0,int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value) { this -> gm_buff = gm_buff; this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; @@ -831,9 +829,9 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work this -> k0 = k0; this -> n0 = n0; this -> swizzl_direct = swizzlel_direct; - this -> swizzl_count = swizzle_count; + this -> swizzl_count = swizzlel_count; this -> p_value = p_value; - this -> rank_size = EP * TP; + // this -> rank_size = EP * TP; this -> rank = rank; this -> buffer_size = buffer_size; @@ -859,9 +857,9 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + (loop_idx % loop_num_per_comm) * m0 * n0; for (tit.Init(LoopPerTokenDequanter::max_len, n0); !tit.End(); tit.Next()) { - int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; - auto buff = gm_buff + offset; - auto scale = gm_dequant_scale_pertoken + m_idx * m0 + 
tit.m_offset_in_tile(); + int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; // 子核当前需要处理的字节偏移 + auto buff = gm_buff + offset; // 通信缓冲内的地址 + auto scale = gm_dequant_scale_pertoken + m_idx * m0 + tit.m_offset_in_tile(); // 注意要加上m_offset_in_tile loop_dequanter.Loop(buff, scale, tit.m_this_loop, tit.n_this_loop, n0); } } @@ -899,15 +897,4 @@ private: int32_t cal_count; }; #endif -#endif - - - - - - - - - - -} \ No newline at end of file +#endif \ No newline at end of file -- Gitee From c3d0f88a7ff6eaf5de261d1407d4b9b2fe8f54a0 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:01:18 +0800 Subject: [PATCH 355/414] 9 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 24 ++++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index acdc7981..9f71c8eb 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -714,7 +714,7 @@ public: inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorspaceInfo &workspace_info, __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) { @@ -723,16 +723,11 @@ public: core_num = get_block_num(); core_idx = get_block_idx(); - loop_per_EP = p_value * core_num / (EP * TP); - out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_expert); - out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_EP); - sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_local_tokens_per_expert); - sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_global_tokens_per_local_expert); this-> n_loop = n_loop; this-> m_loop = m_loop; this-> m0 = m0; this-> n0 = n0; - this-> swizzl_direct = swizzlel_direct; + this-> swizzl_direct = swizzle_direct; this-> swizzl_count = swizzle_count; this-> p_value = p_value; this-> rank_size = EP * TP; @@ -790,7 +785,7 @@ public: inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo &workspace_info, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, { this -> gm_buff = gm_buff; @@ -803,7 +798,7 @@ public: this -> m_loop = m_loop; this -> m0 = m0; this -> n0 = n0; - this -> swizzl_direct = swizzlel_direct; + this -> swizzl_direct = swizzle_direct; this -> swizzl_count = swizzle_count; this -> p_value = p_value; // this -> rank_size = EP * TP; @@ -814,7 +809,7 @@ public: inline __aicore__ void SetArgs(__gm__ T 
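// Editor's note on the Chinese comments in the hunk above (kept verbatim in
// the diff record): "子核当前需要处理的字节偏移" = "offset the sub-core
// currently has to process", "通信缓冲内的地址" = "address inside the
// communication buffer", "注意要加上m_offset_in_tile" = "note that
// m_offset_in_tile must be added". One plain-C++ reading of the per-token
// dequant this code performs (one float32 scale per row/token; names here are
// illustrative, not the kernel's):
#include <cstdint>
void DequantPerTokenRef(const int32_t *accum, const float *scale, float *out,
                        int n_rows, int n_cols)
{
    for (int r = 0; r < n_rows; ++r) {
        for (int c = 0; c < n_cols; ++c) {
            // Row r is scaled by scale[r]; the kernel does this with Vmuls.
            out[r * n_cols + c] =
                static_cast<float>(accum[r * n_cols + c]) * scale[r];
        }
    }
}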
*gm_buff, const LcalWorkspaceInfo & workspace_info, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, int32_t k0,int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzlel_direct, int32_t swizzle_count, int32_t p_value) + int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) { this -> gm_buff = gm_buff; this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; @@ -828,15 +823,14 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work this -> m0 = m0; this -> k0 = k0; this -> n0 = n0; - this -> swizzl_direct = swizzlel_direct; - this -> swizzl_count = swizzlel_count; + this -> swizzl_direct = swizzle_direct; + this -> swizzl_count = swizzle_count; this -> p_value = p_value; // this -> rank_size = EP * TP; this -> rank = rank; this -> buffer_size = buffer_size; - - + } inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) { @@ -857,7 +851,7 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + (loop_idx % loop_num_per_comm) * m0 * n0; for (tit.Init(LoopPerTokenDequanter::max_len, n0); !tit.End(); tit.Next()) { - int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; // 子核当前需要处理的字节偏移 + int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; // 子核当前需处理的字节偏移 auto buff = gm_buff + offset; // 通信缓冲内的地址 auto scale = gm_dequant_scale_pertoken + m_idx * m0 + tit.m_offset_in_tile(); // 注意要加上m_offset_in_tile loop_dequanter.Loop(buff, scale, tit.m_this_loop, tit.n_this_loop, n0); -- Gitee From 99dd8b9d4ed97c7e81c64f4e0772bb3fb9caca1d Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:06:47 +0800 Subject: [PATCH 356/414] 0 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 50 +++++++++----------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 9f71c8eb..80809e2e 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -184,11 +184,11 @@ public: inline __aicore__ void WaitForLoop() { - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); } inline __aicore__ void Loop(__gm__ T *buff, __gm__ float32_t *scale, @@ -202,7 +202,6 @@ public: CopyGmToUbufAlign(ub_in, buff, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); SetFlag(event_id); WaitFlag(event_id); - WaitFlag(event_id); Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 4); SetFlag(event_id); @@ -210,7 +209,7 @@ public: WaitFlag(EVENT_ID2); if (scale_source != scale) { scale_source = scale; - CopyGmToUbufAlign(ub_scales, scale, 1, n_cols_this_loop, 0); + CopyGmToUbufAlign(ub_scales, scale, 1, n_rows_this_loop, 0); } SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); @@ -565,7 +564,7 @@ private: LoopIter it(batch_size, m, n); LoopDequanter loop_dequanter; loop_dequanter.SetForLoop(); - for (it.InitColLoop()l !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { for (it.InitColLoop(max_n_per_loop); !it.EndColLoop(); it.NextColLoop()) { for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto dst = gm_out + it.offset(); @@ -577,22 
+576,22 @@ private: loop_dequanter.WaitForLoop(); } - inline __aicore__ void DequantPerChaneel() + inline __aicore__ void DequantPerChannel() { const auto max_len = LoopDequanter::max_len; - int32_t n_round = Block32BAlignUp(n); - int32_t max_m_per_loop = (n_round <= max_len) ? (max / n_round) : 1; - int32_t max_n_per_loop = (n <= max_len) ? n : max_len; + int32_t n_round = Block32B::AlignUp(n); + int32_t max_m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; + int32_t max_n_per_loop = (n_round <= max_len) ? n : max_len; LoopIter it(batch_size, m, n); LoopDequanter loop_dequanter; loop_dequanter.SetForLoop(); - for (it.InitColLoop()l !it.EndBatchLoop(); it.NextBatchLoop()) { + for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { for (it.InitColLoop(max_n_per_loop); !it.EndColLoop(); it.NextColLoop()) { for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto dst = gm_out + it.offset(); auto src = gm_accum + it.offset(); - loop_dequanter.Loop(dst, src, 0, 0, it.n_rows_this_loop, it.n_cols_this_loop, n, n); + loop_dequanter.Loop(dst, src, scale, it.n_rows_this_loop, it.n_cols_this_loop, n, n); } } } @@ -611,7 +610,7 @@ private: class FusedDequantRunner : public BaseDequantRunner { public: __aicore__ explicit FusedDequantRunner() = default; - inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkSpaceInfo & worspace_info, + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo & workspace_info, __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, int32_t p_value, int32_t rank_size) @@ -634,16 +633,16 @@ public: inline __aicore__ void RunDequantAllreduce(int32_t cal_idx) { switch (dequant_granularity) { - case QuantGanularity:: PER_TENSOR : + case QuantGranularity:: PER_TENSOR : DequantAllReducePerTensor(cal_idx); return; - case QuantGanularity:: PER_CHANNEL : + case QuantGranularity:: PER_CHANNEL : DequantAllReducePerChannel(cal_idx); return; - case QuantGanularity:: PER_TOKEN : + case QuantGranularity:: PER_TOKEN : DequantAllReducePerChannel(cal_idx); return; - case QuantGanularity:: FLOAT32_SCALE_PER_CHANNEL : + case QuantGranularity:: FLOAT32_SCALE_PER_CHANNEL : DequantAllReducePerChannel(cal_idx); return; default: @@ -705,20 +704,18 @@ public: int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; auto accum = gm_accum + src_offset; auto out = gm_out + dst_offset; - loop_dequanter.Loop(out, accum, scale, tit.m_this_loop, tit.n_this_loop, n0, n0); + loop_dequanter.Loop(out, accum, scale, 0, tit.m_this_loop, tit.n_this_loop, n0, n0); } } loop_dequanter.WaitForLoop(); } - inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorspaceInfo &workspace_info, + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, - int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, - __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) + int32_t 
core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) { - BaseDequantRunner::SetArgs(gm_out, work, gm_dequant_scale, gm_dequant_offset, dequant_granularity, + BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, batch_size, m, n); core_num = get_block_num(); @@ -785,8 +782,7 @@ public: inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo &workspace_info, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, - int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, + int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) { this -> gm_buff = gm_buff; this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; -- Gitee From 2504e40402f2bdbb78fb85fe5537355fdff9222e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:10:30 +0800 Subject: [PATCH 357/414] 0 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 80809e2e..14486627 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -46,7 +46,6 @@ public: { is_ping = !is_ping; auto ub_in = is_ping ? ub_in0 : ub_in1; - auto ub_out = is_ping ? ub_out0 : ub_out1; auto event_id = is_ping ? EVENT_ID0 : EVENT_ID1; int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); @@ -148,7 +147,7 @@ private: __ubuf__ int32_t *ub_in0 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0); __ubuf__ float32_t *ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)32768); __ubuf__ float32_t *ub_in_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); - __ubuf__ float32_t *ub_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98650); + __ubuf__ float32_t *ub_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98560); __ubuf__ bfloat16_t *ub_out = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)131328); __ubuf__ int32_t *ub_in1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)163840); @@ -219,7 +218,7 @@ public: PipeBarrier(); for (int32_t row = 0; row < n_rows_this_loop; ++row) { float32_t scale = ub_scales[row]; - Vmuls(ub_out_f32 + n_round * row, un_in_f32 + n_round * row, scale, (n_cols_this_loop + 127) / 128 * 2, 1, 1, 8, 8); 1); + Vmuls(ub_out_f32 + n_round * row, ub_in_f32 + n_round * row, scale, (n_cols_this_loop + 127) / 128 * 2, 1, 1, 8, 8); 1); } PipeBarrier(); Vconv(ub_out, ub_out_f32, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); @@ -443,21 +442,22 @@ public: batch_offset = 0; } - inline __aicore__ void EndBatchLoop() + inline __aicore__ bool EndBatchLoop() const { return batch_idx == batch_size; } - intline __aicore__ void NextBatchLoop() + inline __aicore__ void NextBatchLoop() { ++batch_idx; if (EndBatchLoop()) { return; } - batch_offset += static_casta(batch_idx) * n_rows * n_cols; + batch_offset = static_cast(batch_idx) * n_rows * n_cols; } } + inline __aicore__ void InitRowLoop(init32_t max_rows_per_loop) { n_rows_complete = 0; -- Gitee From 9e95b6fe220cb40e6e0b53724bc1aa4f080f19d5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:15:35 +0800 Subject: 
[PATCH 358/414] 0 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 14486627..6400318f 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -213,8 +213,8 @@ public: SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); // 注意必须是MTE2_s, 不能是MTE2_V 否则会读到0, 造成乱码 + WaitFlag(EVENT_ID2); PipeBarrier(); for (int32_t row = 0; row < n_rows_this_loop; ++row) { float32_t scale = ub_scales[row]; @@ -222,9 +222,9 @@ public: } PipeBarrier(); Vconv(ub_out, ub_out_f32, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); - SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); - SetFlag(EVENT_ID2); + SetFlag(EVENT_ID2); WaitFlag(EVENT_ID2); CopyUbufToGmAlign(buff, ub_out, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); @@ -239,7 +239,7 @@ private: __gm__ float32_t *scale_source = nullptr; __ubuf__ float32_t *ub_in_f32 = nullptr; __ubuf__ float32_t *ub_out_f32 = nullptr; - int32_t n_round = 0; + int32_t n_round; bool is_ping = false; }; @@ -262,7 +262,7 @@ public: WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID0); WaitFlag(EVENT_ID1); - set_ctrl(sbitset1(get_ctrl(), 59)); + set_ctrl(sbitset0(get_ctrl(), 59)); } inline __aicore__ void Loop(__gm__ float32_t *dst, __gm__ int64_t *src, int32_t len) @@ -375,7 +375,7 @@ public: inline __aicore__ void FormatScale() { int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); - int32_t align_core_num = get_blocknumm() * get_subblockdim(); + int32_t align_core_num = get_blocknum() * get_subblockdim(); int32_t len = LoopScaleFormater::max_len; int32_t loop_num = DivCeil(n, len); LoopScaleFormater loop_scale_formater; @@ -557,9 +557,9 @@ private: { float32_t scale = gm_format_dequant_scale[0]; const auto max_len = LoopDequant::max_len; - int32_t n_round = Block32BAlignUp(n); - int32_t max_m_per_loop = (n_round <= max_len) ? (max / n_round) : 1; - int32_t max_n_per_loop = (n <= max_len) ? n : max_len; + int32_t n_round = Block32B::AlignUp(n); + int32_t max_m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; + int32_t max_n_per_loop = (n_round <= max_len) ? n : max_len; LoopIter it(batch_size, m, n); LoopDequanter loop_dequanter; -- Gitee From 5ee304595a443e84face894c6816f23e832661d7 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:20:17 +0800 Subject: [PATCH 359/414] 4 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 6400318f..75ac859d 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -46,6 +46,7 @@ public: { is_ping = !is_ping; auto ub_in = is_ping ? ub_in0 : ub_in1; + auto ub_out = is_ping ? ub_out0 : ub_out1; auto event_id = is_ping ? EVENT_ID0 : EVENT_ID1; int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); @@ -108,7 +109,7 @@ public: { is_ping = !is_ping; auto ub_in = is_ping ? ub_in0 : ub_in1; - auto ub_out = is_ping ? ub_out0 : ub_out1; + auto event_id = is_ping ? 
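// Editor's note: the is_ping/ub_in0/ub_in1/EVENT_ID0/EVENT_ID1 pattern being
// adjusted above is classic double buffering: two staging buffers alternate
// between iterations so the copy-in of iteration i+1 can overlap the compute
// of iteration i, with one event guarding each buffer. A generic sketch of
// the selection logic (plain C++; Produce/Consume are hypothetical stand-ins
// for the copy and vector stages):
#include <array>
template <typename Buf, typename Produce, typename Consume>
void PingPongLoop(std::array<Buf, 2> &bufs, int iters, Produce produce, Consume consume)
{
    bool is_ping = false;
    for (int i = 0; i < iters; ++i) {
        is_ping = !is_ping;                // flip first, exactly as in Loop()
        Buf &cur = bufs[is_ping ? 0 : 1];  // same rule that picks ub_in0/ub_in1
        produce(cur);                      // fill while the peer buffer drains
        consume(cur);                      // hardware events serialize for real
    }
}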
EVENT_ID0 : EVENT_ID1; int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); @@ -163,7 +164,7 @@ public: inline __aicore__ LoopPerTokenDequanter(int32_t n0) { - n_round = (n0 + 127) / 128 * 128; + n_round = (n0 + 127) / 128 * 128; // n_this_loop + 127 / 128是需要的repeat数, 每个repeat占用8个blocks ub_in0 = reinterpret_cast<__ubuf__ T *>((uintptr_t)0); ub_in1 = reinterpret_cast<__ubuf__ T *>(ub_in0 + max_len); ub_out = reinterpret_cast<__ubuf__ T *>(ub_in1 + max_len); @@ -213,12 +214,12 @@ public: SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); // 注意必须是MTE2_s, 不能是MTE2_V 否则会读到0, 造成乱码 + WaitFlag(EVENT_ID2); // 注意必须是MTE2_S, 不能是MTE2_V, 否则会读到0, 造成乱码 WaitFlag(EVENT_ID2); PipeBarrier(); for (int32_t row = 0; row < n_rows_this_loop; ++row) { float32_t scale = ub_scales[row]; - Vmuls(ub_out_f32 + n_round * row, ub_in_f32 + n_round * row, scale, (n_cols_this_loop + 127) / 128 * 2, 1, 1, 8, 8); 1); + Vmuls(ub_out_f32 + n_round * row, ub_in_f32 + n_round * row, scale, (n_cols_this_loop + 127) / 128 * 2, 1, 1, 8, 8); } PipeBarrier(); Vconv(ub_out, ub_out_f32, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); @@ -458,7 +459,7 @@ public: } - inline __aicore__ void InitRowLoop(init32_t max_rows_per_loop) + inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop) { n_rows_complete = 0; n_rows_this_loop = (n_rows_this_core < max_rows_per_loop) ? n_rows_this_core : max_rows_per_loop; @@ -506,7 +507,7 @@ public: col_offset = n_cols_complete; } - inine __aicore__ int64_t offset() const + inline __aicore__ int64_t offset() const { return core_offset + row_offset * n_cols + col_offset; } @@ -727,7 +728,7 @@ public: this-> swizzl_direct = swizzle_direct; this-> swizzl_count = swizzle_count; this-> p_value = p_value; - this-> rank_size = EP * TP; + // this-> rank_size = EP * TP; this-> rank = rank; } -- Gitee From 87ba6b61a343e93e167a6ae67b192ad781826343 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:23:12 +0800 Subject: [PATCH 360/414] 9 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 75ac859d..035b6a98 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -214,7 +214,7 @@ public: SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); // 注意必须是MTE2_S, 不能是MTE2_V, 否则会读到0, 造成乱码 + WaitFlag(EVENT_ID2); // 注意必须是MTE2_S,不能是MTE2_V,否则会读到0, 造成乱码 WaitFlag(EVENT_ID2); PipeBarrier(); for (int32_t row = 0; row < n_rows_this_loop; ++row) { @@ -352,20 +352,20 @@ public: __aicore__ explicit BaseDequantRunner() = default; inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, - __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity quant_granularity, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, int32_t batch_size, int32_t m, int32_t n) { this->gm_accum = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_accum); - this->gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_format_dequant_scale); + this->gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_formate_dequant_scale); this->gm_out = gm_out; this->gm_dequant_scale = 
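// Editor's note (translating the Chinese comment revised above): "it must be
// MTE2_S, not MTE2_V; otherwise the read returns 0 and the output is
// garbled". The per-row scale is read as a scalar (ub_scales[row]), so the
// wait has to order the MTE2 copy-in against the scalar pipe; ordering it
// against the vector pipe alone leaves the scalar read racing the copy.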
gm_dequant_scale; this->gm_dequant_offset = gm_dequant_offset; - this->quant_granularity = quant_granularity; + this->dequant_granularity = dequant_granularity; this->batch_size = batch_size; this->m = m; this->n = n; if (dequant_granularity == QuantGranularity::PER_TENSOR) { - gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_format_dequant_scale); + gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale); } else if (dequant_granularity == QuantGranularity::PER_CHANNEL) { FormatScale(); } else { @@ -376,7 +376,7 @@ public: inline __aicore__ void FormatScale() { int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); - int32_t align_core_num = get_blocknum() * get_subblockdim(); + int32_t align_core_num = get_block_num() * get_subblockdim(); int32_t len = LoopScaleFormater::max_len; int32_t loop_num = DivCeil(n, len); LoopScaleFormater loop_scale_formater; @@ -557,7 +557,7 @@ private: inline __aicore__ void DequantPerTensor() { float32_t scale = gm_format_dequant_scale[0]; - const auto max_len = LoopDequant::max_len; + const auto max_len = LoopDequanter::max_len; int32_t n_round = Block32B::AlignUp(n); int32_t max_m_per_loop = (n_round <= max_len) ? (max_len / n_round) : 1; int32_t max_n_per_loop = (n_round <= max_len) ? n : max_len; -- Gitee From 333dc4efacdae6c833cd614d83189f72cc386c33 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 16:27:32 +0800 Subject: [PATCH 361/414] 0 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 035b6a98..5dd45a99 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -386,7 +386,7 @@ public: if (offset + len > n) { len = n - offset; } - } + loop_scale_formater.Loop(gm_format_dequant_scale + offset, gm_dequant_scale + offset, len); } loop_scale_formater.WaitForLoop(); @@ -456,7 +456,6 @@ public: } batch_offset = static_cast(batch_idx) * n_rows * n_cols; } - } inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop) @@ -486,7 +485,7 @@ public: inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) { n_cols_complete = 0; - n_cols_this_loop = (n_cols_this_core < max_cols_per_loop) ? n_cols : max_cols_per_loop; + n_cols_this_loop = (n_cols < max_cols_per_loop) ? n_cols : max_cols_per_loop; col_offset = 0; } -- Gitee From 0ea8807eeae35142051bea424efb0a8ed9d5ac0e Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 19:00:02 +0800 Subject: [PATCH 362/414] 4 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 5dd45a99..cfd9a1be 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -322,6 +322,16 @@ public: n_this_loop = n_this_subcore; } + inline __aicore__ void Init(int32_t max_max_len, int32_t n0) + { + // Block32B::AlignUp: 扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 + // m_this_subcore最大值: max_len / n_this_subcore, 16384/256=64 + int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); + m_complete = 0; + m_this_loop = max_m_per_loop > m_this_subcore ? 
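// Editor's note: the SetArgs hunk above settles the scale-source rule, worth
// stating once (hedged restatement of the visible code, not new behavior):
// PER_TENSOR reuses gm_dequant_scale in place as a single float32, so no
// reformat pass is needed; PER_CHANNEL runs FormatScale(), which converts the
// int64-packed per-channel scales into a contiguous float32 array that the
// dequant loops can index directly.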
m_this_subcore : max_m_per_loop; // 本次loop所处理的m, 最大值为max_m_per_loop + n_this_loop = max_n_per_loop; // 本次loop所处理的n + } + inline __aicore__ bool End() { return m_complete >= m_this_subcore; -- Gitee From 979aa7fab023875108a6a08f8b0b5714ce77fce1 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 19:02:50 +0800 Subject: [PATCH 363/414] 8 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index cfd9a1be..5378d2ac 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -322,14 +322,14 @@ public: n_this_loop = n_this_subcore; } - inline __aicore__ void Init(int32_t max_max_len, int32_t n0) + inline __aicore__ void Init(int32_t max_max_len, int32_t n0) // max_len = 8192或者9792 { - // Block32B::AlignUp: 扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 - // m_this_subcore最大值: max_len / n_this_subcore, 16384/256=64 + // Block32B::AlignUp: 扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 + // m_this_subcore最大值: max_len / n_this_subcore, 16384/256=64 int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); m_complete = 0; m_this_loop = max_m_per_loop > m_this_subcore ? m_this_subcore : max_m_per_loop; // 本次loop所处理的m, 最大值为max_m_per_loop - n_this_loop = max_n_per_loop; // 本次loop所处理的n + n_this_loop = n_this_subcore; // 本次loop所处理的n } inline __aicore__ bool End() -- Gitee From c17a3ea9f5901e313cff6ee96a1c41564f0418d4 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 20:54:48 +0800 Subject: [PATCH 364/414] 9 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 63 +++++++++++++------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 5378d2ac..956b9ac0 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -322,13 +322,13 @@ public: n_this_loop = n_this_subcore; } - inline __aicore__ void Init(int32_t max_max_len, int32_t n0) // max_len = 8192或者9792 + inline __aicore__ void Init(int32_t max_len, int32_t n0) // max_len = 8192或者9792 { // Block32B::AlignUp: 扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 // m_this_subcore最大值: max_len / n_this_subcore, 16384/256=64 int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); m_complete = 0; - m_this_loop = max_m_per_loop > m_this_subcore ? m_this_subcore : max_m_per_loop; // 本次loop所处理的m, 最大值为max_m_per_loop + m_this_loop = max_m_per_loop > m_this_subcore ? 
m_this_subcore : max_m_per_loop; // 本次loop所处理的m, 最大为max_m_per_loop n_this_loop = n_this_subcore; // 本次loop所处理的n } @@ -385,6 +385,10 @@ public: inline __aicore__ void FormatScale() { + // if (dequant_granularity != QuantGranularity::PER_CHANNEL) { + // return; + // } + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); int32_t align_core_num = get_block_num() * get_subblockdim(); int32_t len = LoopScaleFormater::max_len; @@ -601,12 +605,44 @@ private: for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto dst = gm_out + it.offset(); auto src = gm_accum + it.offset(); + //auto src = gm_accum; + auto scale = gm_dequant_scale_pertoken + it.offset(); loop_dequanter.Loop(dst, src, scale, it.n_rows_this_loop, it.n_cols_this_loop, n, n); } } } loop_dequanter.WaitForLoop(); } +}; + +template +class SerialPerTokenDequantRunner : public SerialDequantRunner { +public: + __aicore__ explicit SerialPerTokenDequantRunner() = default; + inline __aicore__ void SetArgs(__gm__ T *gm_out, + __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0) + { + this->gm_out = reinterpret_cast<__gm__ bfloat16_t *>(gm_out); + this->gm_dequant_scale_pertoken = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale_pertoken); + this->m = m; + this->n = n; + this->m0 = m0; + this->n0 = n0; + } + inline __aicore__ void Run() { + const auto max_len = LoopPerTokenDequanter::max_len; + int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); + LoopIter it(1, m, n); + LoopPerTokenDequanter loop_dequanter(n0); + for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + for (it.InitColLoop(n0); !it.EndColLoop(); it.NextColLoop()) { + __gm__ T * dst_add = gm_out + it.offset(); + __gm__ float32_t * scale = gm_out + it.offset(); + loop_dequanter.Loop(dst_add, scale, it.n_rows_this_loop, it.n_cols_this_loop, n); + } + } + loop_dequanter.WaitForLoop(); + } private: __gm__ T *gm_out; @@ -627,6 +663,7 @@ public: { BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, batch_size, m, n); + //cit.SetArgs(m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value); core_num = get_block_num(); core_idx = get_block_idx(); this-> m0 = m0; @@ -636,6 +673,7 @@ public: this-> core_loop = core_loop; this-> swizzl_direct = swizzl_direct; this-> swizzl_count = swizzl_count; + this-> loop_num_per_comm = p_value * core_num; this-> p_value = p_value; this-> rank_size = rank_size; } @@ -664,6 +702,7 @@ public: { LoopDequanter loop_dequanter; loop_dequanter.SetForLoop(); + //int32_t pipe_depth = is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT; int32_t pipe_depth = MAX_BLOCK_COUNT; int32_t flag_idx = cal_idx % pipe_depth; int32_t loop_idx = cal_idx * core_num + core_idx; @@ -695,6 +734,7 @@ public: LoopDequanter loop_dequanter; float32_t scale = gm_format_dequant_scale[0]; loop_dequanter.SetForLoop(); + //int32_t pipe_depth = is_91093 ? 
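// Editor's note: a worked instance of the tiling bound in the Init() above
// ("本次loop所处理的m" = "rows handled by this loop"), under the assumptions
// n0 = 200 and max_len = 16384, which is the 16384/256=64 case the original
// comment cites:
#include <cstdint>
constexpr int32_t RoundUpTo128(int32_t n0) { return (n0 + 127) / 128 * 128; }
static_assert(RoundUpTo128(200) == 256, "n0 padded to a full vector repeat");
static_assert(16384 / RoundUpTo128(200) == 64, "at most 64 rows per UB pass");
// So m_this_loop = min(m_this_subcore, 64): each pass stages up to 64 rows of
// the 256-element-wide padded tile.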
BLOCK_COUNT_4 : MAX_BLOCK_COUNT; int32_t pipe_depth = MAX_BLOCK_COUNT; int32_t flag_idx = cal_idx % pipe_depth; int32_t loop_idx = cal_idx * core_num + core_idx; @@ -720,26 +760,7 @@ public: loop_dequanter.WaitForLoop(); } - inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, - __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, - int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) - { - BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, - batch_size, m, n); - core_num = get_block_num(); - core_idx = get_block_idx(); - this-> n_loop = n_loop; - this-> m_loop = m_loop; - this-> m0 = m0; - this-> n0 = n0; - this-> swizzl_direct = swizzle_direct; - this-> swizzl_count = swizzle_count; - this-> p_value = p_value; - // this-> rank_size = EP * TP; - this-> rank = rank; - } private: int32_t core_num; -- Gitee From 3034b7f024a8e65d88de497f7cce58c71c6fbbec Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 20:57:21 +0800 Subject: [PATCH 365/414] 1 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 956b9ac0..6e8e8d06 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -606,7 +606,7 @@ private: auto dst = gm_out + it.offset(); auto src = gm_accum + it.offset(); //auto src = gm_accum; - auto scale = gm_dequant_scale_pertoken + it.offset(); + auto scale = gm_formatdequant_scale + it.col_offset(); loop_dequanter.Loop(dst, src, scale, it.n_rows_this_loop, it.n_cols_this_loop, n, n); } } @@ -622,7 +622,7 @@ public: inline __aicore__ void SetArgs(__gm__ T *gm_out, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0) { - this->gm_out = reinterpret_cast<__gm__ bfloat16_t *>(gm_out); + this->gm_out = reinterpret_cast<__gm__ t *>(gm_out); this->gm_dequant_scale_pertoken = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale_pertoken); this->m = m; this->n = n; @@ -637,7 +637,7 @@ public: for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { for (it.InitColLoop(n0); !it.EndColLoop(); it.NextColLoop()) { __gm__ T * dst_add = gm_out + it.offset(); - __gm__ float32_t * scale = gm_out + it.offset(); + __gm__ float32_t * scale = gm_dequant_scale_pertoken + it.row_offset + it.row_offset_this_core; loop_dequanter.Loop(dst_add, scale, it.n_rows_this_loop, it.n_cols_this_loop, n); } } @@ -663,7 +663,7 @@ public: { BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, batch_size, m, n); - //cit.SetArgs(m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value); + //cit.SetArgs(m, n, m0, n0, m_loop, n_loop, core_loop, swizzle_direct, swizzle_count, p_value); core_num = get_block_num(); core_idx = get_block_idx(); this-> m0 = m0; -- Gitee From 52a88e6a062950919085fd441c099b7209c4fd2c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Thu, 28 Aug 2025 20:59:28 +0800 Subject: [PATCH 366/414] 0 --- comm/lcal/src/kernels/coc_dequant_runner.cce | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce 
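// Editor's note: in the serial per-token Run() above, the scale pointer must
// advance with the row position only (one float32 per token/row), while dst
// advances with the full 2-D tile offset. That is what the replacement line
// switching the scale base from gm_out to gm_dequant_scale_pertoken encodes;
// indexing the scale array with the flattened it.offset() would run past row
// m as soon as any column offset is non-zero.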
b/comm/lcal/src/kernels/coc_dequant_runner.cce index 6e8e8d06..5cc13b2d 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -606,7 +606,7 @@ private: auto dst = gm_out + it.offset(); auto src = gm_accum + it.offset(); //auto src = gm_accum; - auto scale = gm_formatdequant_scale + it.col_offset(); + auto scale = gm_format_dequant_scale + it.col_offset; loop_dequanter.Loop(dst, src, scale, it.n_rows_this_loop, it.n_cols_this_loop, n, n); } } @@ -634,6 +634,7 @@ public: int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); LoopIter it(1, m, n); LoopPerTokenDequanter loop_dequanter(n0); + loop_dequanter.SetForLoop(); for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { for (it.InitColLoop(n0); !it.EndColLoop(); it.NextColLoop()) { __gm__ T * dst_add = gm_out + it.offset(); @@ -773,6 +774,7 @@ private: int32_t loop_num_per_comm; int32_t swizzl_direct; + int32_t swizzl_count; int32_t p_value; int32_t rank_size; -- Gitee From 6841f5625e0e294afa3327e9698fc3f3014d5c2b Mon Sep 17 00:00:00 2001 From: guanguan Date: Fri, 29 Aug 2025 14:21:43 +0800 Subject: [PATCH 367/414] fix --- {src => comm/lcal/src}/kernels/coc_preprocessor.cce | 0 {src => comm/lcal/src}/lcoc.cpp | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {src => comm/lcal/src}/kernels/coc_preprocessor.cce (100%) rename {src => comm/lcal/src}/lcoc.cpp (100%) diff --git a/src/kernels/coc_preprocessor.cce b/comm/lcal/src/kernels/coc_preprocessor.cce similarity index 100% rename from src/kernels/coc_preprocessor.cce rename to comm/lcal/src/kernels/coc_preprocessor.cce diff --git a/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp similarity index 100% rename from src/lcoc.cpp rename to comm/lcal/src/lcoc.cpp -- Gitee From 6852a5d6ac269b2404fb035abe3b4bcb59a4d1d5 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 15:05:30 +0800 Subject: [PATCH 368/414] 6 --- comm/lcal/src/ascendc_kernels/lccl_op.h | 3 ++- comm/lcal/src/lcal_comm.cpp | 6 +++--- comm/lcal/src/lcal_internal.cpp | 2 +- comm/lcal/src/lccl.cpp | 7 +++++++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index cb53b646..82142253 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -198,7 +198,8 @@ extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNELS_ARGS_FU LcalAll2AllTranspose(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ else if ((extraFlag & ExtraFlag::TOPO_910_93) != 0) { \ - if (rankSize <= smallRankSize && len * sizeof(type) > SMALL_DATA_SIZE) { \ + if (rankSize <= smallRankSize && len * sizeof(type) > SMALL_DATA_SIZE && \ + (len * sizeof(type)) % (smallRankSize * smallRankSize * rankSize) == 0) { \ CLASS_OP_LAUNCH(All2AllHierarchySmall, type); \ } else { \ CLASS_OP_LAUNCH(All2AllHierarchy, type); \ diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index ae964964..741a9d40 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -97,7 +97,7 @@ ChipName GetChipName() MKI_LOG(WARN) << "There is no commitment to the supported chip types yet," << " and it is not certain whether the functions will work properly."; } - return curChipName; + return curChipName; } uint32_t GetCoreNum(ChipName chipName) @@ -176,11 +176,11 @@ int LcalComm::InitDumpAddr() } ret = aclrtMemcpy(dumpAddr, dumpWorkspaceSize, memory, dumpWorkspaceSize, ACL_MEMCPY_HOST_TO_DEVICE); + std::free(memory); if (ret != 
ACL_SUCCESS) { MKI_LOG(ERROR) << "aclrtMemcpy err " << __LINE__ << " " << ret; return LCAL_ERROR_INTERNAL; } - std::free(memory); commArgs_.dumpAddr = dumpAddr; return LCAL_SUCCESS; @@ -831,4 +831,4 @@ std::string LcalComm::PrintDFX() return ss.str(); } -} \ No newline at end of file +} // Lcal \ No newline at end of file diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index fd608c5f..695b116b 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -78,7 +78,7 @@ const int* FindNextOpStart(const int opStartMaigc, const int* cclBinEndPtr, cons MKI_LOG(ERROR) << "FindNextOpStart failed! cclBinPtr is nullptr"; return nullptr; } - while (*cclBinPtr != opStartMaigc and cclBinPtr < cclBinEndPtr) { + while (cclBinPtr < cclBinEndPtr and *cclBinPtr != opStartMaigc) { cclBinPtr++; } if (*cclBinPtr == opStartMaigc) { diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index a92d54e6..79e7f7b1 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -225,6 +225,10 @@ uint32_t GetKernelBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize, int localRankSize, uint32_t extraFlag) const { + if (comm_ == nullptr) { + MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; + return 0; + } uint32_t blockNum = GetKernelBlockNum(cclType, rankSize, dataSize, localRankSize, extraFlag); if (comm_->isEnableMix_) { constexpr uint32_t aivNumPerAic = 2; @@ -331,6 +335,9 @@ bool Lccl::CheckBuff(const void *sendBuff, const void *recvBuff) const } else if (recvBuff == nullptr) { MKI_LOG(ERROR) << "Lccl recvBuff is nullptr"; res = false; + } else if (comm_ == nullptr) { + MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; + res = false; } return res; } -- Gitee From bbb344eb87772459d7666ca97662368fe243675c Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 16:01:36 +0800 Subject: [PATCH 369/414] 3 --- .../ascendc_kernels/91093/all2all_hierarchy.h | 32 +++++++------- .../91093/all2all_hierarchy_small.h | 24 +++++------ .../91093/allgather_hierarchy_double_ring.h | 29 +++++++------ .../91093/allreduce_big_data_sio.h | 23 +++++----- .../91093/allreduce_hierarchy_double_ring.h | 34 +++++++-------- .../reduce_scatter_big_data_91093_4step.h | 28 ++++++------- .../reduce_scatter_hierarchy_double_ring.h | 12 +++--- comm/lcal/src/ascendc_kernels/allgather.h | 14 +++---- .../src/ascendc_kernels/allreduce_big_data.h | 30 ++++++------- .../src/ascendc_kernels/allreduce_one_shot.h | 8 ++-- .../src/ascendc_kernels/allreduce_quant.h | 10 ++--- .../src/ascendc_kernels/allreduce_two_shot.h | 28 ++++++------- comm/lcal/src/ascendc_kernels/collectives.h | 42 +++++++++---------- .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 20 ++++----- .../ascendc_kernels/datacopy_gm2gm_delay.h | 30 +++++++------ comm/lcal/src/ascendc_kernels/ipc_queue.h | 8 ++-- comm/lcal/src/ascendc_kernels/op_def.h | 4 +- .../lcal/src/ascendc_kernels/reduce_scatter.h | 4 +- .../src/ascendc_kernels/sync_collectives.h | 11 +++-- comm/lcal/src/lcal_comm.cpp | 1 - comm/lcal/src/lccl.cpp | 2 +- 21 files changed, 192 insertions(+), 202 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h index f01b2b20..8cf0aa9c 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h @@ -31,9 +31,9 @@ class All2AllHierarchy : protected Collectives { static 
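// Editor's note: the InitDumpAddr() reorder above (free the host staging
// buffer before the early error return) closes a leak on the aclrtMemcpy
// failure path. Generic shape of the fix in plain C++ (CopyToDevice is a
// hypothetical stand-in for the device copy):
#include <cstddef>
#include <cstdlib>
#include <cstring>
static int CopyToDevice(void *dst, const void *src, std::size_t bytes)
{
    std::memcpy(dst, src, bytes);  // placeholder for the real device memcpy
    return 0;
}
int StageAndRelease(void *dst, void *host, std::size_t bytes)
{
    const int ret = CopyToDevice(dst, host, bytes);
    std::free(host);  // released on every path: the early return cannot leak
    if (ret != 0) {
        return -1;
    }
    return 0;
}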
const int64_t DIE_CHANGE = 1; public: - FORCE_INLINE_AICORE All2AllHierarchy(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE All2AllHierarchy(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} - FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); this->input = (__gm__ T *) input; @@ -54,7 +54,7 @@ public: } } private: - FORCE_INLINE_AICORE void InitShare() + FORCE_INLINE_AICORE void InitShare() { int64_t queNum = blockNum / STEP_TIMES; if (rankSize <= CORE_NUM_PER_STAGE) { @@ -63,7 +63,7 @@ private: if (len < perQueElemLen) { coreNumPerRank = 1; } - perQueElemLen = IPC_BUFF_MAX_SIZE / queNum / QUEUE_DEPTH / sizeof(T); + perQueElemLen = IPC_BUFF_MAX_SIZE / queNum / QUEUE_DEPTH / sizeof(T); queLen = perQueElemLen * QUEUE_DEPTH; queSize = queLen * sizeof(T); } @@ -72,9 +72,9 @@ private: { coreNumPerRank = 1; if (len < perQueElemLen) { - coreNumPerRank = 1; + coreNumPerRank = 1; } - coreNumPerStage = coreNumPerRank * rankSize < CORE_NUM_PER_STAGE ? + coreNumPerStage = coreNumPerRank * rankSize < CORE_NUM_PER_STAGE ? coreNumPerRank * rankSize : CORE_NUM_PER_STAGE; rankNumPerCore = CeilDiv(rankSize, coreNumPerStage); flagNumPerStage = rankSize; @@ -92,7 +92,7 @@ private: groupCoreIdx[i] = prefix + blockIdx - coreNumPerStage; } } - } + } FORCE_INLINE_AICORE void InitDataSlice() { @@ -101,10 +101,11 @@ private: for (auto i = 0; i < rankNumPerCore; ++i) { if (groupCoreIdx[i] % SIO == rank % SIO) { srcInnerQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + - (groupCoreIdx[i] % coreNumPerStage) * queSize, queLen, perQueElemLen); + (groupCoreIdx[i] % coreNumPerStage) * queSize, queLen, perQueElemLen); } else { SrcSioQue[i].Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + - ((groupCoreIdx[i] + (rank - sioRank)) % coreNumPerStage) * queSize, queLen, perQueElemLen); + ((groupCoreIdx[i] + (rank - sioRank)) % coreNumPerStage) * queSize, + queLen, perQueElemLen); } sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); } @@ -119,7 +120,7 @@ private: pullQue[i].Init(&sync, magic, shareAddrs[pullRank] + IPC_DATA_OFFSET + (rank % coreNumPerStage) * queSize + pullOffset * queSize, queLen, perQueElemLen); - sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); } } } @@ -133,9 +134,9 @@ private: } } - FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) + FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) { - inputGt.SetGlobalBuffer((__gm__ T*)input + groupCoreIdx[idx] * ipcDataNumPreBlock, ipcDataNumPreBlock); + inputGt.SetGlobalBuffer((__gm__ T*)input + groupCoreIdx[idx] * ipcDataNumPreBlock, ipcDataNumPreBlock); copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; if (copyLen > perQueElemLen) { copyLen = perQueElemLen; @@ -144,7 +145,8 @@ private: } if (groupCoreIdx[idx] % SIO == rank % SIO) { if (idx > 0) { - sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), groupCoreIdx[idx - 1] + flagNumPerStage, rank); + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), + groupCoreIdx[idx - 1] + flagNumPerStage, rank); } srcInnerQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); writeGt = srcInnerQue[idx].EnQue(); @@ -154,7 +156,7 @@ private: } } else { if (idx > 0) { - sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), + sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), groupCoreIdx[idx - 1] + flagNumPerStage + (rank - 
sioRank), sioRank); } SrcSioQue[idx].DeQue(sioRank, groupCoreIdx[idx] + (rank - sioRank) + flagNumPerStage); @@ -186,7 +188,7 @@ private: FORCE_INLINE_AICORE void Ipc2Output(int64_t idx, int64_t sliceIdx) { - outputGt.SetGlobalBuffer((__gm__ T*)output + groupCoreIdx[idx] * ipcDataNumPreBlock, + outputGt.SetGlobalBuffer((__gm__ T*)output + groupCoreIdx[idx] * ipcDataNumPreBlock, ipcDataNumPreBlock); cpOffset = rank % SIO == 0 ? rank + groupCoreIdx[idx] % SIO : (rank - DIE_CHANGE) + groupCoreIdx[idx] % SIO; diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h index db6b9b43..46bbccfe 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h @@ -29,9 +29,9 @@ class All2AllHierarchySmall : protected Collectives { constexpr static int64_t SIO = 2; public: - FORCE_INLINE_AICORE All2AllHierarchySmall(int rank, int rankSize, uint32_t extraFlag) + FORCE_INLINE_AICORE All2AllHierarchySmall(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} - FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); this->input = (__gm__ T *) input; @@ -51,11 +51,11 @@ public: } } private: - FORCE_INLINE_AICORE void InitShare() + FORCE_INLINE_AICORE void InitShare() { coreNumPerStage = CORE_NUM_PER_STAGE; singleStage = coreNumPerStage / SIO; - perQueElemLen = IPC_BUFF_MAX_SIZE / SIO / singleStage / QUEUE_DEPTH / sizeof(T); + perQueElemLen = IPC_BUFF_MAX_SIZE / SIO / singleStage / QUEUE_DEPTH / sizeof(T); queLen = perQueElemLen * QUEUE_DEPTH; queSize = queLen * sizeof(T); queBlockSize = IPC_BUFF_MAX_SIZE / SIO; @@ -64,7 +64,7 @@ private: FORCE_INLINE_AICORE void InitCoreGroup() { if (len < perQueElemLen) { - coreNumPerRank = 1; + coreNumPerRank = 1; } loopCount = rankSize / SIO; flagNumPerStage = coreNumPerStage; @@ -75,7 +75,7 @@ private: coreGroup = CONSUMER_CORE; groupCoreIdx = blockIdx - coreNumPerStage; } - } + } FORCE_INLINE_AICORE void InitDataSlice() { @@ -90,10 +90,10 @@ private: srcSioQue1.Init(&sync, magic, shareAddrs[sioRank] + IPC_DATA_OFFSET + ifOffSet + (groupCoreIdx - singleStage) * queSize, queLen, perQueElemLen); } - sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); } } - sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); + sliceNum = CeilDiv(ipcDataNumPreBlock, perQueElemLen); } FORCE_INLINE_AICORE void Producer() @@ -109,7 +109,7 @@ private: } } - FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) + FORCE_INLINE_AICORE void Input2IpcSlice(int64_t idx, int64_t sliceIdx) { copyLen = ipcDataNumPreBlock - perQueElemLen * sliceIdx; if (copyLen > perQueElemLen) { @@ -139,7 +139,7 @@ private: sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), flagIdx + (waitRank / SIO) * coreNumPerStage + flagNumPerStage, sioRank); } - inputGt.SetGlobalBuffer((__gm__ T*)input + sioSrcRank * curRankDataNum + + inputGt.SetGlobalBuffer((__gm__ T*)input + sioSrcRank * curRankDataNum + (groupCoreIdx - singleStage) * ipcDataNumPreBlock, ipcDataNumPreBlock); srcSioQue.DeQue(sioRank, flagIdx + (sioSrcRank / SIO) * coreNumPerStage + flagNumPerStage); writeGt = srcSioQue.EnQue(); @@ -154,7 +154,7 @@ private: for (auto i = 0; i < loopCount; ++i) { destRank = (rank - i * SIO) >= 0 ? 
(rank - i * SIO) : rank + ((loopCount - i) * SIO); if (groupCoreIdx < singleStage) { - detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + + detHccsQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + groupCoreIdx * queSize, queLen, perQueElemLen); } else { detHccsSioQue.Init(&sync, magic, shareAddrs[destRank] + IPC_DATA_OFFSET + queBlockSize + @@ -178,7 +178,7 @@ private: } sync.WaitSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx, destRank); if (groupCoreIdx < singleStage) { - readGt = detHccsQue.ReadFront(); + readGt = detHccsQue.ReadFront(); } else { readGt = detHccsSioQue.ReadFront(); } diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h index 38c364e5..c8b363a8 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - #ifndef LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H - #define LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H +#ifndef LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H +#define LCCL_ALLGATHER_HIERARCHY_DOUBLE_RING_H #include "collectives.h" #include "ipc_queue.h" @@ -66,26 +66,26 @@ public: blockSize = queSize / QUE_DEPTH; queHccsLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset, queSize, blockSize); - queSioLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset + queSize, + queSioLocal.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + queHccsOffset + queSize, queSize, blockSize); rankRingForward = (rank + RING_NUM) % rankSize; queHccsForward.Init(&sync, magic, shareAddrs[rankRingForward] + IPC_DATA_OFFSET + queHccsOffset, queSize, blockSize); - rankSioAdjoint = rank ^ 1; - queSioAdjoint.Init(&sync, magic, + rankSioAdjoint = rank ^ 1; + queSioAdjoint.Init(&sync, magic, shareAddrs[rankSioAdjoint] + IPC_DATA_OFFSET + queHccsOffset + queSize, queSize, blockSize); - + for (int i = 0; i < STAGE_NUM; ++i) { stageEvents[i] = sync.CalEventIdByMulBlockNum(STAGE_EVENT, stageCoreIdx + coreNumPerStep * i); } - + DumpLcclLogInfo(LogId::INIT, Op::COPYONLY); } FORCE_INLINE_AICORE void Process() { - DumpLcclLogInfo(LogId::PROCESS, Op::COPYONLY); - int count = rankSize / RING_NUM * CeilDiv(dataSizePerCore, blockSize); + DumpLcclLogInfo(LogId::PROCESS, Op::COPYONLY); + int count = rankSize / RING_NUM * CeilDiv(dataSizePerCore, blockSize); if (stage == STAGE::HCCS_RING) { ProcessHccsRing(count); } else if (stage == STAGE::HCCS_TO_OUT) { @@ -104,7 +104,7 @@ private: int deQueWaitRanks[dependencyNum] = {(rank + rankSize - RING_NUM) % rankSize, rank, rank}; int deQueWaitEvents[dependencyNum] = { sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), - stageEvents[static_cast(STAGE::HCCS_TO_OUT)], + stageEvents[static_cast(STAGE::HCCS_TO_OUT)], stageEvents[static_cast(STAGE::HCCS_TO_SIO)]}; int64_t remainSize = dataSizePerCore; int64_t dataSize = 0; @@ -129,7 +129,7 @@ private: stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; continue; } - input = queHccsForward.ReadFront(); + input = queHccsForward.ReadFront(); } queHccsLocal.DeQue(deQueWaitRanks, deQueWaitEvents, dependencyNum); output = queHccsLocal.EnQue(); @@ -145,9 +145,8 @@ private: } } ++i; - } + } } - FORCE_INLINE_AICORE void ProcessHccsToOut(const int count) { @@ -179,7 +178,7 @@ private: } ++i; } - } + } FORCE_INLINE_AICORE void ProcessHccsToSio(const 
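// [Editor's note: illustrative sketch, not part of the patch.]
// Models the double-ring neighbour arithmetic visible above: each rank's
// HCCS ring successor is (rank + RING_NUM) % rankSize and its SIO partner is
// rank ^ 1. RING_NUM = 2 is an assumption here (even/odd ranks forming two
// interleaved rings bridged by SIO pairs), as is the 8-rank example.
#include <cstdio>

int main() {
    const int RING_NUM = 2;   // assumed
    const int rankSize = 8;   // assumed example
    for (int rank = 0; rank < rankSize; ++rank) {
        int rankRingForward = (rank + RING_NUM) % rankSize;
        int rankSioAdjoint = rank ^ 1;  // paired ranks differ only in the low bit
        std::printf("rank %d: ring-forward %d, sio-adjoint %d\n",
                    rank, rankRingForward, rankSioAdjoint);
    }
    return 0;
}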
int count) { GlobalTensor input; @@ -228,7 +227,7 @@ private: dataSize = (remainSize >= blockSize) ? blockSize : remainSize; } input = queSioLocal.ReadFront(); - output = outputGm[countRankId / RING_NUM][dataSizePerCore - remainSize]; + output = outputGm[countRankId / RING_NUM][dataSizePerCore - remainSize]; CpGM2GMPingPong(dataSize, input, output, -1); constexpr int32_t halfQueDepth = 2; if (i % (QUE_DEPTH / halfQueDepth) == 0) { diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h index c5e5b48a..1c62f2e5 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h @@ -119,7 +119,7 @@ private: { int64_t atomLoopCount = CeilDiv(pullRankDataNum, curBlockNum); int64_t atomRemain = pullRankDataNum; - int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); int64_t remain = curRankDataNum; int count = 0; int64_t maxLoopCount = (loopCount < atomLoopCount) ? loopCount : atomLoopCount; @@ -127,7 +127,7 @@ private: if (peerRank != rank && rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO && count != atomLoopCount) { sync.WaitInnerFlag(magic, count, rank, rank); sync.WaitInnerFlag(magic, count, peerRank, rank); - + GlobalTensor inputGm = srcQue.ReadFront(); GlobalTensor outputGm = dstQue.EnQue(); @@ -150,7 +150,7 @@ private: sync.SetInnerFlag(magic, count); } remain = remain - curBlockNum; - count = count + 1; + count = count + 1; } } FORCE_INLINE_AICORE void Puller() @@ -176,12 +176,12 @@ private: FORCE_INLINE_AICORE void ProducerInit() { inputQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + perQueNum, curBlockNum); if (blockIdx % RANK_SIZE_TWO == rank % RANK_SIZE_TWO) { sioAtomSrcQue.Init(&sync, magic, shareAddrs[adjRank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + perQueNum, curBlockNum); sioAtomDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + perQueNum, curBlockNum); } } FORCE_INLINE_AICORE void ConsumerInit() @@ -191,22 +191,21 @@ private: dstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * perQueSize, perQueNum, curBlockNum); if (peerRank != rank && rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { - pullSrcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + + pullSrcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + peerRank * perQueSize, perQueNum, curBlockNum); - pullDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + pullDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + peerRank * perQueSize, perQueNum, curBlockNum); } - } FORCE_INLINE_AICORE void PullerInit() { if (rank % RANK_SIZE_TWO == peerRank % RANK_SIZE_TWO) { - pullQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + pullQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, + perQueNum, curBlockNum); } else { pullQue.Init(&sync, magic, shareAddrs[adjRank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + perQueNum, curBlockNum); } } private: diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index 883e8f20..bdb8fd48 100644 --- 
a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -35,8 +35,6 @@ class AllReduceHierarchyDoubleRing : protected Collectives { constexpr static int32_t SIO_GATHER_FLAG = 7 * RING_CORE_NUM; constexpr static int32_t SIO_GATHER_OUTPUT_FLAG = 8 * RING_CORE_NUM; constexpr static int32_t OUTPUT_FLAG = 9 * RING_CORE_NUM; - - constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / SIO_CORE_NUM; constexpr static int32_t OUTPUT_CORE_SCALE = RING_CORE_NUM / OUTPUT_CORE_NUM; @@ -95,7 +93,7 @@ public: } ++ipcQueIdx; } - + for (sioLayerLoop = 0; sioLayerLoop < ringRankSize; ++sioLayerLoop) { if (blockIdx < INPUT_CORE_NUM) { ; @@ -151,7 +149,7 @@ private: int64_t dmaLastLoop = 0; int64_t dmaLastRankLoop = 0; int32_t ipcQueIdx = 0; - int32_t gatherQueIdx = 0; + int32_t gatherQueIdx = 0; int32_t loopCount = 0; int32_t curLoopCnt = 0; int32_t sioLayerLoop = 0; @@ -166,13 +164,13 @@ private: if (blockIdx < INPUT_CORE_NUM) { for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; - inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { for (int32_t blockLoop = 0; blockLoop < SIO_CORE_SCALE; ++blockLoop) { localBlockIdx = (blockIdx - INPUT_CORE_NUM) * SIO_CORE_SCALE + blockLoop; - sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + + sioQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); sioGatherSrc1QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); @@ -180,21 +178,21 @@ private: IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); sioGatherDstQueList[blockLoop].Init(&sync, magic, shareAddrs[sioPeerRankId] + IPC_DATA_OFFSET + - (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize + dmaSizePerCore * localBlockIdx, + (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * SIO_GATHER_QUE_DEPTH, ipcBlockNum); } } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM) { localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); - ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + + ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringGatherSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + - IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, - ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); - ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + IPC_QUE_DEPTH * 
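// [Editor's note: illustrative sketch, not part of the patch.]
// Models the shared-memory layout implied by the queue Init calls above:
// inside each peer's IPC segment the reduce queues sit right after
// IPC_DATA_OFFSET, the ring-gather queues after IPC_QUE_DEPTH blocks, and
// the SIO-gather queues after RING_GATHER_QUE_DEPTH more. All depth and
// size constants below are assumed example numbers, not the real values.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t IPC_DATA_OFFSET = 4096;         // assumed
    const int64_t ipcBlockSize = 64 * 1024;       // assumed
    const int64_t IPC_QUE_DEPTH = 4;              // assumed
    const int64_t RING_GATHER_QUE_DEPTH = 2;      // assumed
    const int64_t reduceBase = IPC_DATA_OFFSET;
    const int64_t ringGatherBase = IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize;
    const int64_t sioGatherBase =
        IPC_DATA_OFFSET + (IPC_QUE_DEPTH + RING_GATHER_QUE_DEPTH) * ipcBlockSize;
    std::printf("reduce @%lld, ring-gather @%lld, sio-gather @%lld\n",
                (long long)reduceBase, (long long)ringGatherBase, (long long)sioGatherBase);
    return 0;
}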
ipcBlockSize + dmaSizePerCore * localBlockIdx, + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); } else { for (int32_t blockLoop = 0; blockLoop < OUTPUT_CORE_SCALE; ++blockLoop) { localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM + RING_CORE_NUM)) * OUTPUT_CORE_SCALE + @@ -204,9 +202,9 @@ private: ipcBlockNum * SIO_GATHER_QUE_DEPTH, ipcBlockNum); outputSrc2QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, - ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); outputSrc3QueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + - dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } } } @@ -233,7 +231,7 @@ private: } BuildCoreDataNum(curLoopCnt, targetRankOffset); - srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + + srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; dstIpcTensor = (*inputQue).EnQue(); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); @@ -267,7 +265,7 @@ private: sync.WaitSyncFlag(magic, newQueIdx, INPUT_FLAG + localBlockIdx, rank); BuildCoreDataNum(newLoopCnt, targetRankOffset); - srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + newLoopCnt * dmaPerLoop + + srcIpcTensor = inputTensor[targetRankOffset * totalBlockDataNum + newLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; dstIpcTensor = (*sioQue).EnQue(); CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, atomOp); @@ -301,7 +299,7 @@ private: { const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - sioLayerLoop)) % ringRankSize; const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + ringLayerId; - + sync.WaitSyncFlag(magic, gatherQueIdx, RING_GATHER_FLAG + localBlockIdx, rank); if (gatherQueIdx >= SIO_GATHER_QUE_DEPTH) { sync.WaitSyncFlag(magic, gatherQueIdx - SIO_GATHER_QUE_DEPTH, SIO_GATHER_OUTPUT_FLAG + localBlockIdx, rank); @@ -326,7 +324,7 @@ private: if (sioLayerLoop == 1) { sync.WaitSyncFlag(magic, consumedQueIdx, SIO_REDUCE_FLAG + localBlockIdx, ringPrevRankId); } else { - sync.WaitSyncFlag(magic, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, + sync.WaitSyncFlag(magic, consumedQueIdx - 1, RING_REDUCE_FLAG + localBlockIdx, ringPrevRankId); } const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 -sioLayerLoop)) % ringRankSize; @@ -413,7 +411,7 @@ private: BuildCoreDataNum(curLoopCnt, targetRingRankOffset); sync.WaitSyncFlag(magic, gatherQueIdx, SIO_GATHER_FLAG + localBlockIdx, rank); srcIpcTensor = sioLayerLoop == 0 ? 
(*outputSrc3Que).ReadFront() : (*outputSrc2Que).ReadFront(); - dstIpcTensor = outputTensor[targetRingRankOffset * totalBlockDataNum + + dstIpcTensor = outputTensor[targetRingRankOffset * totalBlockDataNum + curLoopCnt * dmaPerLoop + localBlockIdx * coreDataNum]; CpGM2GMPingPong(curCoreDataNum * sizeof(T), srcIpcTensor, dstIpcTensor, COPYONLY); sync.SetSyncFlag(magic, gatherQueIdx, OUTPUT_FLAG + localBlockIdx, rank); diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index a82ed1ac..32f35a5e 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -63,7 +63,7 @@ public: stepThreeOriginRankPerCore = CeilDiv(halfRankSize, PER_STEP_BLOCKNUM); stepOneInUseBlockNum = CeilDiv(rankSize, stepOneOriginRankPerCore); stepTwoInUseBlockNum = CeilDiv(halfRankSize, stepTwoOriginRankPerCore); - stepThreeInUseBlockNum = CeilDiv(halfRankSize, stepThreeOriginRankPerCore); + stepThreeInUseBlockNum = CeilDiv(halfRankSize, stepThreeOriginRankPerCore); if ((blockIdx / PER_STEP_BLOCKNUM) == 0) { if ((blockIdx % PER_STEP_BLOCKNUM) == (stepOneInUseBlockNum - 1)) { stepOneRankPerCore = rankSize - (blockIdx % PER_STEP_BLOCKNUM) * stepOneOriginRankPerCore; @@ -78,7 +78,7 @@ public: } } else if ((blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_TWO || (blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_THREE) { if (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) == (stepThreeInUseBlockNum - 1)) { - stepThreeRankPerCore = halfRankSize - ((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / + stepThreeRankPerCore = halfRankSize - ((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) * stepThreeOriginRankPerCore; } else { stepThreeRankPerCore = stepThreeOriginRankPerCore; @@ -92,25 +92,25 @@ public: if ((blockIdx / PER_STEP_BLOCKNUM) == 0) { for (int i = 0; i < stepOneRankPerCore; i++) { ipcRank = blockIdx * stepOneOriginRankPerCore + i; - writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); } } else if ((blockIdx / PER_STEP_BLOCKNUM) == 1) { for (int i = 0; i < stepTwoRankPerCore; i++) { ipcRank = ((blockIdx % PER_STEP_BLOCKNUM) * stepTwoOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); - readIpcQue[i].Init(&sync, magic, shareAddrs[adjPeerRank] + IPC_DATA_OFFSET + ipcRank * + readIpcQue[i].Init(&sync, magic, shareAddrs[adjPeerRank] + IPC_DATA_OFFSET + ipcRank * ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); - writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + ipcRank * ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); } } else if ((blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_TWO || (blockIdx / PER_STEP_BLOCKNUM) == NUM_OF_THREE) { for (int i = 0; i < stepThreeRankPerCore; i++) { stepThreeRank = (((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO) * stepThreeOriginRankPerCore + i) * NUM_OF_TWO + (rank % NUM_OF_TWO); - writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * + writeIpcQue[i].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); - readIpcQue[i].Init(&sync, magic, shareAddrs[stepThreeRank] + IPC_DATA_OFFSET + rank * + 
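// [Editor's note: illustrative sketch, not part of the patch.]
// Models the step-one core/rank split in the 4-step reduce-scatter above:
// each core owns CeilDiv(rankSize, PER_STEP_BLOCKNUM) ranks, the number of
// in-use cores is CeilDiv(rankSize, perCore), and the last in-use core takes
// whatever remains, so the in-use cores cover all ranks exactly once.
#include <cstdio>

static int CeilDiv(int a, int b) { return b == 0 ? 0 : (a + b - 1) / b; }

int main() {
    const int PER_STEP_BLOCKNUM = 8;  // assumed example
    const int rankSize = 12;          // assumed example
    const int perCore = CeilDiv(rankSize, PER_STEP_BLOCKNUM);
    const int inUse = CeilDiv(rankSize, perCore);
    for (int core = 0; core < inUse; ++core) {
        int owned = (core == inUse - 1) ? rankSize - core * perCore : perCore;
        std::printf("core %d owns %d rank(s)\n", core, owned);
    }
    return 0;
}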
readIpcQue[i].Init(&sync, magic, shareAddrs[stepThreeRank] + IPC_DATA_OFFSET + rank * ipcNumOfBlock * sizeof(T), ipcNumOfBlock, ipcBlockNum); } } else if (blockIdx == (NUM_OF_FOUR * PER_STEP_BLOCKNUM)) { @@ -134,7 +134,7 @@ public: if ((stepIndex == NUM_OF_TWO || stepIndex == NUM_OF_THREE) && ((blockIdx - PER_STEP_BLOCKNUM * NUM_OF_TWO) / NUM_OF_TWO * stepThreeOriginRankPerCore) >= (rankSize / NUM_OF_TWO)) { DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); - return; + return; } if (stepIndex == 0) { StepOneProcess(); @@ -154,15 +154,15 @@ public: if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { if ((blockIdx * stepOneOriginRankPerCore + i) == rank) { waitWriteRankArr[i] = rank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; } else { waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / stepThreeOriginRankPerCore) * NUM_OF_TWO; } } else { waitWriteRankArr[i] = adjPeerRank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM + ((blockIdx * stepOneOriginRankPerCore + i) / + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM + ((blockIdx * stepOneOriginRankPerCore + i) / NUM_OF_TWO) / stepTwoOriginRankPerCore; } } @@ -200,7 +200,7 @@ public: waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_FOUR; } HccsAtomicToIpcProcess(waitReadRankArr, waitReadBlockArr, waitWriteRankArr, - waitWriteBlockArr, stepThreeRankPerCore); + waitWriteBlockArr, stepThreeRankPerCore); } __aicore__ inline void StepFourProcess() @@ -228,7 +228,7 @@ public: } } - __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, + __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; @@ -248,7 +248,7 @@ public: } } - __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, + __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h index f5b19285..1c7ac156 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h @@ -31,8 +31,8 @@ class ReduceScatterHierarchyDoubleRing : protected Collectives { constexpr static int32_t INPUT_CORE_SCALE = RING_CORE_NUM / INPUT_CORE_NUM; constexpr static int32_t SIO_CORE_SCALE = RING_CORE_NUM / SIO_CORE_NUM; - constexpr static int64_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); - constexpr static int32_t BREAK_CYCLE = 10; + constexpr static int64_t BLOCK_NUM_ALIGN = BLOCK_SIZE / sizeof(T); + constexpr static int32_t BREAK_CYCLE = 10; public: FORCE_INLINE_AICORE ReduceScatterHierarchyDoubleRing(int rank, int rankSize, uint32_t extraFlag) @@ -62,7 +62,7 @@ public: if (blockIdx < INPUT_CORE_NUM) { for (int32_t blockLoop = 0; blockLoop < INPUT_CORE_SCALE; ++blockLoop) { localBlockIdx = blockIdx * INPUT_CORE_SCALE + blockLoop; - inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + + inputQueList[blockLoop].Init(&sync, magic, shareAddrs[rank] + 
IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } } else if (blockIdx < INPUT_CORE_NUM + SIO_CORE_NUM) { @@ -73,10 +73,10 @@ public: } } else { localBlockIdx = (blockIdx - (INPUT_CORE_NUM + SIO_CORE_NUM)); - ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + + ringSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + - dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); + dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); } inputTensor.SetGlobalBuffer((__gm__ T*) input); outputTensor.SetGlobalBuffer((__gm__ T*) output); @@ -173,7 +173,7 @@ private: FORCE_INLINE_AICORE void SioReduceByCore() { const int32_t targetSioLayerId = (sioLayerId + (ringRankSize - 1 - sioLayerLoop)) % ringRankSize; - const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; + const int32_t targetRankOffset = targetSioLayerId * RING_LAYER_NUM + (ringLayerId + 1) % RING_LAYER_NUM; curCoreDataNum = (localBlockIdx == RING_CORE_NUM - 1) ? lastCoreDataNum : coreDataNum; srcTensor = inputTensor[targetRankOffset * totalBlockDataNum + curLoopCnt * ipcBlockNum + diff --git a/comm/lcal/src/ascendc_kernels/allgather.h b/comm/lcal/src/ascendc_kernels/allgather.h index 72df6582..d015de72 100644 --- a/comm/lcal/src/ascendc_kernels/allgather.h +++ b/comm/lcal/src/ascendc_kernels/allgather.h @@ -25,8 +25,8 @@ class AllGather : public Collectives { public: FORCE_INLINE_AICORE AllGather(int rank, int rankSize, uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} - - FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); globalRank = (reinterpret_cast<__gm__ CommArgs *>(commArgs))->rank; @@ -58,7 +58,7 @@ public: FORCE_INLINE_AICORE void Process() { if (extraFlag & ExtraFlag::RDMA) { - shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank % localRankSize] + baseOffsetSize) + + shareGm.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank % localRankSize] + baseOffsetSize) + len * globalRank + offsetToShare, countToShare); if (countToShare > 0) { CpGM2GMPingPong(countToShare * sizeof(T), inputGm, shareGm, COPYONLY); @@ -91,7 +91,7 @@ public: CpGM2GM(outputGm, shareGm, countToOutput, COPYONLY); } } - } + } private: @@ -110,7 +110,7 @@ private: if (blockDataOffset + blockDataCount > dataLen) { blockDataCount = dataLen - blockDataOffset; } - } + } private: GlobalTensor inputGm; GlobalTensor outputGm; @@ -125,8 +125,8 @@ private: int64_t blockRank; int64_t offsetFromShare;; int64_t offsetToOutput; - int64_t countToOutput; - int globalRank; + int64_t countToOutput; + int globalRank; int globalRankSize; int localRankSize; }; diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 0bb56493..f8ce0276 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -29,7 +29,7 @@ public: Collectives::Init(KERNELS_ARGS_CALL()); DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { - BuildScaleOffset(scale, scaleCount, offset); + BuildScaleOffset(scale, scaleCount, offset); } if (blockIdx >= PING_PONG_SIZE * rankSize) { @@ -43,7 +43,7 @@ public: int globalRankSize = localArgs->rankSize <= 0 
? rankSize : localArgs->rankSize; int localRankSize = localArgs->localRankSize <= 0 ? rankSize : localArgs->localRankSize; int serverNum = globalRankSize / localRankSize; - int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; curBlockSize = ipcBuffMaxSizeAligned / localRankSize / QUEUE_DEPTH; curBlockNum = curBlockSize / sizeof(T); @@ -65,7 +65,7 @@ public: } pullRankDataNum = (rank == rankSize - 1) ? (len - rank * perRankDataNum) : perRankDataNum; - + inputBuffOffsetNum = blockIdx % rankSize * perRankDataNum; inputGt.SetGlobalBuffer((__gm__ U*)input + inputBuffOffsetNum, curRankDataNum); @@ -78,14 +78,14 @@ public: if (blockIdx / perStepBlockNum == 0) { inputQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + inputIpcGtOffsetNum, - perQueNum, curBlockNum); + perQueNum, curBlockNum); } else { srcQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + rank * perQueSize, - perQueNum, curBlockNum); + perQueNum, curBlockNum); dstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + rank * perQueSize, - perQueNum, curBlockNum); + perQueNum, curBlockNum); pullQue.Init(&sync, magic, shareAddrs[peerRank] + IPC_DATA_OFFSET + peerRank * perQueSize, - perQueNum, curBlockNum); + perQueNum, curBlockNum); } DumpLcclLogInfo(LogId::INIT, static_cast(op)); } @@ -106,7 +106,7 @@ public: while (count < loopCount) { int64_t copyNum = (remain < curBlockNum) ? remain : curBlockNum; Collectives::CpGM2GMPingPong(copyNum * sizeof(T), inputGt[count * curBlockNum], - outputGt[count * curBlockNum], COPYONLY); + outputGt[count * curBlockNum], COPYONLY); remain -= curBlockNum; ++count; } @@ -140,7 +140,7 @@ private: if (blockIdx != rank) { GlobalTensor outputGmTmp; outputGmTmp.SetGlobalBuffer((__gm__ U*)outputGm.GetPhyAddr()); - Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], outputGmTmp, + Collectives::CpGM2GMPingPong(copyNum * sizeof(U), inputGt[count * curBlockNum], outputGmTmp, COPYONLY); } else { CpGM2GMWithScale(copyNum, inputGt[count * curBlockNum], outputGm, COPYONLY); @@ -157,14 +157,14 @@ private: { int64_t atomLoopCount = CeilDiv(pullRankDataNum, curBlockNum); int64_t atomRemain = pullRankDataNum; - int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); + int64_t loopCount = CeilDiv(curRankDataNum, curBlockNum); int64_t remain = curRankDataNum; int count = 0; while (count < loopCount || count < atomLoopCount) { if (peerRank != rank && count != atomLoopCount) { sync.WaitInnerFlag(magic, count, rank, rank); sync.WaitInnerFlag(magic, count, peerRank, rank); - + GlobalTensor inputGm = srcQue.ReadFront(); GlobalTensor outputGm = dstQue.EnQue(); @@ -191,7 +191,7 @@ private: sync.SetInnerFlag(magic, count); remain = remain - curBlockNum; - count = count + 1; + count = count + 1; } } @@ -208,7 +208,7 @@ private: } } - FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, + FORCE_INLINE_AICORE void CpGM2GMWithScale(int64_t atomCopyNum, GlobalTensor inputGm, GlobalTensor outputGm, int64_t atomOp) { if (!isEnableScale) { @@ -216,10 +216,10 @@ private: } else if (!isVectorScale) { CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, firstScale, offset); } else { - CpGM2GMPingPong(atomCopyNum * sizeof(T), inputGm, outputGm, atomOp, scaleGt, scaleNum, + CpGM2GMPingPong(atomCopyNum * 
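// [Editor's note: illustrative sketch, not part of the patch.]
// The ipcBuffMaxSizeAligned chain above relies on integer division flooring:
// dividing by (globalRankSize + serverNum - 1), QUEUE_DEPTH, sizeof(T) and
// scaleNum, then multiplying back, rounds the IPC buffer down so every
// rank's queue slot holds a whole multiple of scaleNum elements. All
// constants below are assumed examples; elemSize stands in for sizeof(T).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t IPC_BUFF_MAX_SIZE = 100 * 1024 * 1024;        // assumed
    const int64_t globalRankSize = 8, serverNum = 1;            // assumed
    const int64_t QUEUE_DEPTH = 4, elemSize = 2, scaleNum = 16; // assumed
    int64_t aligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1)
                      / QUEUE_DEPTH / elemSize / scaleNum   // floor to whole chunks
                      * scaleNum * elemSize * QUEUE_DEPTH * globalRankSize;
    std::printf("aligned IPC size: %lld of %lld bytes\n",
                (long long)aligned, (long long)IPC_BUFF_MAX_SIZE);
    return 0;
}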
sizeof(T), inputGm, outputGm, atomOp, scaleGt, scaleNum, offset); } - } + } private: GlobalTensor inputGt; GlobalTensor outputGt; diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h index 259297f1..9532a04f 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h @@ -27,7 +27,7 @@ public: Collectives::Init(KERNELS_ARGS_CALL()); DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { - BuildScaleOffset(scale, scaleCount, offset); + BuildScaleOffset(scale, scaleCount, offset); } atomOp = op; blockNum = blockNum / rankSize * rankSize; @@ -81,7 +81,7 @@ public: if constexpr (!std::is_same_v) { if (!isEnableScale) { Collectives::CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp); - } else if (!isVectorScale){ + } else if (!isVectorScale) { CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp, firstScale, offset); } else { CpGM2GM(dstOutputGlobal, srcIPCGlobal, blockReduceNum, atomOp, scaleGt, scaleNum, offset); @@ -99,7 +99,7 @@ public: if constexpr (!std::is_same_v) { if (!isEnableScale) { Collectives::CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY); - } else if (!isVectorScale){ + } else if (!isVectorScale) { CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY, firstScale, offset); } else { CpGM2GM(copyOutputGlobal, srcInputGlobal, blockDataNum, COPYONLY, scaleGt, scaleNum, offset); @@ -131,7 +131,7 @@ protected: bool isVectorScale = false; private: - FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) { if (scale != nullptr && offset != nullptr) { this->offset =* reinterpret_cast<__gm__ T*>(offset); diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index d4ef1986..57804383 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -44,11 +44,11 @@ public: constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; constexpr int32_t ubAlignNum = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE; constexpr int32_t inputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(U); - constexpr int32_t outputUbBlockSize = std::is_same_v ? ubBlockSize : ubAlignNum * sizeof(T); + constexpr int32_t outputUbBlockSize = std::is_same_v ? 
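// [Editor's note: illustrative sketch, not part of the patch.]
// Models the UB split in the quant path above: one ping-pong block is shared
// between a U-typed input stage and a T-typed output stage, so the element
// budget is ubBlockSize / (sizeof(T) + sizeof(U)), rounded down to
// ALIGN_SIZE. T = int16_t and U = int8_t stand in for the device half/int8
// types, and both size constants are assumed examples.
#include <cstdint>
#include <cstdio>

int main() {
    using T = int16_t;                      // stand-in for the fp16 output type
    using U = int8_t;                       // stand-in for the int8 input type
    const int32_t ubBlockSize = 96 * 1024;  // assumed
    const int32_t ALIGN_SIZE = 32;          // assumed
    const int32_t ubAlignNum = ubBlockSize / (sizeof(T) + sizeof(U)) / ALIGN_SIZE * ALIGN_SIZE;
    std::printf("elems %d, inputUB %zu B, outputUB %zu B\n",
                ubAlignNum, ubAlignNum * sizeof(U), ubAlignNum * sizeof(T));
    return 0;
}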
ubBlockSize : ubAlignNum * sizeof(T); __gm__ U *input = const_cast<__gm__ U *>(inputGT.GetPhyAddr()); __gm__ T *output = const_cast<__gm__ T *>(outputGT.GetPhyAddr()); __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET), (__ubuf__ U*)(UB_MID_OFFSET)}; - __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)), + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(inputUB[0] + inputUbBlockSize / sizeof(U)), (__ubuf__ T*)(inputUB[1] + inputUbBlockSize / sizeof(U))}; __ubuf__ T* targetOutputUB = nullptr; int inputOffsetNum = 0; @@ -115,9 +115,9 @@ protected: const int64_t batchDataNum = (scaleCount + ubAlignNum - 1) / ubAlignNum; __ubuf__ T* scaleUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET), (__ubuf__ T*)(UB_MID_OFFSET)}; - __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), + __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), (__ubuf__ U*)(UB_MID_OFFSET + ubAlignNum * sizeof(T))}; - __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), (__ubuf__ T*)(UB_MID_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U)))}; __ubuf__ T* targetOutputUB = nullptr; int64_t i = 0; @@ -173,7 +173,7 @@ protected: __ubuf__ T* scaleUB = (__ubuf__ T*)(UB_HEAD_OFFSET); __ubuf__ U* inputUB[2] = {(__ubuf__ U*)(UB_HEAD_OFFSET + ubAlignNum * sizeof(T)), (__ubuf__ U*)(ubMidOffset)}; - __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), + __ubuf__ T* outputUB[2] = {(__ubuf__ T*)(UB_HEAD_OFFSET + ubAlignNum * (sizeof(T) + sizeof(U))), (__ubuf__ T*)(ubMidOffset + ubAlignNum * sizeof(U))}; __ubuf__ T* targetOutputUB = nullptr; int64_t processedNum = 0; diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h index f4d36d25..3ef03960 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h @@ -27,7 +27,7 @@ public: Collectives::Init(KERNELS_ARGS_CALL()); DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { - BuildScaleOffset(scale, scaleCount, offset); + BuildScaleOffset(scale, scaleCount, offset); } if (blockIdx >= rankSize) { @@ -38,11 +38,11 @@ public: blockNum = rankSize; __gm__ CommArgs *localArgs = reinterpret_cast<__gm__ CommArgs *>(commArgs); - + int localRankSize = localArgs->localRankSize <= 0 ? rankSize : localArgs->localRankSize; int globalRankSize = localArgs->rankSize <= 0 ? 
rankSize : localArgs->rankSize; int serverNum = globalRankSize / localRankSize; - int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / + int64_t ipcBuffMaxSizeAligned = IPC_BUFF_MAX_SIZE / (globalRankSize + serverNum - 1) / QUEUE_DEPTH / sizeof(T) /scaleNum * scaleNum * QUEUE_DEPTH * sizeof(T) * globalRankSize; ipcDataPerParagraphSize = ipcBuffMaxSizeAligned / localRankSize; int64_t ipcDataPerParagraphNum = ipcDataPerParagraphSize / sizeof(T); @@ -61,9 +61,9 @@ public: if (coreSegmentedIdx == corePerRank - 1) { dataNumPreBlock = pullRankDataNum - coreSegmentedIdx * pullBlockDataNum; } - buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + + buffOffsetNum = peerRank * perRankDataNum + coreSegmentedIdx * pullBlockDataNum + ipcDataPerParagraphNum * peerRank; - + curBlockDataNum = GetDataCount(curRankDataNum, corePerRank); ipcDataNumPreBlock = curBlockDataNum; ipcbuffOffsetNum = rank * perRankDataNum + coreSegmentedIdx * curBlockDataNum + ipcDataPerParagraphNum * rank; @@ -72,12 +72,12 @@ public: inputIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + buffOffsetNum, dataNumPreBlock); srcIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[peerRank] + IPC_DATA_OFFSET) + ipcbuffOffsetNum, ipcDataNumPreBlock); - processIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + ipcbuffOffsetNum, + processIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[rank] + IPC_DATA_OFFSET) + ipcbuffOffsetNum, ipcDataNumPreBlock); - - processedIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[peerRank] + IPC_DATA_OFFSET) + buffOffsetNum, - dataNumPreBlock); - outputGt.SetGlobalBuffer((__gm__ T*)output + buffOffsetNum - ipcDataPerParagraphNum * peerRank, + + processedIpcGt.SetGlobalBuffer((__gm__ T*)(shareAddrs[peerRank] + IPC_DATA_OFFSET) + buffOffsetNum, + dataNumPreBlock); + outputGt.SetGlobalBuffer((__gm__ T*)output + buffOffsetNum - ipcDataPerParagraphNum * peerRank, dataNumPreBlock); DumpLcclLogInfo(LogId::INIT, static_cast(op)); } @@ -95,7 +95,7 @@ public: if (peerRank == rank) { if (!isEnableScale) { Collectives::CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY); - } else if (!isVectorScale){ + } else if (!isVectorScale) { CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY, firstScale, offset); } else { CpGM2GM(inputIpcGt, inputGt, dataNumPreBlock, COPYONLY, scaleGt, scaleNum, offset); @@ -153,8 +153,8 @@ private: int64_t curBlockDataNum; int64_t peerRank; int64_t pullRankDataNum; - int64_t dataNumPreBlock; - int64_t buffOffsetNum; + int64_t dataNumPreBlock; + int64_t buffOffsetNum; int64_t ipcDataNumPreBlock; int64_t ipcbuffOffsetNum; @@ -164,7 +164,7 @@ private: T offset = 0; bool isEnableScale = false; bool isVectorScale = false; - FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) + FORCE_INLINE_AICORE void BuildScaleOffset(GM_ADDR scale, int64_t scaleCount, GM_ADDR offset) { if (scale != nullptr && offset != nullptr) { scaleGt.SetGlobalBuffer((__gm__ T*)scale); diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/comm/lcal/src/ascendc_kernels/collectives.h index 036c5e3b..9743a2dc 100644 --- a/comm/lcal/src/ascendc_kernels/collectives.h +++ b/comm/lcal/src/ascendc_kernels/collectives.h @@ -42,9 +42,9 @@ class Collectives { constexpr static int32_t UB_HEAD_OFFSET = 96; constexpr static int32_t UB_MID_OFFSET = UB_HEAD_OFFSET + UB_SINGLE_PING_PONG_ADD_SIZE_MAX + ALIGN_SIZE; public: - FORCE_INLINE_AICORE Collectives(int rank, int rankSize, uint32_t extraFlag) : 
rank(rank), rankSize(rankSize), + FORCE_INLINE_AICORE Collectives(int rank, int rankSize, uint32_t extraFlag) : rank(rank), rankSize(rankSize), extraFlag(extraFlag) {} - + FORCE_INLINE_AICORE ~Collectives() { const int64_t notRunning = 0xdead; @@ -58,7 +58,7 @@ public: peerMemsAddrGm.SetGlobalBuffer(&(reinterpret_cast<__gm__ CommArgs *>(commArgs))->peerMems[0], LCAL_MAX_RANK_SIZE); for (int i = 0; i < rankSize; ++i) { - shareAddrs[i] = peerMemsAddrGm.GetValue(i) + + shareAddrs[i] = peerMemsAddrGm.GetValue(i) + (magic % PING_PONG_SIZE) * (IPC_BUFF_MAX_SIZE + IPC_DATA_OFFSET); } dfx.SetGlobalBuffer((reinterpret_cast<__gm__ CommArgs *>(commArgs))->dfx, @@ -75,7 +75,7 @@ public: blockIdx = GetBlockIdx(); blockNum = GetBlockNum() * LCAL_BLOCK_NUM_MULTI; - + sync.Init(rank, rankSize, shareAddrs); dfx.SetValue(MAGIC, magic); dfx.SetValue(LEN, len); @@ -84,7 +84,7 @@ public: } template - FORCE_INLINE_AICORE void DataCopyWrapPingPong(const GlobalTensor& inputGT, const GlobalTensor& outputGT, + FORCE_INLINE_AICORE void DataCopyWrapPingPong(const GlobalTensor& inputGT, const GlobalTensor& outputGT, int64_t dataSizeRemain, int op, TBuf tbuf) { if (dataSizeRemain <= 0) { @@ -141,7 +141,7 @@ public: template FORCE_INLINE_AICORE void CpGM2GMDelay(GlobalTensor& outputGT, GlobalTensor (&inputGT)[8], - GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& outScaleGT, + GlobalTensor (&inputScaleGT)[8], const uint32_t calCount, int rankCount, GlobalTensor& outScaleGT, TBuf tbuf) { DataCopyGM2GMDelay cpKernel; @@ -152,7 +152,7 @@ public: template FORCE_INLINE_AICORE T1 CeilDiv(T1 a, T2 b) { - if (b == 0) { + if (b == 0) { return 0; } return (a + b - 1) / b; @@ -164,14 +164,14 @@ public: if (curDealSize > MAX_VADD_SIZE) { vadd(ubuf0, ubuf1, ubuf0, VADD_MAX_REPEAT, 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); - vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), + vadd((__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), (__ubuf__ T*)((__ubuf__ int8_t*)ubuf1 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), (__ubuf__ T*)((__ubuf__ int8_t*)ubuf0 + VADD_MAX_REPEAT * VADD_UNIT_BYTE), CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), 1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); } else { vadd(ubuf0, ubuf1, ubuf0, CeilDiv(curDealSize, VADD_UNIT_BYTE), 1, 1, 1, - VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); + VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO); } } @@ -188,7 +188,6 @@ public: } if (alreadyDealSize != 0) { AscendC::WaitFlag(EVENT_ID0); - } CpGM2UB(localUB[0], srcGmMem + alreadyDealNum, curDealSize); @@ -215,7 +214,7 @@ public: if ((i + 1 == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { continue; } - AscendC::SetFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID1); } AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -241,7 +240,7 @@ public: } template - FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, + FORCE_INLINE_AICORE void CpGM2GMPingPong(int64_t dataSizeRemain, const GlobalTensor& inputGT, const GlobalTensor& outputGT, int op) { constexpr int32_t ubBlockSize = UB_SINGLE_PING_PONG_ADD_SIZE_MAX; @@ -275,12 +274,12 @@ public: if constexpr (!std::is_same_v) { SetWaitEvent(eventId); CastImpl((i & 1) ? outputUB[0] : outputUB[1], (i & 1) ? 
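// [Editor's note: illustrative host-side sketch, not part of the patch.]
// Models the ping-pong pattern in CpGM2GMPingPong above: iteration i stages
// data through buffer (i & 1), so while one buffer drains to GM the other can
// be refilled. The real kernel overlaps these with MTE2/MTE3 event flags;
// this model just copies serially to show the buffer alternation and tail
// handling.
#include <cstring>
#include <cstdio>
#include <vector>

int main() {
    const size_t total = 10, ubElems = 3;   // assumed example sizes
    std::vector<int> src(total), dst(total);
    std::vector<int> ub[2] = {std::vector<int>(ubElems), std::vector<int>(ubElems)};
    for (size_t k = 0; k < total; ++k) src[k] = (int)k;
    size_t done = 0;
    for (int i = 0; done < total; ++i) {
        size_t n = (total - done < ubElems) ? total - done : ubElems;
        int *buf = ub[i & 1].data();        // alternate like (i & 1) ? UB[0] : UB[1]
        std::memcpy(buf, src.data() + done, n * sizeof(int));   // "MTE2": GM -> UB
        std::memcpy(dst.data() + done, buf, n * sizeof(int));   // "MTE3": UB -> GM
        done += n;
    }
    std::printf("copied %zu elems, dst[9]=%d\n", done, dst[9]);
    return 0;
}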
inputUB[0] : inputUB[1], RoundMode::CAST_NONE, - size / sizeof(T)); - SetWaitEvent(eventId); + size / sizeof(T)); + SetWaitEvent(eventId); } AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - CpUB2GM(output + outputOffsetNum, (i & 1) ? outputUB[0] : outputUB[1], size); + CpUB2GM(output + outputOffsetNum, (i & 1) ? outputUB[0] : outputUB[1], size); AscendC::SetFlag(eventId); dataSizeRemain -= size; inputOffsetNum += (size / sizeof(T)); @@ -301,7 +300,7 @@ public: if (curDealSize > MAX_VADD_SIZE) { Add(ubuf0, ubuf1, ubuf0, MASK_PLACEHOLDER, VADD_MAX_REPEAT, {1, 1, 1, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO, VADD_UNIT_TO_BLOCK_UNIT_RATIO}); - + Add(ubuf0[MAX_VADD_SIZE / sizeof(T)], ubuf1[MAX_VADD_SIZE / sizeof(T)], ubuf0[MAX_VADD_SIZE / sizeof(T)], MASK_PLACEHOLDER, CeilDiv((curDealSize - MAX_VADD_SIZE), VADD_UNIT_BYTE), @@ -325,7 +324,7 @@ public: localUB[1] = tbuf.GetWithOffset(95 * 1024, 95 * 1024); AscendC::PipeBarrier(); - LoopVaddProcess(localUB, remainNum * sizeof(T), targetRankArr, targetRankArrValidSize, + LoopVaddProcess(localUB, remainNum * sizeof(T), targetRankArr, targetRankArrValidSize, srcIpcOffsetNum, srcGt, dstGt, 0); AscendC::PipeBarrier(); } @@ -342,7 +341,6 @@ public: } if (alreadyDealSize != 0) { AscendC::WaitFlag(EVENT_ID0); - } DataCopyWrap(localUB[0], srcGt[alreadyDealNum], curDealSize); @@ -370,7 +368,7 @@ public: if ((i + 1 == targetRankArrValidSize - 1) && (targetRankArr[i + 1] == rank)) { continue; } - AscendC::SetFlag(EVENT_ID1); + AscendC::SetFlag(EVENT_ID1); } AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -415,7 +413,7 @@ protected: GlobalTensor dfx; SyncCollectives sync; GM_ADDR dumpAddr_ = nullptr; - + template FORCE_INLINE_AICORE void SetAscendCAtomic(int op) { @@ -436,7 +434,7 @@ protected: ; } } - + template FORCE_INLINE_AICORE void SetAtomic(int op) { @@ -446,7 +444,7 @@ protected: SetAtomicOpType(op); #endif } - PipeBarrier(); + PipeBarrier(); } FORCE_INLINE_AICORE void UnsetAtomic(int op) diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index c09cce2a..b6b91715 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -11,7 +11,7 @@ #ifndef LCCL_DATACOPY_GM2GM_H #define LCCL_DATACOPY_GM2GM_H #include -#include "comm_args.h" +#include "comm_args.h" using namespace AscendC; using namespace Lcal; @@ -27,7 +27,7 @@ FORCE_INLINE_AICORE void SetAtomicOpType(int op) case ADD: AscendC::SetAtomicAdd(); break; - + case MUL: break; case MAX: @@ -73,7 +73,7 @@ FORCE_INLINE_AICORE void CopyUB2UB(__ubuf__ T *dst, __ubuf__ T *src, const uint3 { LocalTensor srcTensor; LocalTensor dstTensor; - TBuffAddr srcAddr, dstAddr; + TBuffAddr srcAddr, dstAddr; srcAddr.bufferAddr = reinterpret_cast(src); dstAddr.bufferAddr = reinterpret_cast(dst); srcTensor.SetAddr(srcAddr); @@ -143,7 +143,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } - CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, + CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, OUTPUT_BLOCK_SIZE); AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); @@ -162,7 +162,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); } - CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, + CpUB2GM((__gm__ T*)outputGm + i * OUTPUT_BLOCK_SIZE / sizeof(T), (__ubuf__ T*)outputUB, dataSizeRemain); PipeBarrier(); } @@ -223,7 
+223,7 @@ private: if (op != -1) { #ifdef __DAV_C220_VEC__ SetAtomicOpType(op); -#endif +#endif } PipeBarrier(); } @@ -269,7 +269,7 @@ private: AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); i += 1; - dataSizeRemain -= outputBlockSize; + dataSizeRemain -= outputBlockSize; } UnsetAtomic(op); } @@ -314,7 +314,7 @@ private: AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); i += 1; - dataSizeRemain -= curDataNum * sizeof(T); + dataSizeRemain -= curDataNum * sizeof(T); processedNum += curDataNum; } UnsetAtomic(op); @@ -328,6 +328,4 @@ private: const __gm__ T* outputGm = nullptr; int op; }; -#endif // LCCL_DATACOPY_GM2GM_H - - +#endif // LCCL_DATACOPY_GM2GM_H \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h index a0419016..46a94bd3 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h @@ -45,17 +45,17 @@ public: inTensor[1] = tbuf.GetWithOffset(BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + IN_BLOCKSIZE * THREE_NUM); singleScaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); singleScaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + - IN_BLOCKSIZE * FOUR_NUM); + IN_BLOCKSIZE * FOUR_NUM); singleScaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE); singleScaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * HALF_NUM + - IN_BLOCKSIZE * FOUR_NUM); + IN_BLOCKSIZE * FOUR_NUM); scaleUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); scaleUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * FOUR_NUM); scaleUUBTensor[0] = tbuf.GetWithOffset(SCALE_NUM, IN_BLOCKSIZE + SCALE_SIZE); scaleUUBTensor[1] = tbuf.GetWithOffset(SCALE_NUM, WORK_OFFSET + SCALE_SIZE * THREE_NUM + IN_BLOCKSIZE * FOUR_NUM); - workUBTensor[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM); + workUBTensor[0] = tbuf.GetWithOffset(WORK_BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM); workUBTensor[1] = tbuf.GetWithOffset(WORK_BLOCK_NUM, WORK_OFFSET + SCALE_SIZE * FOUR_NUM + IN_BLOCKSIZE * FOUR_NUM); outputUBTensor[0] = tbuf.GetWithOffset(BLOCK_NUM, IN_BLOCKSIZE + SCALE_SIZE * HALF_NUM + WORK_OFFSET); @@ -67,7 +67,7 @@ public: this->rankId = rankId; } - FORCE_INLINE_AICORE void PreProcess() + FORCE_INLINE_AICORE void PreProcess() { for (int index = 0; index < rankCount; index++) { DataCopyWrap(scaleUUBTensor[0][index * SCALE_SIZE / sizeof(U)], inputScaleGt[index], SCALE_SIZE); @@ -85,14 +85,13 @@ public: } Div(scaleUBTensor[1], outputUBTensor[0], scaleUBTensor[1], rankCount); AscendC::PipeBarrier(); - ReduceMin(singleScaleUBTensor[0], scaleUBTensor[0], + ReduceMin(singleScaleUBTensor[0], scaleUBTensor[0], workUBTensor[1][WORK_BLOCK_NUM / HALF_NUM], rankCount, false); pipe_barrier(PIPE_ALL); DataCopyWrap(outScaleGt, singleScaleUUBTensor[0], sizeof(T)); AscendC::PipeBarrier(); } - FORCE_INLINE_AICORE void LoopUncastAndMul(int idx, int index, event_t eventId) { PipeBarrier(); @@ -104,7 +103,7 @@ public: PipeBarrier(); perRankNum = perRankNumRemain >= WORK_BLOCK_NUM ? WORK_BLOCK_NUM : perRankNumRemain; PipeBarrier(); - + perRankNumRemain -= perRankNum; PipeBarrier(); AscendC::SetFlag(eventId); @@ -114,7 +113,7 @@ public: WORK_BLOCK_NUM] : inTensor[1][j * WORK_BLOCK_NUM], RoundMode::CAST_NONE, perRankNum); PipeBarrier(); if (index == 0) { - Muls((idx & 1) ? 
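// [Editor's note: illustrative sketch, not part of the patch.]
// Models the per-rank accumulation in LoopUncastAndMul above: the first rank
// initialises the output with Muls (out = s0 * x0) and every later rank folds
// in with Axpy (out += sr * xr), so after dequantisation the output is
// sum over r of (sr * xr). Inputs and scales below are assumed examples.
#include <cstdio>

int main() {
    const int rankCount = 3, n = 4;                    // assumed example
    float in[3][4] = {{1, 2, 3, 4}, {4, 3, 2, 1}, {1, 1, 1, 1}};
    float scale[3] = {0.5f, 0.25f, 1.0f};              // assumed per-rank scales
    float out[4];
    for (int r = 0; r < rankCount; ++r) {
        for (int j = 0; j < n; ++j) {
            if (r == 0) {
                out[j] = scale[r] * in[r][j];          // Muls: initialise
            } else {
                out[j] += scale[r] * in[r][j];         // Axpy: accumulate
            }
        }
    }
    for (int j = 0; j < n; ++j) std::printf("%g ", out[j]);
    std::printf("\n");
    return 0;
}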
outputUBTensor[0][j * WORK_BLOCK_NUM] : outputUBTensor[1][j * + Muls((idx & 1) ? outputUBTensor[0][j * WORK_BLOCK_NUM] : outputUBTensor[1][j * WORK_BLOCK_NUM], (idx & 1) ? workUBTensor[0] : workUBTensor[1], scalarValue, perRankNum); } else { Axpy((idx & 1) ? outputUBTensor[0][j * WORK_BLOCK_NUM] : outputUBTensor[1][j * @@ -123,8 +122,8 @@ public: PipeBarrier(); } } - - FORCE_INLINE_AICORE void Mte3Process(int idx, int index, int calCount, event_t eventId) + + FORCE_INLINE_AICORE void Mte3Process(int idx, int index, int calCount, event_t eventId) { if (index == (rankCount - 1)) { if constexpr (std::is_same_v) { @@ -132,7 +131,7 @@ public: AscendC::WaitFlag(eventId); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? + DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? outputUBTensor[0] : outputUBTensor[1], calCount * sizeof(V)); } if constexpr (std::is_same_v) { @@ -142,7 +141,7 @@ public: AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); PipeBarrier(); - Muls((idx & 1) ? outputUBTensor[0] : outputUBTensor[1], (idx & 1) ? + Muls((idx & 1) ? outputUBTensor[0] : outputUBTensor[1], (idx & 1) ? outputUBTensor[0] : outputUBTensor[1], scaleValue, calCount); PipeBarrier(); Cast((idx & 1) ? inTensor[0] : inTensor[1], (idx & 1) ? @@ -152,10 +151,10 @@ public: AscendC::WaitFlag(eventId); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); - DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? - inTensor[0] : inTensor[1], calCount * sizeof(V)); + DataCopyWrap(outputGt[idx * BLOCK_NUM], (idx & 1) ? + inTensor[0] : inTensor[1], calCount * sizeof(V)); } - } + } } FORCE_INLINE_AICORE int GetSize(int idx, int numOfPiece) @@ -208,7 +207,6 @@ public: } } - AscendC::WaitFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID0); diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/comm/lcal/src/ascendc_kernels/ipc_queue.h index 068232d1..89967c2f 100644 --- a/comm/lcal/src/ascendc_kernels/ipc_queue.h +++ b/comm/lcal/src/ascendc_kernels/ipc_queue.h @@ -16,9 +16,9 @@ template class IpcQueue { public: FORCE_INLINE_AICORE IpcQueue() {} - + FORCE_INLINE_AICORE void Init(SyncCollectives *sync, int64_t magic, GM_ADDR workSpace, uint64_t bufferNum, - uint64_t blockNum) + uint64_t blockNum) { this->sync = sync; this->magic = magic; @@ -38,7 +38,7 @@ public: } return false; } - + FORCE_INLINE_AICORE GlobalTensor EnQue() { uint64_t rearOld = rear; @@ -120,4 +120,4 @@ private: SyncCollectives *sync; int blockIdx; }; -#endif // LCCL_IPC_QUEUE_H +#endif // LCCL_IPC_QUEUE_H diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h index 9770c5e0..c4f323b8 100644 --- a/comm/lcal/src/ascendc_kernels/op_def.h +++ b/comm/lcal/src/ascendc_kernels/op_def.h @@ -47,8 +47,8 @@ do { \ DataCopyExtParams dataCopyParams(1, sizeof(int32_t), 0, 0, 0); \ DataCopyPadExtParams padParams; \ DataCopyPad(localSet, magicGt[rankSize - 1], dataCopyParams, padParams); \ - AscendC::SetFlag(EVENT_ID0); \ - AscendC::WaitFlag(EVENT_ID0); \ + AscendC::SetFlag(EVENT_ID0); \ + AscendC::WaitFlag(EVENT_ID0); \ magic = static_cast(localSet.GetValue(0)); \ PipeBarrier(); \ constexpr int32_t aivNumPerAic = 2; \ diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/comm/lcal/src/ascendc_kernels/reduce_scatter.h index 309ffc44..82405c41 100644 --- a/comm/lcal/src/ascendc_kernels/reduce_scatter.h +++ b/comm/lcal/src/ascendc_kernels/reduce_scatter.h @@ -21,7 +21,7 @@ public: FORCE_INLINE_AICORE ReduceScatter(int rank, int rankSize, 
uint32_t extraFlag) : Collectives(rank, rankSize, extraFlag) {} - FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) + FORCE_INLINE_AICORE void Init(KERNELS_ARGS_FUN()) { Collectives::Init(KERNELS_ARGS_CALL()); DumpLcclLogInfo(LogId::INIT, static_cast(op)); @@ -70,7 +70,7 @@ public: if ((blockIdx >= rank * corePerRank) && (blockIdx < (rank * corePerRank + corePerRank))) { CpGM2GM(dstOutputGlobal, srcInputGlobal, blockDataNum, -1); } - } + } } protected: diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/comm/lcal/src/ascendc_kernels/sync_collectives.h index ac8a1558..89f5ac66 100644 --- a/comm/lcal/src/ascendc_kernels/sync_collectives.h +++ b/comm/lcal/src/ascendc_kernels/sync_collectives.h @@ -56,7 +56,7 @@ public: int64_t v = MergeMagicWithValue(magic, value); SetFlag((__gm__ int64_t*)(shareAddrs[rank]) + eventID * FLAG_UNIT_INT_NUM, v); } - + __aicore__ inline int32_t CalEventIdByMulBlockNum(int32_t blockMultiplier, int32_t targetCoreId) { return (blockMultiplier * blockNum) + targetCoreId; @@ -124,7 +124,7 @@ public: __gm__ int64_t *flagAddr; flagAddr = GetOuterFlagAddr(rank, 0); WaitOneRankPartFlag(flagAddr, blockNum, value); - } + } __aicore__ inline void WaitAllRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t startBlock, int64_t flagNum) { int64_t value = MergeMagicWithValue(magic, eventID); @@ -200,13 +200,12 @@ public: return res; } - __aicore__ inline void WaitOneRankPartOuterFlag(int32_t magic, int32_t eventID, int64_t waitRank, int64_t startBlock, int64_t flagNum) { int64_t value = MergeMagicWithValue(magic, eventID); __gm__ int64_t *flagAddr; - flagAddr = GetOuterFlagAddr(waitRank, startBlock); + flagAddr = GetOuterFlagAddr(waitRank, startBlock); WaitOneRankPartFlag(flagAddr, flagNum, value); } @@ -228,12 +227,12 @@ private: __aicore__ inline __gm__ int64_t* GetInnerFlagAddr(int64_t flagRank, int64_t flagBlock) { - return (__gm__ int64_t*)(shareAddrs[flagRank]) + flagBlock * FLAG_UNIT_INT_NUM; + return (__gm__ int64_t*)(shareAddrs[flagRank]) + flagBlock * FLAG_UNIT_INT_NUM; } __aicore__ inline __gm__ int64_t* GetOuterFlagAddr(int64_t flagRank, int64_t flagBlock) { - return (__gm__ int64_t*)(shareAddrs[flagRank]) + segmentCount + flagBlock * FLAG_UNIT_INT_NUM; + return (__gm__ int64_t*)(shareAddrs[flagRank]) + segmentCount + flagBlock * FLAG_UNIT_INT_NUM; } __aicore__ inline void WaitOneRankPartFlag(__gm__ int64_t* waitAddr, int64_t flagNum, int64_t checkValue, diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 741a9d40..14a8608e 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -284,7 +284,6 @@ void LcalComm::CloseIpcMem() MKI_LOG(WARN) << "Close ipc[" << i << "] memory failed! 
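// [Editor's note: illustrative sketch, not part of the patch.]
// Models one plausible MergeMagicWithValue/EVENT_ID_MASK scheme consistent
// with the sync code above: a per-launch magic in the high half of the int64
// flag word and the event value in the low half, so stale flags left by an
// earlier launch never compare equal. The 32/32 split and mask value are
// assumptions for illustration, not the verified layout.
#include <cstdint>
#include <cstdio>

static int64_t MergeMagicWithValue(int32_t magic, int32_t value) {
    return ((int64_t)magic << 32) | (uint32_t)value;   // assumed packing
}

int main() {
    const int64_t EVENT_ID_MASK = 0xFFFFFFFFLL;        // assumed
    int64_t flag = MergeMagicWithValue(7, 42);
    std::printf("magic %lld, value %lld\n",
                (long long)(flag >> 32), (long long)(flag & EVENT_ID_MASK));
    return 0;
}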
ret: " << ret; } peerMem_[i] = nullptr; - } } diff --git a/comm/lcal/src/lccl.cpp b/comm/lcal/src/lccl.cpp index 79e7f7b1..694bfc74 100644 --- a/comm/lcal/src/lccl.cpp +++ b/comm/lcal/src/lccl.cpp @@ -307,7 +307,7 @@ int Lccl::AllReduce(void *sendBuff, void *recvBuff, int64_t count, HcclDataType uint32_t blockDim = GetBlockNum(LcalType::ALL_REDUCE, rankSize_, Count2Size(count, dataType), comm_->localRankSize_, comm_->commArgs_.extraFlag); AscendCCLKernelArgs args = { sendBuff, recvBuff, comm_->commArgsPtr_, count, comm_->magic_, op, 0, 0, scale, - scaleCount }; + scaleCount}; comm_->magic_++; return LoadMTE(LcalType::ALL_REDUCE, args, blockDim, dataType, stream); } -- Gitee From ea13cf54acbe8a0e7038dcb86143581db6994bad Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 16:03:18 +0800 Subject: [PATCH 370/414] 0 --- .../ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 32f35a5e..efc6cdcf 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -154,7 +154,7 @@ public: if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { if ((blockIdx * stepOneOriginRankPerCore + i) == rank) { waitWriteRankArr[i] = rank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; } else { waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / -- Gitee From f8f3d5ff8cf159efbf9af07a9a7d1be740663e84 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 16:17:05 +0800 Subject: [PATCH 371/414] 9 --- comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h | 6 +++--- .../src/ascendc_kernels/91093/all2all_hierarchy_small.h | 4 ++-- .../91093/allgather_hierarchy_double_ring.h | 4 ++-- .../91093/allreduce_hierarchy_double_ring.h | 2 +- .../91093/reduce_scatter_big_data_91093_4step.h | 4 ++-- comm/lcal/src/ascendc_kernels/allreduce_quant.h | 4 ++-- comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h | 4 ++-- comm/lcal/src/ascendc_kernels/lccl_op.h | 8 ++++---- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h index 8cf0aa9c..c1da7cc9 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h @@ -146,11 +146,11 @@ private: if (groupCoreIdx[idx] % SIO == rank % SIO) { if (idx > 0) { sync.WaitSyncFlag(magic, sliceIdx + sliceNum * (idx - 1), - groupCoreIdx[idx - 1] + flagNumPerStage, rank); + groupCoreIdx[idx - 1] + flagNumPerStage, rank); } srcInnerQue[idx].DeQue(rank, groupCoreIdx[idx] + flagNumPerStage); writeGt = srcInnerQue[idx].EnQue(); - if(copyLen > 0) { + if (copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx], rank); } @@ -161,7 +161,7 @@ private: } SrcSioQue[idx].DeQue(sioRank, groupCoreIdx[idx] + (rank - sioRank) + flagNumPerStage); writeGt = SrcSioQue[idx].EnQue(); - if(copyLen > 0) { + if (copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, 
Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, groupCoreIdx[idx] + (rank - sioRank), sioRank); } diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h index 46bbccfe..3ea12ebf 100644 --- a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h +++ b/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h @@ -128,7 +128,7 @@ private: ipcDataNumPreBlock); srcLocalQue.DeQue(rank, flagIdx + (srcRank / SIO) * coreNumPerStage + flagNumPerStage); writeGt = srcLocalQue.EnQue(); - if(copyLen > 0) { + if (copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, flagIdx, rank); } @@ -143,7 +143,7 @@ private: (groupCoreIdx - singleStage) * ipcDataNumPreBlock, ipcDataNumPreBlock); srcSioQue.DeQue(sioRank, flagIdx + (sioSrcRank / SIO) * coreNumPerStage + flagNumPerStage); writeGt = srcSioQue.EnQue(); - if(copyLen > 0) { + if (copyLen > 0) { CpGM2GMPingPong(copyLen * sizeof(T), inputGt[sliceIdx * perQueElemLen], writeGt, Op::COPYONLY); sync.SetSyncFlag(magic, sliceIdx + sliceNum * idx, flagIdx, sioRank); } diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h index c8b363a8..babfaa52 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h @@ -120,7 +120,7 @@ private: remainSize -= blockSize; } else { if (i == 1) { - sync.WaitSyncFlag(magic, 0 , stageEvents[static_cast(STAGE::HCCS_RING)], rankRingForward); + sync.WaitSyncFlag(magic, 0, stageEvents[static_cast(STAGE::HCCS_RING)], rankRingForward); waitFlag = sync.GetInnerFlag(rankRingForward, stageEvents[static_cast(STAGE::HCCS_RING)]) & EVENT_ID_MASK; } @@ -139,7 +139,7 @@ private: if (countRankId != rank) { if ((rank + (i + 1) * RING_NUM) % rankSize == rank) { queHccsForward.ReadFront(); - sync.SetSyncFlag(magic, i , sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), rank); + sync.SetSyncFlag(magic, i, sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), rank); } else { sync.SetSyncFlag(magic, i - 1, sync.CalEventIdByMulBlockNum(RING_EVENT, blockIdx), rank); } diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h index bdb8fd48..8fc88149 100644 --- a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h +++ b/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h @@ -189,7 +189,7 @@ private: dmaSizePerCore * localBlockIdx, ipcBlockNum * IPC_QUE_DEPTH, ipcBlockNum); ringGatherSrcQue.Init(&sync, magic, shareAddrs[ringPrevRankId] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, - ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); + ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); ringGatherDstQue.Init(&sync, magic, shareAddrs[rank] + IPC_DATA_OFFSET + IPC_QUE_DEPTH * ipcBlockSize + dmaSizePerCore * localBlockIdx, ipcBlockNum * RING_GATHER_QUE_DEPTH, ipcBlockNum); diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index efc6cdcf..325f1ad3 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ 
b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -228,7 +228,7 @@ public: } } - __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, + __aicore__ inline void SioAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock, int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; @@ -248,7 +248,7 @@ public: } } - __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock,int *waitWriteRank, + __aicore__ inline void HccsAtomicToIpcProcess(int *waitReadRank, int *waitReadBlock, int *waitWriteRank, int *waitWriteBlock, int waitCount) { int processBlockNum = ipcBlockNum; diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/comm/lcal/src/ascendc_kernels/allreduce_quant.h index 57804383..aa749abc 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_quant.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_quant.h @@ -23,7 +23,7 @@ public: FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GlobalTensor& inputGT, const uint32_t calCount, int op, T scale, T offset) { - DataCopyGM2GM cpKernel; + DataCopyGM2GM cpKernel; cpKernel.Init(outputGT, inputGT, calCount, op); cpKernel.Process(scale, offset); } @@ -32,7 +32,7 @@ public: FORCE_INLINE_AICORE void CpGM2GM(const GlobalTensor& outputGT, const GlobalTensor& inputGT, const uint32_t calCount, int op, const GlobalTensor& scaleGT, int64_t scaleCount, T offset) { - DataCopyGM2GM cpKernel; + DataCopyGM2GM cpKernel; cpKernel.Init(outputGT, inputGT, calCount, op); cpKernel.Process(scaleGT, scaleCount, offset); } diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h index b6b91715..318f679d 100644 --- a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h +++ b/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h @@ -84,7 +84,7 @@ template __aicore__ inline void DataCopyWrap(const GlobalTensor &dstGlobal, const LocalTensor &srcLocal, const uint32_t size) { - if(size % UB_ALIGN_SIZE == 0) { + if (size % UB_ALIGN_SIZE == 0) { DataCopy(dstGlobal, srcLocal, size / sizeof(T)); } else { DataCopyExtParams copyParams{1, size, 0, 0, 0}; @@ -96,7 +96,7 @@ template __aicore__ inline void DataCopyWrap(const LocalTensor &dstLocal, const GlobalTensor &srcGlobal, const uint32_t size) { - if(size % UB_ALIGN_SIZE == 0) { + if (size % UB_ALIGN_SIZE == 0) { DataCopy(dstLocal, srcGlobal, size / sizeof(T)); } else { DataCopyExtParams copyParams{1, size, 0, 0, 0}; diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index 82142253..1e7d4e35 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -78,7 +78,7 @@ extern "C" __global__ __aicore__ void LcalBroadcast##suffix(KERNELS_ARGS_FUN()) } #define LCCL_ALLGATHER_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_FUN()) { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ constexpr int32_t quickOneshotRankSize = 2; \ @@ -110,7 +110,7 @@ extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_ } else { \ LcalAllGatherBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ - }\ + } \ } \ } @@ -183,7 +183,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ } else { \ LcalAllReduceBigData(ALLREDUCE_ARGS_CALL_16P(type)); \ } \ - }\ + } \ } \ } @@ -241,6 +241,6 @@ 
extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A LcalReduceScatterBigData(ALLREDUCE_ARGS_CALL(type)); \ } \ } \ - }\ + } \ } #endif -- Gitee From d60d34648fcb8057f2fd97b34ab9071a778e9b4f Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 16:18:40 +0800 Subject: [PATCH 372/414] 9 --- .../ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h index 325f1ad3..d97e24a7 100644 --- a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h +++ b/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h @@ -154,7 +154,7 @@ public: if ((blockIdx * stepOneOriginRankPerCore + i) % NUM_OF_TWO == rank % NUM_OF_TWO) { if ((blockIdx * stepOneOriginRankPerCore + i) == rank) { waitWriteRankArr[i] = rank; - waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * 4; + waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_FOUR; } else { waitWriteRankArr[i] = blockIdx * stepOneOriginRankPerCore + i; waitWriteBlockArr[i] = PER_STEP_BLOCKNUM * NUM_OF_TWO + ((rank / NUM_OF_TWO) / -- Gitee From 5ccf9ab554b034c5d169e20bf72c196973b4fc92 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Fri, 29 Aug 2025 16:25:50 +0800 Subject: [PATCH 373/414] 9 --- comm/lcal/src/ascendc_kernels/lccl_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index 1e7d4e35..d2e31517 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -115,7 +115,7 @@ extern "C" __global__ __aicore__ void LcalAllGather_##type##suffix(KERNELS_ARGS_ } #define LCCL_ALL_REDUCE_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_FUN()) { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ constexpr int32_t quickOneshotRankSize = 2; \ @@ -188,7 +188,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ } #define LCCL_ALL2ALL_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNELS_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNELS_ARGS_FUN()) { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ @@ -209,7 +209,7 @@ extern "C" __global__ __aicore__ void LcalAll2All_##type##suffix(KERNELS_ARGS_FU } #define LCCL_REDUCE_SCATTER_FUNC_AUTO_DEF(type, suffix) \ -extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_ARGS_FUN()) {\ +extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_ARGS_FUN()) { \ if ASCEND_IS_AIV { \ GET_COMM_ARGS; \ constexpr int32_t quickOneshotRankSize = 2; \ @@ -235,7 +235,7 @@ extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A } else { \ if (rankSize == quickOneshotRankSize && len * sizeof(type) < SIZE_OF_8M) { \ LcalReduceScatterWrite(ALLREDUCE_ARGS_CALL(type)); \ - } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize){\ + } else if (rankSize > quickOneshotRankSize && len * sizeof(type) < cceSmallDataSize){ \ LcalReduceScatter(ALLREDUCE_ARGS_CALL(type)); \ } else { \ LcalReduceScatterBigData(ALLREDUCE_ARGS_CALL(type)); \ -- 
Gitee From 4bf05fc9066a009c88ff0e44fb44cc6c380facf5 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 09:58:42 +0800 Subject: [PATCH 374/414] fix --- comm/lcal/include/lcal_types.h | 32 +- comm/lcal/include/lcoc/lcoc_args.h | 12 +- comm/lcal/include/lcoc/lcoc_base.h | 2 +- comm/lcal/include/lcoc/lcoc_workspace.h | 20 +- comm/lcal/include/lcoc/tiling/tiling.h | 26 +- comm/lcal/include/lcoc/tiling/tiling_91093.h | 2 +- comm/lcal/include/lcoc/tiling/tiling_910B.h | 2 +- comm/lcal/include/lcoc/tiling/tiling_args.h | 4 +- comm/lcal/src/kernels/CMakeLists.txt | 2 +- comm/lcal/src/kernels/coc_add_bias_runner.cce | 60 ++- .../kernels/coc_allgather_reducescatter.cce | 463 ++++++++++++++++++ comm/lcal/src/kernels/coc_allreduce.cce | 176 ++----- comm/lcal/src/kernels/coc_comm_base.cce | 40 +- comm/lcal/src/kernels/coc_const_args.cce | 10 +- comm/lcal/src/kernels/coc_dequant_runner.cce | 6 +- comm/lcal/src/kernels/coc_postprocessor.cce | 2 +- comm/lcal/src/kernels/coc_ppmatmul.cce | 70 +-- comm/lcal/src/kernels/coc_ppmatmul_switch.cce | 10 +- comm/lcal/src/kernels/coc_preprocessor.cce | 11 +- comm/lcal/src/lcoc.cpp | 34 +- 20 files changed, 681 insertions(+), 303 deletions(-) create mode 100644 comm/lcal/src/kernels/coc_allgather_reducescatter.cce diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index 10ee60ff..d67aad78 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -68,26 +68,17 @@ enum class LcalType { ALL_GATHER = 3, BROADCAST = 4, ALL2ALL = 5, - ALL_REDUCE_910B2C = 6, - ALL_GATHER_910B2C = 7, + ALL2ALL_V_C = 6, + GATHER = 7, LOCAL_REDUCE = 8, SEND = 9, RECV = 10, - ALL2ALL_V_C = 11, - GATHER = 12, PURE_MATMUL = 101, MATMUL_ALL_REDUCE = 102, - MATMUL_REDUCE_SCATTER = 103, - ALL_GATHER_MATMUL = 104, - ALL_GATHER_MATMUL_V2 = 105, - ALL2ALL_MATMUL = 106, - MATMUL_ALL2ALL = 107, + MTE2_TEST = 108, ALL_GATHER_MATMUL_REDUCE_SCATTER = 111, BANDWIDTH = 201, - ALLTOALLV_ALLGATHER_MATMUL = 305, - ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN = 309, - MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN = 310, - LCAL_TYPE_MAX = 311, + LCAL_TYPE_MAX = 311 }; const std::map LCAL_TYPE2NAME = { @@ -97,20 +88,15 @@ const std::map LCAL_TYPE2NAME = { { LcalType::BROADCAST, "LcalBroadcast" }, { LcalType::PURE_MATMUL, "LcalPureMatmul" }, { LcalType::MATMUL_ALL_REDUCE, "LcalMatmulAllReduce" }, - { LcalType::MATMUL_REDUCE_SCATTER, "LcalMatmulReduceScatter" }, - { LcalType::ALL_GATHER_MATMUL, "LcalAllGatherMatmul" }, - { LcalType::ALL_GATHER_MATMUL_V2, "LcalAllGatherMatmulV2" }, - { LcalType::ALL2ALL_MATMUL, "LcalAll2AllMatmul" }, - { LcalType::MATMUL_ALL2ALL, "LcalMatmulAll2All" }, + { LcalType::MTE2_TEST, "LcalMTE2Test" }, { LcalType::ALL2ALL, "LcalAll2All" }, { LcalType::ALL2ALL_V_C, "LcalAll2AllVC" }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER, "LcalAllGatherMatmulReduceScatter" }, { LcalType::BANDWIDTH, "LcalBandwidthTest" }, - { LcalType::ALL_REDUCE_910B2C, "LcalAllReduce910B2C" }, - { LcalType::ALL_GATHER_910B2C, "LcalAllGather910B2C" }, - { LcalType::ALLTOALLV_ALLGATHER_MATMUL, "LcalAllToAllVAllGatherMatmul" }, - { LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN, "LcalAllToAllVAllGatherMatmulHidden" }, - { LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN, "LcalMatmulReduceScatterAllToAllVHidden" } + { LcalType::LOCAL_REDUCE, "LcalLcalReduce" }, + { LcalType::GATHER, "LcalGather" }, + { LcalType::SEND, "LcalSend" }, + { LcalType::RECV, "LcalRecv" } }; diff --git a/comm/lcal/include/lcoc/lcoc_args.h b/comm/lcal/include/lcoc/lcoc_args.h index f496a86d..6a7775d9 
100644 --- a/comm/lcal/include/lcoc/lcoc_args.h +++ b/comm/lcal/include/lcoc/lcoc_args.h @@ -52,11 +52,11 @@ namespace Lcal { }; const std::map COC_TYPE2HCCL_TYPE = { - { FP16FP16_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16BF16_FP32_BF16, HCCL_DATA_TYPE_BP16 }, - { INT8INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { INT8INT8_INT32_BF16, HCCL_DATA_TYPE_BP16 }, - { FP16INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_INT32_BF16, HCCL_DATA_TYPE_BP16 }, - { FP16INT8_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_FP32_BF16, HCCL_DATA_TYPE_BP16 }, - { FP16INT4_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT4_FP32_BF16, HCCL_DATA_TYPE_BP16 } + { FP16FP16_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16BF16_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, + { INT8INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { INT8INT8_INT32_BF16, HCCL_DATA_TYPE_BFP16 }, + { FP16INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_INT32_BF16, HCCL_DATA_TYPE_BFP16 }, + { FP16INT8_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, + { FP16INT4_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT4_FP32_BF16, HCCL_DATA_TYPE_BFP16 } }; struct CoCParamDesc { @@ -82,7 +82,7 @@ namespace Lcal { struct CoCOutputPkg { void *output = nullptr; - void *minOutput = nullptr; + void *midOutput = nullptr; }; struct TaskParam { diff --git a/comm/lcal/include/lcoc/lcoc_base.h b/comm/lcal/include/lcoc/lcoc_base.h index 2dd25d84..7979501d 100644 --- a/comm/lcal/include/lcoc/lcoc_base.h +++ b/comm/lcal/include/lcoc/lcoc_base.h @@ -45,7 +45,7 @@ struct TwoDimTPInfo { struct QuantInfo { QuantGranularity dequantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED; - int32_t dequantGroupSize = -1 + int32_t dequantGroupSize = -1; QuantGranularity quantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED; int32_t quantGroupSize = -1; diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h index f254d951..bb0b8fa1 100644 --- a/comm/lcal/include/lcoc/lcoc_workspace.h +++ b/comm/lcal/include/lcoc/lcoc_workspace.h @@ -10,7 +10,7 @@ #ifndef LCAL_LCOC_WORKSPACE_H #define LCAL_LCOC_WORKSPACE_H -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV_C310__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV__C310__) #define __aicore__ #define GM_ADDR int64_t #endif @@ -18,7 +18,7 @@ struct LcalWorkspaceInfo { GM_ADDR gm_reducebuf{ 0 }; GM_ADDR gm_a_align{ 0 }; - GM_ADDR gm_b_algn{ 0 }; + GM_ADDR gm_b_align{ 0 }; GM_ADDR gm_accum{ 0 }; GM_ADDR gm_formate_dequant_scale{ 0 }; GM_ADDR gm_dequant_param{ 0 }; @@ -31,7 +31,7 @@ inline __aicore__ int32_t AlignUp(int32_t len, int32_t size) return (len + size -1) & ~(size - 1); } -#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV_C310__) +#if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV__C310__) inline uint64_t GetDequantWorkSpaceSize(Lcal::LcalType lcalType, int32_t withSerialMode, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t pValue, int32_t nLoop, int32_t rankSize, int32_t blockDim, int32_t maxOutputSize = -1) @@ -51,7 +51,7 @@ inline uint64_t GetDequantWorkSpaceSize(Lcal::LcalType lcalType, int32_t withSer } #endif -inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpcae, int32_t batchSize, int32_t m, +inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, int32_t batchSize, int32_t m, int32_t 
k, int32_t n, int32_t mAlign, int32_t kAlign, int32_t nAlign, bool transa, bool transb, int32_t mmadSize, bool hasAAlign, bool hasBAlign, int32_t accumRankSize, bool hasAccum = false, uint64_t dequantWorkSpaceSize = 0, bool hasDequantParam = false, bool hasFormatDequantScale = false, @@ -64,29 +64,29 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpcae, in } constexpr int32_t ALIGN8 = 8; LcalWorkspaceInfo lcalWorkspaceInfo; - lcalWorkspaceInfo.gm_reducebuf = gmWorkSpcae; - GM_ADDR workspaceOffset = gmWorkSpcae; + lcalWorkspaceInfo.gm_reducebuf = gmWorkSpace; + GM_ADDR workspaceOffset = gmWorkSpace; if (isDeterministic) { workspaceOffset += WORKSPACE_REDUCE_SIZE; } if (hasAAlign) { lcalWorkspaceInfo.gm_a_align = workspaceOffset; - workspaceOffset += static_cast(batchSize) * (transa ? k * mAlign : m : kAlign) * mmadSize; + workspaceOffset += static_cast(batchSize) * (transa ? k * mAlign : m * kAlign) * mmadSize; } if (hasBAlign) { - lcalWorkspaceInfo.gm_b_algn = workspaceOffset; + lcalWorkspaceInfo.gm_b_align = workspaceOffset; workspaceOffset += static_cast(batchSize) * (transb ? n * kAlign : k * nAlign) * mmadSize * (expertPerRank <= 0 ? 1 : expertPerRank); } - if (hasDequantParam) { + if (!isMoe && hasDequantParam) { lcalWorkspaceInfo.gm_dequant_param = workspaceOffset; workspaceOffset += sizeof(int32_t) * AlignUp(n, ALIGN8); } - if (hasFormatdequantscale) { + if (hasFormatDequantScale) { lcalWorkspaceInfo.gm_formate_dequant_scale = workspaceOffset; workspaceOffset += sizeof(float) * AlignUp(n, ALIGN8); } diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/comm/lcal/include/lcoc/tiling/tiling.h index a41a8d94..66ec9da5 100644 --- a/comm/lcal/include/lcoc/tiling/tiling.h +++ b/comm/lcal/include/lcoc/tiling/tiling.h @@ -33,32 +33,32 @@ protected: CoCTilingData cocTilingData = {}; }; -class CoCMatmullReduceTilingFunc : public CoCTilingFunc { +class CoCMatmulAllReduceTilingFunc : public CoCTilingFunc { public: - CoCMatmullReduceTilingFunc(const CoCMatmullReduceTilingFunc &) = delete; - CoCMatmullReduceTilingFunc &operator = (const CoCMatmullReduceTilingFunc &) = delete; - CoCMatmullReduceTilingFunc() {} + CoCMatmulAllReduceTilingFunc(const CoCMatmulAllReduceTilingFunc &) = delete; + CoCMatmulAllReduceTilingFunc &operator = (const CoCMatmulAllReduceTilingFunc &) = delete; + CoCMatmulAllReduceTilingFunc() {} bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; }; -class CoCMatmullReduceDeterTilingFunc : public CoCTilingFunc { +class CoCMatmulAllReduceDeterTilingFunc : public CoCMatmulAllReduceTilingFunc { public: - CoCMatmullReduceDeterTilingFunc(const CoCMatmullReduceDeterTilingFunc &) = delete; - CoCMatmullReduceDeterTilingFunc &operator = (const CoCMatmullReduceDeterTilingFunc &) = delete; - CoCMatmullReduceDeterTilingFunc() {} + CoCMatmulAllReduceDeterTilingFunc(const CoCMatmulAllReduceDeterTilingFunc &) = delete; + CoCMatmulAllReduceDeterTilingFunc &operator = (const CoCMatmulAllReduceDeterTilingFunc &) = delete; + CoCMatmulAllReduceDeterTilingFunc() {} bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; }; -class CoCAllgatherMatnulReduceScatterTilingFunc : public CoCTilingFunc { +class CoCAllgatherMatmulReduceScatterTilingFunc : public CoCTilingFunc { public: - CoCAllgatherMatnulReduceScatterTilingFunc(const CoCAllgatherMatnulReduceScatterTilingFunc &) = delete; - CoCAllgatherMatnulReduceScatterTilingFunc &operator = (const
CoCAllgatherMatnulReduceScatterTilingFunc &) = delete; - CoCAllgatherMatnulReduceScatterTilingFunc() {} + CoCAllgatherMatmulReduceScatterTilingFunc(const CoCAllgatherMatmulReduceScatterTilingFunc &) = delete; + CoCAllgatherMatmulReduceScatterTilingFunc &operator = (const CoCAllgatherMatmulReduceScatterTilingFunc &) = delete; + CoCAllgatherMatmulReduceScatterTilingFunc() {} bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; }; } -#endif \ No newline at end of file +#endif // LCAL_TILING_H \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/comm/lcal/include/lcoc/tiling/tiling_91093.h index 331ae5ec..e3848d06 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_91093.h +++ b/comm/lcal/include/lcoc/tiling/tiling_91093.h @@ -17,6 +17,6 @@ namespace Lcal { void AllReduceNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); void CoCAllgatherMatmulReduceScatterAgEightRsTwoTiling(CoCTilingData &cocTilingData); - void CoCAllgatherMatmulReduceScatterDefaultTiling(CoCTilingData &cocTilingData); + void CoCAllgatherMatmulReduceScatterDefaultTiling(CoCTilingData &cocTilingData, int32_t rsDim); } #endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_910B.h b/comm/lcal/include/lcoc/tiling/tiling_910B.h index 8c75adbd..59e684f4 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_910B.h +++ b/comm/lcal/include/lcoc/tiling/tiling_910B.h @@ -21,7 +21,7 @@ namespace Lcal { void AllReduceTwoRankFP16Tiling(CoCTilingData &cocTilingData); void ReduceScatterEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); - void ReduceScatterFoutRankINT8Tiling(CoCTilingData &cocTilingData); + void ReduceScatterFourRankINT8Tiling(CoCTilingData &cocTilingData); } #endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/comm/lcal/include/lcoc/tiling/tiling_args.h index 87b21e81..e2222b7a 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_args.h +++ b/comm/lcal/include/lcoc/tiling/tiling_args.h @@ -121,7 +121,7 @@ namespace Lcal { int64_t n = -1; int64_t batchSize = -1; - int32_t blockDim = -1 + int32_t blockDim = -1; int32_t rank = -1; int32_t rankSize = -1; int32_t tag = -1; @@ -136,7 +136,7 @@ namespace Lcal { void SetDefaultValue(); }; - struct CoCkernelParm { + struct CoCKernelParam { CoCTilingData cocTilingData = {}; QuantInfo quantInfo = {}; TwoDimTPInfo twoDimTPInfo = {}; diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/comm/lcal/src/kernels/CMakeLists.txt index a45f05f2..a4669b02 100644 --- a/comm/lcal/src/kernels/CMakeLists.txt +++ b/comm/lcal/src/kernels/CMakeLists.txt @@ -8,7 +8,7 @@ # See LICENSE in the root of the software repository for the full text of the License. 
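# [Editor's sketch - not part of the original patch] The set(OP_NAMES ...) change below
# trims the kernel list to the two fused ops this series keeps building. Under the usual
# AscendC CMake pattern (an assumption here; the real rule lives in ascendc.cmake), each
# entry maps to one .cce translation unit, roughly:
#   foreach(op_name IN LISTS OP_NAMES)
#     ascendc_compile_kernel(lcoc_${op_name} coc_${op_name}.cce)  # hypothetical helper name
#   endforeach()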
# include(../ascendc.cmake) -set(OP_NAMES pure_matmul matmul_allreduce matmul_reduce_scatter allgather_matmul allgather_matmul_reduce_scatter alltoallv_allgather_matmul matmul_reduce_scatter_alltoallv) +set(OP_NAMES matmul_allreduce allgather_matmul_reduce_scatter) file(GLOB KERNEL_FILES *.cpp) set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE) diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/comm/lcal/src/kernels/coc_add_bias_runner.cce index ed887ed2..066f7097 100644 --- a/comm/lcal/src/kernels/coc_add_bias_runner.cce +++ b/comm/lcal/src/kernels/coc_add_bias_runner.cce @@ -20,7 +20,7 @@ enum class BiasMode { ADD = 0, MOVE, ATOMIC_ADD }; template class BaseSerialBiasAdder { public: - __aicore__ explict BaseSerialBiasAdder() = default; + __aicore__ explicit BaseSerialBiasAdder() = default; inline __aicore__ void SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) { @@ -32,12 +32,12 @@ public: this->n = n; int32_t align_core_num = get_block_num() * get_subblockdim(); - int32_t align_core_dix = get_block_dix() * get_subblockdim() + get_subblockid(); + int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); if constexpr (MODE == BiasMode::MOVE || MODE == BiasMode::ATOMIC_ADD) { max_len = Block32B::AlignDown(MAX_UB_BUFF / sizeof(OutputDtype)); } else if constexpr (MODE == BiasMode::ADD) { - max_len = Block32B::AlignDown(MAX_UB_BUFF / sizeof(OutputDtype) * 3); + max_len = Block32B::AlignDown(MAX_UB_BUFF / (sizeof(OutputDtype) * 3)); } int32_t n_round = Block32B::AlignUp(n); @@ -46,10 +46,10 @@ public: int32_t m_per_core_base = m / align_core_num; int32_t m_remainder = m % align_core_num; - int32_t m_offset_base = align_core_dix * m_per_core_base; - if (align_core_dix < m_remainder) { + int32_t m_offset_base = align_core_idx * m_per_core_base; + if (align_core_idx < m_remainder) { m_this_core = m_per_core_base + 1; - m_offset_this_core = m_offset_base + align_core_dix; + m_offset_this_core = m_offset_base + align_core_idx; } else { m_this_core = m_per_core_base; m_offset_this_core = m_offset_base + m_remainder; @@ -60,6 +60,8 @@ public: { if constexpr (MODE == BiasMode::ADD) { AddBias(); + } else if constexpr (MODE == BiasMode::MOVE) { + MoveBias(); } else if constexpr (MODE == BiasMode::ATOMIC_ADD) { SetAtomicAdd(); PipeBarrier(); @@ -78,20 +80,20 @@ public: private: inline __aicore__ void AddBias() { - if constexpr (MODE != BiasMode:ADD) { + if constexpr (MODE != BiasMode::ADD) { return; } auto ub_bias = reinterpret_cast<__ubuf__ OutputDtype *>((uintptr_t)0); - auto ub_out1 = reinterpret_cast<_ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype))); - auto ub_out2 = reinterpret_cast<_ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype) * 2)); + auto ub_out1 = reinterpret_cast<__ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype))); + auto ub_out2 = reinterpret_cast<__ubuf__ OutputDtype *>((uintptr_t)(max_len * sizeof(OutputDtype) * 2)); bool ping = true; for (int32_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { for (int32_t n_complete = 0, n_this_loop = n_per_loop; n_complete < n; n_complete += n_this_loop) { n_this_loop = (n_complete + n_this_loop > n) ? 
(n - n_complete) : n_this_loop; - // MTE2 ub_bias <- gm_bias + // MTE2: ub_bias <- gm_bias CopyGmToUbufAlign(ub_bias, gm_bias + n_complete, 1, n_this_loop, 0); SetFlag(EVENT_ID0); @@ -102,7 +104,7 @@ private: PipeBarrier(); SetFlag(EVENT_ID1); - WaitFlag(EVENT_ID2); + SetFlag(EVENT_ID2); ProcessMLoop(n_complete, n_this_loop, ub_out1, ub_out2, ping, ub_bias); @@ -191,7 +193,7 @@ private: n_this_loop = n - n_complete; } - // MTEs: ub_base <- gm_bias + // MTE2: ub_base <- gm_bias CopyGmToUbufAlign(ub_base, gm_bias + n_complete, 1, n_this_loop, 0); SetFlag(EVENT_ID0); @@ -257,7 +259,7 @@ public: inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) { - base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()); + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); } inline void __aicore__ Run() @@ -277,7 +279,7 @@ public: inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) { - base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()); + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); } inline void __aicore__ Run() @@ -287,5 +289,31 @@ public: } private: - BaseSerialBiasAdder base_adder; -}; \ No newline at end of file + BaseSerialBiasAdder base_adder; +}; + +template +class AllGatherMatmulBiasAdder { + static constexpr auto MODE = std::is_same::value ? BiasMode::ADD : BiasMode::ATOMIC_ADD; + +public: + __aicore__ explicit AllGatherMatmulBiasAdder() = default; + + inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) + { + m = m * rank_size; + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); + } + + inline void __aicore__ Run() + { + base_adder.Run(); + base_adder.Barrier(); + } + +private: + BaseSerialBiasAdder base_adder; +}; + +#endif +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce new file mode 100644 index 00000000..90dec967 --- /dev/null +++ b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __DAV_C220_VEC__ +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; +template +class AllGatherReduceScatter : public CocCommBase { +public: + __aicore__ explicit AllGatherReduceScatter() {}; + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) { + CocCommBase::SetArgsForReduce(COC_ARGS_CALL()); + preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL()); + if constexpr (HAVE_BIAS) { + add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); + } + + m_align = (m + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; + k_align = (k + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; + n_align = (n + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; + AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); + this->gm_a = aligned_a ?
reinterpret_cast<__gm__ T *>(workspace_info.gm_a_align) : gm_a; + if (inner_dim_is_Ag) { + this->rank_ag_idx = rank % ag_dim; + this->rank_rs_idx = rank / ag_dim; + this->other_rank_ag_idx = other_rank % ag_dim; + this->other_rank_rs_idx = other_rank / ag_dim; + } else { + this->rank_ag_idx = rank % rs_dim; + this->rank_rs_idx = rank / rs_dim; + this->other_rank_ag_idx = other_rank % rs_dim; + this->other_rank_rs_idx = other_rank / rs_dim; + } + + twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim; + gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim; + gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; + m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); + m_per_bigdim = m * ag_dim / twod_big_dim; + comm_count = DivCeil(batch_size * m_loop_per_bigdim, p_value); + ag_part_dim = twod_big_dim / ag_dim; + rs_part_dim = twod_big_dim / rs_dim; + + ag_comm_npu_split = comm_npu_split; + ag_comm_data_split = comm_data_split; + ag_len_per_loop = len_per_loop; + ag_comm_direct = comm_direct; + + rs_comm_npu_split = extra_comm_npu_split; + rs_comm_data_split = extra_comm_data_split; + rs_len_per_loop = extra_len_per_loop; + + ag_core_count = ag_comm_npu_split * ag_comm_data_split; + rs_core_count = rs_comm_npu_split * rs_comm_data_split; + + ag_max_ub_ping_pong_size = (max_ub_ping_pong_size / 2) / n0 * n0; + rs_max_ub_ping_pong_size = (extra_ub_move_num / 2) / n0 * n0; + } + + FORCE_INLINE_AICORE void CopyGmToGm(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { + auto ub0 = output_UB_T[0]; + auto ub1 = output_UB_T[1]; + int32_t interm_offset = 0; + for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { + uint32_t data_size = interm_offset + ag_max_ub_ping_pong_size < copy_size ? ag_max_ub_ping_pong_size : copy_size - interm_offset; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub = (move_idx & 1) ? ub0 : ub1; + WaitFlag(event_id); + CopyGmToUbuf(ub, gm_src + interm_offset, 1, data_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + CopyUbufToGm(gm_dst + interm_offset, ub, 1, data_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + interm_offset += data_size; + } + } + + FORCE_INLINE_AICORE + void MoveToOtherRankWithSkip(__gm__ T *gm_src, int32_t rank_offset, int32_t len, + int32_t rank_st, int32_t skip_num, int32_t group_num) + { + int32_t ping_pong_move_count = (len + ag_max_ub_ping_pong_size - 1) / ag_max_ub_ping_pong_size; + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_size = ag_max_ub_ping_pong_size; + if (move_idx == ping_pong_move_count - 1) { + actual_move_size = len - move_idx * ag_max_ub_ping_pong_size; + } + int32_t block_len = actual_move_size * sizeof(T) / 32; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ?
output_UB_T[0] : output_UB_T[1]; + WaitFlag(event_id); + CopyGmToUbuf(ub_buff_st, gm_src, 1, block_len, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + int32_t dst_rank = rank_st % ag_dim; + for (int32_t cycle_idx = 0; cycle_idx < group_num; ++cycle_idx) { + int32_t real_rank; + if (inner_dim_is_Ag) { + real_rank = dst_rank + rank / ag_dim * ag_dim; + } else { + real_rank = dst_rank * rs_dim + rank % rs_dim; + } + if (real_rank != rank && dst_rank < ag_dim) { + CopyUbufToGm(buff[real_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0); + } + dst_rank = (dst_rank + skip_num) % ag_dim; + } + gm_src += ag_max_ub_ping_pong_size; + rank_offset += ag_max_ub_ping_pong_size; + SetFlag(event_id); + } + } + + FORCE_INLINE_AICORE + void MoveWithSplit(__gm__ T *gm_src, int32_t rank_offset, int32_t len) + { + int32_t data_split = DivCeil(len, ag_len_per_loop); + int32_t data_block = ag_len_per_loop; + int32_t group_num = ag_dim / ag_comm_npu_split; + int32_t data_offset = -data_block; + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + for (int32_t data_block_idx = 0; data_block_idx < data_split; ++data_block_idx) { + data_offset += data_block; + data_block = data_block_idx == data_split - 1 ? len - data_offset : data_block; + int32_t num_per_core = DivCeil(data_block, ag_comm_data_split); + + int32_t data_src = data_offset + (core_idx / ag_comm_npu_split) * num_per_core; + int32_t data_len = data_block + data_offset - data_src; + data_len = data_len >= num_per_core ? num_per_core : data_len; + if (ag_comm_direct) { + MoveToOtherRankWithSkip(gm_src + data_src, rank_offset + data_src, data_len, + core_idx, ag_comm_npu_split, group_num); + continue; + } + int32_t dst_rank = core_idx % ag_dim; + for (int32_t rank_group_idx = 0; rank_group_idx < group_num; ++rank_group_idx) { + int32_t real_rank; + if (inner_dim_is_Ag) { + real_rank = dst_rank + rank / ag_dim * ag_dim; + } else { + real_rank = dst_rank * rs_dim + rank % rs_dim; + } + if (real_rank != rank && dst_rank < ag_dim) { + CopyGmToGm(gm_src + data_src, buff[real_rank] + rank_offset + data_src, data_len); + } + dst_rank = (dst_rank + ag_comm_npu_split) % ag_dim; + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } + + FORCE_INLINE_AICORE int32_t GetRealCoreIdx(int32_t index, int32_t rank_per_core) + { + int32_t core_index = core_idx - ag_core_count; + int32_t core_rank_offset = (core_index / rs_comm_data_split) * rank_per_core; + int32_t rank_idx_rot = (index + core_index) % rank_per_core; + int32_t real_core_idx = core_rank_offset + rank_idx_rot; + + return real_core_idx; + } + + FORCE_INLINE_AICORE void GetLenPerCore(int32_t rank_total, int32_t loop_index, int32_t &m_in_core, int32_t &rank_buff_offset) + { + int32_t core_index = core_idx - ag_core_count; + int32_t before_core_offset = rs_len_per_loop * rs_comm_data_split * loop_index; + int32_t loop_total = rank_total - before_core_offset; + int32_t real_core_offset = core_index % rs_comm_data_split * rs_len_per_loop; + + rank_buff_offset = before_core_offset + real_core_offset; + m_in_core = (real_core_offset >= loop_total) ? 0 : + ((real_core_offset + rs_len_per_loop) > loop_total ?
+ loop_total - real_core_offset : rs_len_per_loop); + } + + FORCE_INLINE_AICORE void FirstStepInOutWithSplit(int32_t rank_total, int32_t rank_buff_offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) + { + SetAtomicAdd(); + PipeBarrier(); + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + + int32_t rank_per_core = rs_dim / rs_comm_npu_split; + int32_t m_per_core = rank_total / rs_comm_data_split; + int32_t data_split_num = DivCeil(m_per_core, rs_len_per_loop); + for (int32_t loop_idx = 0; loop_idx < data_split_num; loop_idx++) { + int32_t m_in_core; + int32_t offset; + GetLenPerCore(rank_total, loop_idx, m_in_core, offset); + + for (int32_t rank_idx = 0; rank_idx < rank_per_core; rank_idx++) { + int32_t real_rank_idx_tmp = GetRealCoreIdx(rank_idx, rank_per_core); + int32_t real_rank_idx; + if (inner_dim_is_Ag) { + real_rank_idx = real_rank_idx_tmp * ag_dim + rank % ag_dim; + } else { + real_rank_idx = real_rank_idx_tmp + rank / rs_dim * rs_dim; + } + + if (real_rank_idx == rank) + continue; + + FirstStepInOut(m_in_core, buff[real_rank_idx], rank_buff_offset, offset, comm_idx, flag_idx, out_part_offset); + } + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + SetAtomicNone(); + PipeBarrier(); + } + + FORCE_INLINE_AICORE void FirstStepInOut(int32_t mat_blocks_size, __gm__ T *input, int32_t gm_offset, int32_t offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) { + int32_t ping_pong_move_count = DivCeil(mat_blocks_size, rs_max_ub_ping_pong_size); + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_size = rs_max_ub_ping_pong_size; + if (move_idx == ping_pong_move_count - 1) { + actual_move_size = mat_blocks_size - move_idx * rs_max_ub_ping_pong_size; + } + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; + WaitFlag(event_id); + CopyGmToUbuf(ub_buff_st, input + gm_offset + offset + move_idx * rs_max_ub_ping_pong_size, 1, actual_move_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); + WaitFlag(event_id); + int32_t move_num_offset = offset + move_idx * rs_max_ub_ping_pong_size; + auto ub_buff = ub_buff_st; + int32_t left_m = actual_move_size / n0; + while (left_m > 0) { + int32_t loop_idx = (move_num_offset / (m0 * n0)); + int32_t n_idx = loop_idx % n_loop; + int64_t m_idx = comm_idx * p_value + loop_idx / n_loop; + int32_t actual_m = (m_idx == (m_loop_per_bigdim - 1)) ? (m_per_bigdim - m_idx * m0) : m0; + int32_t actual_n = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + int32_t m_offset = (move_num_offset % (m0 * n0)) / n0; + int32_t actual_move_m; + if (m_offset >= actual_m) { + actual_move_m = m0 < m_offset + left_m ? m0 - m_offset : left_m; + } else { + actual_move_m = actual_m < m_offset + left_m ?
actual_m - m_offset : left_m; + int64_t out_buff_offset = (m_idx * m0 + m_offset) * n + n_idx * n0; + CopyUbufToGmUnknown(n % BLOCK_SIZE_16 == 0, gm_out + out_part_offset + out_buff_offset, + ub_buff, actual_move_m, actual_n * sizeof(T), (n0 - actual_n) * sizeof(T) / 32, (n - actual_n) * sizeof(T)); + } + left_m -= actual_move_m; + move_num_offset += actual_move_m * n0; + ub_buff += actual_move_m * n0; + } + SetFlag(event_id); + } + } + + FORCE_INLINE_AICORE void EndFlagsAndBias() + { + ResetIpcFlags(2); + + if (aiv_idx == 1 && core_idx < rank_size) { + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); + } + PipeBarrier(); + + if constexpr (HAVE_BIAS) { + add_bias_runner.Run(); + } + } + + FORCE_INLINE_AICORE void Run() { + preprocessor.Run(); + ResetIpcFlags(2); + PipeBarrier(); + int32_t twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim; + int64_t gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim; + int64_t gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; + int32_t m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); + int64_t m_per_bigdim = m * ag_dim / twod_big_dim; + int32_t comm_count = DivCeil(m_loop_per_bigdim, p_value); + int32_t ag_m = p_value * m0; + int32_t rs_p_value = p_value; + + for (int32_t comm_idx = 0; comm_idx < comm_count + MAX_BLOCK_COUNT; ++comm_idx) { + uint64_t flag_idx = comm_idx % MAX_BLOCK_COUNT; + int32_t commrs_idx = comm_idx - MAX_BLOCK_COUNT; + if (comm_idx == comm_count - 1) { + ag_m = m_per_bigdim - (comm_count - 1) * p_value * m0; + } + if (commrs_idx == comm_count - 1) { + rs_p_value = m_loop_per_bigdim - (comm_count - 1) * p_value; + } + if (commrs_idx >= 0) { + WaitEvent(flag_idx); + } + SetAndWaitAivSync(flag_idx); + CrossRankSyncV1(FLAG_ZERO_IDX, comm_idx + 1); + SetAndWaitAivSync(flag_idx); + if (comm_idx < comm_count && aiv_idx == 0 && core_idx < ag_comm_npu_split * ag_comm_data_split) { + for (int32_t ag_part_idx = 0; ag_part_idx < ag_part_dim; ag_part_idx++) { + int64_t src_offset = comm_idx * p_value * m0 * k_align + ag_part_idx * m_per_bigdim * k_align; + int32_t bigdim_idx = rank_ag_idx * ag_part_dim + ag_part_idx; + int32_t rank_offset = flag_idx * gm_a_pingpong_size + bigdim_idx * p_value * m0 * k_align; + MoveWithSplit(gm_a + src_offset, rank_offset, ag_m * k_align); + } + } + if (comm_idx >= MAX_BLOCK_COUNT && aiv_idx == 0 && core_idx >= ag_core_count && core_idx < ag_core_count + rs_core_count) { + for (int32_t rs_part_idx = 0; rs_part_idx < rs_part_dim; rs_part_idx++) { + int32_t bigdim_idx = rank_rs_idx * rs_part_dim + rs_part_idx; + int32_t rank_buff_offset = flag_idx * gm_c_pingpong_size + bigdim_idx * rs_p_value * m0 * n_loop * n0; + FirstStepInOutWithSplit(rs_p_value * m0 * n_loop * n0, LCAL_2DTP_C_OFFSET + rank_buff_offset, commrs_idx, flag_idx, m_per_bigdim * rs_part_idx * n); + } + } + + SetAndWaitAivSync(flag_idx); + CrossRankSyncV2(FLAG_ONE_IDX, comm_idx + 1); + + SetAndWaitAivSync(flag_idx); + + SetAicSync(flag_idx); + } + EndFlagsAndBias(); + } + +public: + using CocCommBase::SetAicSync; + using CocCommBase::SetAndWaitAivSync; + using CocCommBase::SetBuffFlag; + using CocCommBase::SetBuffFlagByAdd; + using CocCommBase::CheckBuffFlag; + using CocCommBase::ResetIpcFlags; + using CocCommBase::CrossRankSyncV1; + using CocCommBase::CrossRankSyncV2; + using CocCommBase::buff; + using CocCommBase::gm_out; + using CocCommBase::ctrl_flags_UB; + using CocCommBase::output_UB_T; + using CocCommBase::batch_size; + using CocCommBase::m; + using CocCommBase::k; +
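// [Editor's note - hedged sketch, not part of the original patch] Run() above pipelines
+ // three roles through MAX_BLOCK_COUNT IPC ping-pong windows: the AllGather AIV cores fill
+ // window (comm_idx % MAX_BLOCK_COUNT), the cube cores consume it, and the ReduceScatter
+ // cores drain the window of commrs_idx = comm_idx - MAX_BLOCK_COUNT. For example, assuming
+ // MAX_BLOCK_COUNT == 2, the windows rotate as:
+ //   comm_idx : 0 1 2 3 4
+ //   AG fills : 0 1 0 1 0   (while comm_idx < comm_count)
+ //   RS drains: - - 0 1 0   (once commrs_idx >= 0)
+ // The using-declarations below simply re-export CocCommBase state shared by these roles.
+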
using CocCommBase::n; + using CocCommBase::m0; + using CocCommBase::k0; + using CocCommBase::n0; + using CocCommBase::m_loop; + using CocCommBase::n_loop; + using CocCommBase::k_loop; + using CocCommBase::core_loop; + using CocCommBase::core_idx; + using CocCommBase::core_num; + using CocCommBase::rank; + using CocCommBase::rank_size; + using CocCommBase::tiling_key; + using CocCommBase::swizzl_count; + using CocCommBase::swizzl_direct; + using CocCommBase::trans_a; + using CocCommBase::trans_b; + using CocCommBase::is_int8; + using CocCommBase::is_91093; + using CocCommBase::p_value; + using CocCommBase::aiv_idx; + using CocCommBase::other_rank; + using CocCommBase::max_ub_single_dma_size; + using CocCommBase::withSerialMode; + using CocCommBase::tag; + using CocCommBase::loop_num_per_comm; + using CocCommBase::gm_c_pingpong_size; + using CocCommBase::dequant_granularity; + using CocCommBase::dequant_group_size; + using CocCommBase::quant_granularity; + using CocCommBase::quant_group_size; + using CocCommBase::workspace_info; + using CocCommBase::ag_dim; + using CocCommBase::rs_dim; + using CocCommBase::comm_npu_split; + using CocCommBase::comm_data_split; + using CocCommBase::comm_direct; + using CocCommBase::len_per_loop; + using CocCommBase::extra_comm_npu_split; + using CocCommBase::extra_comm_data_split; + using CocCommBase::extra_comm_direct; + using CocCommBase::extra_len_per_loop; + using CocCommBase::extra_ub_move_num; + using CocCommBase::weight_nz; + using CocCommBase::is_deterministic; + using CocCommBase::flag_offset; + int32_t m_align; + int64_t k_align; + int32_t n_align; + int32_t aligned_a; + int32_t aligned_b; + int32_t comm_count; + + int32_t ag_comm_npu_split; + int32_t ag_comm_data_split; + int32_t ag_len_per_loop; + int32_t ag_comm_direct; + + int32_t rs_comm_npu_split; + int32_t rs_comm_data_split; + int32_t rs_len_per_loop; + int32_t rs_comm_direct; + + int32_t ag_core_count; + int32_t rs_core_count; + + int32_t ag_max_ub_ping_pong_size; + int32_t rs_max_ub_ping_pong_size; + __gm__ T *gm_a; + + int32_t rank_ag_idx; + int32_t rank_rs_idx; + int32_t other_rank_ag_idx; + int32_t other_rank_rs_idx; + Preprocessor preprocessor; + AllGatherMatmulBiasAdder add_bias_runner; + + int32_t twod_big_dim; + int64_t gm_a_pingpong_size; + int32_t m_loop_per_bigdim; + int32_t m_per_bigdim; + int32_t ag_part_dim; + int32_t rs_part_dim; + +}; + +template +inline __aicore__ void CocAllGatherMatmulReduceScatterAiv(COC_ARGS_FUN(T)) { + AllGatherReduceScatter allgatherreducescatter_write_without_bias; + AllGatherReduceScatter allgatherreducescatter_write_with_bias; + + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = &para->cocTilingData; + int32_t tiling_key = cocTilingData->tilingKey; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + allgatherreducescatter_write_without_bias.SetArgs(COC_ARGS_CALL()); + allgatherreducescatter_write_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + allgatherreducescatter_write_with_bias.SetArgs(COC_ARGS_CALL()); + allgatherreducescatter_write_with_bias.Run(); + break; + default: + break; + } + PipeBarrier(); +} + +#endif \ No newline at end of
file diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/comm/lcal/src/kernels/coc_allreduce.cce index cc740acb..07ca30a2 100644 --- a/comm/lcal/src/kernels/coc_allreduce.cce +++ b/comm/lcal/src/kernels/coc_allreduce.cce @@ -43,10 +43,9 @@ public: } if (dequant_granularity == QuantGranularity::PER_TOKEN) { fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]), - reinterpret_cast<__gm__ int64_t *>(gm_quant_scale), - m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, - swizzl_count, p_value, rank_size); - serial_pertokrn_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ int64_t *>(gm_quant_scale), m, n, m0, n0); + reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, + m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size); + serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, m0, n0); } total_core_idx = aiv_idx * core_num + core_idx; cal_count = DivCeil(core_loop, loop_num_per_comm); @@ -115,7 +114,7 @@ public: SetFlag(event_id); WaitFlag(event_id); int32_t move_num_offset = gm_out_offset + move_idx * max_ub_ping_pong_size; - CopyUbufToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset); + CopyUbToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset); SetFlag(event_id); } EndFlagsAndBias(); @@ -147,7 +146,7 @@ public: actual_move_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); WaitFlag(event_id); - int32_t move_num_offset = gm_out_offset + move_idx * max_ub_ping_pong_size; + int64_t move_num_offset = other_rank_offset + move_idx * max_ub_ping_pong_size; CopyUbufToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset + cal_idx * gm_c_pingpong_size); SetFlag(event_id); } @@ -175,10 +174,10 @@ public: auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; WaitFlag(event_id); - CopyGmToUbuf(ub, input + offset, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); + CopyGmToUbuf(ub, input + offset, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); WaitFlag(event_id); - CopyGmToUbuf(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); + CopyUbufToGm(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); data_size_remain -= max_ub_ping_pong_size; offset += max_ub_ping_pong_size; @@ -191,7 +190,7 @@ public: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); if (ALIGN) { - CopyGmToUbuf(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); + CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); } else { CopyUbufToGmAlignB16(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T), 0, 0); } @@ -214,9 +213,9 @@ public: int32_t actual_loop_num = (cal_idx == cal_count -1) ? core_loop - cal_idx * loop_num_per_comm : loop_num_per_comm; int32_t m_total = actual_loop_num * m0; - m_per_rank = DivCeil(m_tatal, rank_size); + m_per_rank = DivCeil(m_total, rank_size); m_in_rank = (rank * m_per_rank >= m_total) ? 0 : - ((rank + 1) * m_per_rank > m_tatal ? m_total - rank * m_per_rank : m_per_rank); + ((rank + 1) * m_per_rank > m_total ? m_total - rank * m_per_rank : m_per_rank); WaitEvent(flag_idx); @@ -254,7 +253,7 @@ public: ((real_core_offset + len_per_loop) > loop_total ?
loop_total - real_core_offset : len_per_loop); - FirstStepDivCore(m_in_core, runk_buff_offset + before_core_offset + real_core_offset); + FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); } } WaitFlag(EVENT_ID0); @@ -285,124 +284,6 @@ public: } ResetIpcFlags(3); - if (aiv_idx == 0 && core_idx < rank_size) { - CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); - } - } - - FORCE_INLINE_AICORE void DataCopySioRs(int32_t cal_idx_sio, int32_t len_per_rank) { - int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; - int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; - int32_t sio_core_idx = total_core_idx - core_count; - int32_t core_offset = sio_core_idx * len_per_core; - int32_t sio_peer_rank = rank ^ 1; - for (int32_t src_rank = rank % 2; src_rank < rank_size; src_rank +=2) { - int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; - FirstStepInPeerMem(len_per_core, buff[sio_peer_rank] + peer_offset, buff[rank] + peer_offset); - } - } - - FORCE_INLINE_AICORE void DataCopySioAg(int32_t cal_idx_sio, int32_t len_per_rank) { - int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; - int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; - int32_t sio_core_idx = total_core_idx - core_count; - int32_t core_offset = sio_core_idx * len_per_core; - int32_t sio_peer_rank = rank ^ 1; - for (int32_t src_rank = sio_peer_rank % 2; src_rank < rank_size; src_rank +=2) { - int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank; - int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; - SecondStepParallel(len_per_core, buff[sio_peer_rank] + peer_offset + core_offset, dst_offset); - } - int32_t local_offset = flag_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; - int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; - SecondStepParallel(len_per_core, buff[rank] + local_offset, dst_offset); - } - - FORCE_INLINE_AICORE void ParallelSio() { - ResetIpcFlags(3); - PipeBarrier(); - int32_t last_loop_num = core_loop - (cal_count -1) * loop_num_per_comm; - int32_t core_group = GetCoreGroup(); - for (int32_t cal_idx = 0; cal_idx < cal_count + 2; ++cal_idx) { - int32_t hccs_idx = cal_idx -1; - int32_t sio2_idx = cal_idx -2; - int32_t flag_idx_sio1 = cal_idx % BLOCK_COUNT_4; - int32_t flag_idx_hccs = hccs_idx % BLOCK_COUNT_4; - int32_t flag_idx_sio2 = sio2_idx % BLOCK_COUNT_4; - int32_t loop_num_hccs = hccs_idx == cal_count -1 ? 
last_loop_num : loop_num_per_comm; - - if (cal_idx < cal_count) { - WaitEvent(flag_idx_sio1); - } - - if (need_dequant) { - fused_dequant_runner.RunDequantAllReduce(cal_idx); - } - - SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); - - CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); - - StartBeforeFisrtStep(flag_idx_sio1); - - if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { - int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; - int32_t rank_offset = rank * size_per_rank; - int32_t rank_buff_offset = flag_idx_hccs * gm_c_pingpong_size + rank_offset; - int32_t size_per_core = size_per_rank / (comm_data_split); - - int32_t data_split_num = DivCeil(size_per_core, len_per_loop); - - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); - for (int loop_index = 0; loop_index < data_split_num; loop_index++) { - int32_t before_core_offset = len_per_loop * comm_data_split * loop_index; - int32_t loop_total = size_per_rank -before_core_offset; - int32_t real_core_offset = core_idx % comm_data_split * len_per_loop; - - int32_t m_in_core = (real_core_offset >= loop_total) ? 0 : - ((real_core_offset + len_per_loop) > loop_total ? - loop_total -real_core_offset : len_per_loop); - - FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); - } - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); - } - if (core_group == 1 && cal_idx < cal_count) { - int32_t loop_num_sio1 = cal_idx == cal_count -1 ? last_loop_num : loop_num_per_comm; - int32_t size_per_rank = loop_num_sio1 * m0 * n0 / rank_size; - DataCopySioRs(cal_idx, size_per_rank); - } - - EndFirstStep(flag_idx_sio1); - - CrossRankSyncV1(FLAG_ONE_IDX, cal_idx + 1); - SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); - if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { - int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; - int32_t pipe_offset = flag_idx_hccs * gm_c_pingpong_size + other_rank * size_per_rank; - int32_t dst_offset = hccs_idx * gm_c_pingpong_size + other_rank * size_per_rank; - if ((other_rank % 2) == (rank % 2) && other_rank != rank) { - FirstStepInPeerMemTransLayout(size_per_rank, buff[other_rank] + pipe_offset, buff[rank] + pipe_offset, dst_offset); - } - } - if (core_group == 1 && cal_idx >= 2) { - int32_t loop_num_sio2 = sio2_idx == cal_count - 1 ? 
last_loop_num : loop_num_per_comm; - int32_t size_per_rank = loop_num_sio2 * m0 * n0 / rank_size; - DataCopySioAg(sio2_idx, size_per_rank); - } - SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); - CrossRankSyncV2(FLAG_TWO_IDX, cal_idx + 1); - - SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); - - if (cal_idx >= 2) { - SetAicSync(flag_idx_sio2); - } - } - ResetIpcFlags(3); - if (aiv_idx == 0 && core_idx < rank_size) { CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); } @@ -416,7 +297,7 @@ public: WaitEvent(AIV_FINISH_ALIGN_FLAG_ID); if (need_dequant) { - serial_dequant_runner.Run() + serial_dequant_runner.Run(); } if (aiv_idx == 1 && core_idx < rank_size) { int32_t data_size = batch_size * m * n; @@ -507,12 +388,12 @@ public: preprocessor.Run(); if constexpr (HAVE_BIAS) { - add_bias_runner.Run() + add_bias_runner.Run(); } if (withSerialMode) { if (is_deterministic) { - SerialWithSplit() + SerialWithSplit(); } else { Serial(); } @@ -525,7 +406,7 @@ public: PipeBarrier(); if (withSerialMode && dequant_granularity == QuantGranularity::PER_TOKEN) { - serial_pertokrn_dequant_runner.Run(); + serial_pertoken_dequant_runner.Run(); } } @@ -539,8 +420,7 @@ public: using CocCommBase::FirstStepInPeerMem; using CocCommBase::FirstStepInPeerMemSeq; using CocCommBase::FirstStepInPeerMemTree; - using CocCommBase::FirstStepInPeerMemTransLayout; - using CocCommBase::CopyUbufToGmTransLayout; + using CocCommBase::CopyUbToGmTransLayout; using CocCommBase::ResetIpcFlags; using CocCommBase::CrossRankSyncV1; using CocCommBase::CrossRankSyncV2; @@ -570,6 +450,7 @@ public: using CocCommBase::swizzl_direct; using CocCommBase::trans_a; using CocCommBase::trans_b; + using CocCommBase::is_int8; using CocCommBase::is_91093; using CocCommBase::p_value; using CocCommBase::aiv_idx; @@ -594,6 +475,7 @@ public: using CocCommBase::flag_offset; int32_t cal_count; int32_t m_per_rank; + int32_t m_in_rank; int32_t total_core_idx; Preprocessor preprocessor; Postprocessor postprocessor; @@ -601,7 +483,7 @@ public: SerialDequantRunner serial_dequant_runner; FusedDequantRunner fused_dequant_runner; FusedPerTokenDequantRunner fused_pertoken_dequant_runner; - SerialPerTokenDequantRunner serial_pertokrn_dequant_runner; + SerialPerTokenDequantRunner serial_pertoken_dequant_runner; bool need_dequant; }; @@ -637,22 +519,22 @@ FORCE_INLINE_AICORE void RunAllReduceAlign16(int32_t tiling_key, COC_ARGS_FUN(T) template FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) { - AllReduce allreduce_align_16_without_bias; - AllReduce allreduce_align_16_with_bias; + AllReduce allreduce_unalign_16_without_bias; + AllReduce allreduce_unalign_16_with_bias; switch (tiling_key) { case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : - allreduce_align_16_without_bias.SetArgs(COC_ARGS_CALL()); - allreduce_align_16_without_bias.Run(); + allreduce_unalign_16_without_bias.SetArgs(COC_ARGS_CALL()); + allreduce_unalign_16_without_bias.Run(); break; case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : - allreduce_align_16_with_bias.SetArgs(COC_ARGS_CALL()); - 
allreduce_align_16_with_bias.Run(); + allreduce_unalign_16_with_bias.SetArgs(COC_ARGS_CALL()); + allreduce_unalign_16_with_bias.Run(); break; default : break; @@ -660,19 +542,19 @@ FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN( } template -inline __aicore__ void CocMatmullReduceAiv(COC_ARGS_FUN(T)) +inline __aicore__ void CocMatmulAllReduceAiv(COC_ARGS_FUN(T)) { AllReduce allreduce_align_16_without_bias; AllReduce allreduce_align_16_with_bias; - AllReduce allreduce_align_16_without_bias; - AllReduce allreduce_align_16_with_bias; + AllReduce allreduce_unalign_16_without_bias; + AllReduce allreduce_unalign_16_with_bias; SetAtomicNone(); SetMaskNormImpl(); SetSyncBaseAddr((uint64_t)ffts_addr); SetVectorMask((uint64_t)-1, (uint64_t)-1); - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; int64_t batch_size = cocTilingData->batchSize; int32_t m = cocTilingData->m; diff --git a/comm/lcal/src/kernels/coc_comm_base.cce b/comm/lcal/src/kernels/coc_comm_base.cce index 2d4401d0..fea6333f 100644 --- a/comm/lcal/src/kernels/coc_comm_base.cce +++ b/comm/lcal/src/kernels/coc_comm_base.cce @@ -54,7 +54,7 @@ public: FORCE_INLINE_AICORE void SetFromParam(__gm__ uint8_t *para_gm) { - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; auto quantInfo = &para->quantInfo; auto twoDimTPInfo = &para->twoDimTPInfo; @@ -67,9 +67,9 @@ public: k0 = cocTilingData->k0; n0 = cocTilingData->n0; - m_loop = cocTilingData->m_loop; - k_loop = cocTilingData->k_loop; - n_loop = cocTilingData->n_loop; + m_loop = cocTilingData->mLoop; + k_loop = cocTilingData->kLoop; + n_loop = cocTilingData->nLoop; core_loop = cocTilingData->coreLoop; swizzl_count = cocTilingData->swizzlCount; @@ -95,7 +95,7 @@ public: core_count = comm_npu_split * comm_data_split; dequant_granularity = static_cast(quantInfo->dequantGranularity); dequant_group_size = quantInfo->dequantGroupSize; - quant_granularity = static_cast(quantInfo->quantGranularity) + quant_granularity = static_cast(quantInfo->quantGranularity); quant_group_size = quantInfo->quantGroupSize; swizzl_direct = (tiling_key & SWIZZL_MASK) ? true : false; trans_a = (tiling_key & TRANS_A_MASK) ? true : false; @@ -135,7 +135,7 @@ public: } workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, trans_a, trans_b, is_int8 ? 
1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param, - hasFormatDequantScale, is_deterministic, false, false, 0, 0, 0); + hasFormatDequantScale, is_deterministic); } @@ -146,7 +146,7 @@ public: FORCE_INLINE_AICORE void SetAndWaitAivSync(uint64_t flag_idx, int32_t pipe_depth = 2) { - FFTSCrossCoreSync(2, flag_idx + pipe_depth); + FFTSCrossCoreSync(0, flag_idx + pipe_depth); WaitEvent(flag_idx + pipe_depth); } @@ -186,7 +186,7 @@ public: SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); while (true) { - CopyUbufToGmAlignB16(ctrl_flags_UB, buff, 1, sizeof(int32_t), 0, 0); + CopyGmToUbufAlignB16(ctrl_flags_UB, buff, 1, sizeof(int32_t), 0, 0); SetFlag(EVENT_ID3); WaitFlag(EVENT_ID3); if (*ctrl_flags_UB == flag) { @@ -221,7 +221,7 @@ public: if (aiv_idx == 0 && core_idx == rank) { SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data); } else if (aiv_idx == 0 && core_idx < rank_size) { - CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx]] + flag_offset + flag_idx, + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, flag_data); } } @@ -230,9 +230,9 @@ public: { if (aiv_idx == 0 && core_idx < rank_size) { if (core_idx != rank) { - SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data); + SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data); } - CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, rank_size * flag_data); + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, rank_size * flag_data * rank_size); } } @@ -240,7 +240,7 @@ public: { for (int32_t idx = 0; idx < num_flags; ++idx) { if (core_idx == 0 && aiv_idx == 0) { - SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + idx, f0); + SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + idx, 0); } } } @@ -271,13 +271,13 @@ public: } } - FORCE_INLINE_AICORE void CopyUbufToGmTransLayout(__ubuf__ T* ub_buff_st, int32_t actual_move_size, int64_t move_num_offset) { + FORCE_INLINE_AICORE void CopyUbToGmTransLayout(__ubuf__ T* ub_buff_st, int32_t actual_move_size, int64_t move_num_offset) { auto ub_buff = ub_buff_st; int32_t left_m = actual_move_size / n0; while (left_m > 0) { int32_t loop_idx = move_num_offset / (m0 * n0); int64_t m_idx, n_idx; - GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzlCount, m_idx, n_idx); + GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); int32_t actual_m = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; int32_t actual_n = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; int32_t m_offset = (move_num_offset % (m0 * n0)) / n0; @@ -301,7 +301,7 @@ public: for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { uint32_t data_size = interm_offset + max_ub_ping_pong_size < copy_size ? max_ub_ping_pong_size : copy_size - interm_offset; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; - auto ub = (m_idx & 1) ? ub0 : ub1; + auto ub = (move_idx & 1) ? 
ub0 : ub1; WaitFlag(event_id); CopyGmToUbuf(ub, gm_src + interm_offset, 1, data_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); @@ -369,7 +369,7 @@ public: } if (rank_size == 8) { CopyGmToGm(gm_reducebuf + 1 * len_per_loop, buff[rank] + core_buff_offset, data_size_remain); - CopyGmToGm(gm_reducebuf + 1 * len_per_loop, gm_reducebuf, data_size_remain); + CopyGmToGm(gm_reducebuf + 2 * len_per_loop, gm_reducebuf, data_size_remain); } if (rank_size >= 4) { CopyGmToGm(gm_reducebuf, buff[rank] + core_buff_offset, data_size_remain); @@ -392,7 +392,7 @@ public: WaitFlag(EVENT_ID1); if (atomic_add) { SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); + WaitFlag(EVENT_ID1); SetAtomicNone(); PipeBarrier(); } @@ -415,7 +415,7 @@ public: int32_t m_loop; int32_t n_loop; int32_t k_loop; - int32_t coreLoop; + int32_t core_loop; int32_t core_idx; int32_t real_core_idx; @@ -425,8 +425,8 @@ public: int32_t flag_offset; int32_t tiling_key; - int32_t swizzlCount; - int32_t swizzl_direct; + int32_t swizzl_count; + bool swizzl_direct; bool trans_a; bool trans_b; bool is_int8; diff --git a/comm/lcal/src/kernels/coc_const_args.cce b/comm/lcal/src/kernels/coc_const_args.cce index ea09aec4..2473f80c 100644 --- a/comm/lcal/src/kernels/coc_const_args.cce +++ b/comm/lcal/src/kernels/coc_const_args.cce @@ -48,9 +48,9 @@ constexpr int32_t BLOCK_COUNT_3 = 3; constexpr int32_t BLOCK_COUNT_4 = 4; constexpr int32_t L0AB_PINGPONG_BUFFER_LEN = 16384; constexpr int32_t CUBE_MATRIX_SIZE = 256; -constexpr int32_t L1_PINGPONG_BUFFER_LEN = 131072; +constexpr int64_t L1_PINGPONG_BUFFER_LEN = 131072; constexpr int32_t MAX_CORE_NUM = 25; -constexpr int32_t MAX_UB_BUFF = 196608; +constexpr int64_t MAX_UB_BUFF = 196608; constexpr int32_t ADD_REPEAT_TIME = 4; constexpr int32_t FLAG_ZERO_IDX = 0; constexpr int32_t FLAG_ONE_IDX = 1; @@ -83,7 +83,7 @@ enum QuantGranularity : int { PER_TOKEN = 3, FLOAT32_SCALE_PER_CHANNEL = 4, QUANT_GRANULARITY_MAX = 5, -} +}; template struct BaseBlock { @@ -120,7 +120,7 @@ using Block256B = BaseBlock; template using Block512B = BaseBlock; -template +template struct CoCCommArgs { int rank; int localRank; @@ -131,4 +131,4 @@ struct CoCCommArgs { int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE]; }; -#endif +#endif // LCAL_COC_CONST_ARGS_H diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/comm/lcal/src/kernels/coc_dequant_runner.cce index 5cc13b2d..053fc092 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -63,7 +63,7 @@ public: PipeBarrier(); Vmuls(ub_muls, ub_adds_f32, scale, repeat, 1, 1, 8, 8); PipeBarrier(); - Waitflag(event_id); + WaitFlag(event_id); Vconv(ub_out, ub_muls, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(event_id); @@ -622,7 +622,7 @@ public: inline __aicore__ void SetArgs(__gm__ T *gm_out, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0) { - this->gm_out = reinterpret_cast<__gm__ t *>(gm_out); + this->gm_out = reinterpret_cast<__gm__ T *>(gm_out); this->gm_dequant_scale_pertoken = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale_pertoken); this->m = m; this->n = n; @@ -679,7 +679,7 @@ public: this-> rank_size = rank_size; } - inline __aicore__ void RunDequantAllreduce(int32_t cal_idx) + inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) { switch (dequant_granularity) { case QuantGranularity:: PER_TENSOR : diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/comm/lcal/src/kernels/coc_postprocessor.cce index 154c91b1..526a1d3b 100644 --- 
a/comm/lcal/src/kernels/coc_postprocessor.cce +++ b/comm/lcal/src/kernels/coc_postprocessor.cce @@ -173,7 +173,7 @@ public: FORCE_INLINE_AICORE void SetArgs(PP_MATMUL_AIV_POST_ARGS_FUN()) { - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; this->with_rms_norm = para->postInfo.withRmsNorm; if (this->with_rms_norm) { diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index f344efee..4666e809 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -49,7 +49,7 @@ inline __aicore__ void CopyGmToCbuf(__cbuf__ T *dst, __gm__ T *src, uint8_t sid, { DataCopyParams intriParams(nBurst, lenBurst, srcStride, dstStride); GlobalTensor srcTensor; - srcTensor.SetGloalBuffer(src); + srcTensor.SetGlobalBuffer(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); uint8_t logicpos = static_cast(TPosition::C1); LocalTensor dstTensor; @@ -159,7 +159,7 @@ inline __aicore__ void LoadCbufToCb(__cb__ T *dst, __cbuf__ T *src, uint16_t bas template struct IntrinsicCopyGmToL1Nd2Nz { static inline __aicore__ void move( - __cbuf__ T *dst, __gm__ T *src + __cbuf__ T *dst, __gm__ T *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, uint16_t dstNzNStride, uint16_t dstNzMatrixStride){ @@ -173,7 +173,7 @@ struct IntrinsicCopyGmToL1Nd2Nz { LocalTensor dstTensor; dstTensor = CreateLocalTensor(dst_buffer_offset, dst_logicpos); GlobalTensor srcTensor; - srcTensor.SetGloalBuffer(src); + srcTensor.SetGlobalBuffer(src); DataCopy(dstTensor, srcTensor, nd2nzParams); } }; @@ -182,7 +182,7 @@ template struct CopyGmToL1Nd2zN { static inline __aicore__ void move( __cbuf__ T *dst, __gm__ T *src, - uint16_t nValue, uint16_t dValue, uint16_t srcDValue, uint16_t dstNzC0Stride) { + uint16_t nValue, uint16_t dValue, uint32_t srcDValue, uint16_t dstNzC0Stride) { constexpr int BLOCK_LEN = 32 / sizeof(T); if (srcDValue < ND2NZ_STRIDE_LIMIT) { IntrinsicCopyGmToL1Nd2Nz::move( @@ -268,7 +268,7 @@ public: workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, TA, TB, sizeof(MmadDtype), has_a_align, has_b_align, accum_rank_size, has_accum, 0, has_dequant_param, - hasFormatDequantScale, is_deterministic, false, false, 0, 0, 0); + hasFormatDequantScale, is_deterministic); gm_a_src = reinterpret_cast<__gm__ MmadDtype *>(has_a_align ? workspace_info.gm_a_align : gm_a); gm_b_src = reinterpret_cast<__gm__ MmadDtype *>(has_b_align ? workspace_info.gm_b_align : gm_b); @@ -280,7 +280,7 @@ public: L1_PINGPONG_BUFFER_LEN = ((m0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size + (n0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size * (IS_INT8 ? 
2 : 1)); - L0AB_PINGPONG_BUFFER_LEN = L0AB_PINGPONG_BUFFER_LEN / sizeof(MmadDtype); + L0AB_PINGPONG_BUFFER_LEN = L0AB_PINGPONG_BUFFER_SIZE / sizeof(MmadDtype); int32_t a_l1_size = m0 * k0 * sizeof(MmadDtype); int32_t a_l1_size_round = DivCeil(a_l1_size, 512) * 512; @@ -330,9 +330,9 @@ public: m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; } if (TB) { - n_round = DivCeil(n_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; - } else { n_round = DivCeil(n_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } else { + n_round = DivCeil(n_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; } } else { m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; @@ -500,12 +500,12 @@ public: if (weight_nz) { offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 * n_align16 + n_idx * n0 * block_size; } else { - offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 + n_idx * n0; + offset_b_next = batch_idx * n * k + n_idx * n0 * k + (k_idx + 1) * k0; } } } else { if (aligned_b == 1) { - offset_b_next = batch_idx * k * n_align + n_idx * n0 + (k_idx + 1) * k0 * n_align; + offset_b_next = batch_idx * k * n_align + n_idx * n0 + (k_idx + 1) * k0 * n_align + n_idx * n0; } else { if (weight_nz) { offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * block_size + n_idx * n0 * k_align16; @@ -525,7 +525,7 @@ public: auto gm_src_a = gm_a_src_tmp + offset_a_next; auto gm_src_b = gm_b_src + offset_b_next; - WaitFlag(event_id_next); + WaitFlag(event_id_next); if (m == 1 || m_actual == 1 && !TA) { CopyGmToCbuf( l1_buf_a_next, @@ -632,7 +632,7 @@ public: l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_16 + i * k_round * BLOCK_SIZE_16, 0, - k0_round, + k0_round /BLOCK_SIZE_16, 1, 0, 0, @@ -642,7 +642,7 @@ public: } } } else { - for (int i = 0; i < m_round / BLOCK_SIZE_16; i++) { + for (int32_t i = 0; i < k0_round / BLOCK_SIZE_16; i++) { LoadCbufToCa( l0a_buf + i * cube_matrix_size, l1_buf_a + k_part_idx * k_part_len * m_round + @@ -684,8 +684,8 @@ public: l0b_buf + i * ((n_actual + 15) / 16 * 16) * BLOCK_SIZE_32, l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_32) * BLOCK_SIZE_32, 0, - n_round, - k_round, + n_round / BLOCK_SIZE_32, + k_round / BLOCK_SIZE_32, 1, 0, 0, @@ -697,8 +697,8 @@ public: l0b_buf + i * n_round * BLOCK_SIZE_16, l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_16) * BLOCK_SIZE_16, 0, - n_round, - k_round, + n_round / BLOCK_SIZE_16, + k_round / BLOCK_SIZE_16, 0, 0, true, @@ -818,7 +818,7 @@ public: } inline __aicore__ void MoveL0CToGM(__gm__ OutDtype *gm_dst, int64_t offset_c, int32_t m_actual, int32_t n_actual, int32_t src_stride, int32_t dst_stride) { - #if (__CCE__AICORE__ == 220) + #if (__CCE_AICORE__ == 220) FixpipeParamsV220 FixpipeParams( n_actual, m_actual, @@ -827,11 +827,11 @@ public: false ); #elif (defined(__DAV_C310__)) - FixpipeParamsV310 FixpipeParams( + FixpipeParamsC310 FixpipeParams( n_actual, m_actual, src_stride, - dst_stride, + dst_stride ); #endif uint64_t src_addr = reinterpret_cast(l0c_buf); @@ -846,7 +846,7 @@ public: FixpipeParams.quantPre = VDEQF16; Fixpipe(dstTensor, srcTensor, FixpipeParams); SetFlag(EVENT_ID0); - } else if (QuantGranularity::PER_TENSOR) { + } else if (dequant_granularity == QuantGranularity::PER_TENSOR) { FixpipeParams.quantPre = DEQF16; FixpipeParams.deqScalar = gm_dequant_scale[0]; Fixpipe(dstTensor, srcTensor, FixpipeParams); @@ -975,7 +975,7 @@ public: int64_t gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; int32_t m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); int64_t m_per_bigdim = m * ag_dim / twod_big_dim; - int32_t 
comm_count = DivCeil(batch_size, * m_loop_per_bigdim, p_value); + int32_t comm_count = DivCeil(batch_size * m_loop_per_bigdim, p_value); int32_t loop_num_per_cal = p_value * n_loop * twod_big_dim; int32_t ag_part_dim = twod_big_dim / ag_dim; int32_t rs_part_dim = twod_big_dim / rs_dim; @@ -1004,18 +1004,18 @@ public: int64_t m_idx_in_c = comm_idx * p_value + m_idx_in_rank; int32_t m_actual = (m_idx_in_c == (m_loop_per_bigdim - 1)) ? (m_per_bigdim - m_idx_in_c * m0) : m0; int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; - int64_t bigdim_dix = m_idx / actual_p_value; - - int32_t ag_src_idx = bigdim_dix / ag_part_dim; - int32_t ag_part_idx = bigdim_dix % ag_part_dim; - int32_t rs_dst_idx = bigdim_dix / rs_part_dim; - int32_t rs_part_idx = bigdim_dix % rs_part_dim; + int64_t bigdim_idx = m_idx / actual_p_value; + + int32_t ag_src_idx = bigdim_idx / ag_part_dim; + int32_t ag_part_idx = bigdim_idx % ag_part_dim; + int32_t rs_dst_idx = bigdim_idx / rs_part_dim; + int32_t rs_part_idx = bigdim_idx % rs_part_dim; __gm__ MmadDtype *gm_mem_st; if (ag_src_idx != ag_rank_idx) { gm_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) + (comm_idx % MAX_BLOCK_COUNT) * gm_a_pingpong_size - + bigdim_dix * p_value * m0 * k_align; + + bigdim_idx * p_value * m0 * k_align; } else { gm_mem_st = gm_a_src + (comm_idx * p_value) * m0 * k_align + ag_part_idx * m_per_bigdim * k_align; } @@ -1025,7 +1025,7 @@ public: WaitFlag(EVENT_ID0); int64_t offset_c; - int32_t n_stride; + int64_t dst_stride; __gm__ OutDtype *gm_dst = nullptr; if (rs_dst_idx != rs_rank_idx) { @@ -1068,7 +1068,7 @@ protected: __gm__ MmadDtype *gm_b_src{nullptr}; __gm__ OutDtype *gm_c{nullptr}; - __gm__ OutDtype *gm_peer_mem(nullptr); + __gm__ OutDtype *gm_peer_mem{nullptr}; __gm__ int64_t *gm_dequant_scale{nullptr}; __gm__ int32_t *gm_format_dequant_offset{nullptr}; __gm__ int32_t *gm_accum{nullptr}; @@ -1096,7 +1096,7 @@ protected: int32_t k; int32_t n; int32_t m_align; - int32_t k_align; + int64_t k_align; int32_t n_align; int32_t k_align16; int32_t n_align16; @@ -1140,4 +1140,12 @@ protected: QuantGranularity dequant_granularity; }; +#elif __DAV_C220_VEC__ + +#include "coc_preprocessor.cce" +#include "coc_add_bias_runner.cce" +#include "coc_dequant_runner.cce" +#include "tiling_args.h" + +#endif #endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce index 7bca03ca..1de57637 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce @@ -56,7 +56,7 @@ inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) { set_nd_para(config); SetSyncBaseAddr((uint64_t)ffts_addr); - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParm *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; auto quantInfo = &para->quantInfo; auto twoDimTPInfo = &para->twoDimTPInfo; @@ -71,9 +71,9 @@ inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) { int32_t k0 = cocTilingData->k0; int32_t n0 = cocTilingData->n0; - int32_t m_loop = cocTilingData->m_loop; - int32_t k_loop = cocTilingData->k_loop; - int32_t n_loop = cocTilingData->n_loop; + int32_t m_loop = cocTilingData->mLoop; + int32_t k_loop = cocTilingData->kLoop; + int32_t n_loop = cocTilingData->nLoop; int32_t core_loop = cocTilingData->coreLoop; int32_t swizzl_count = cocTilingData->swizzlCount; @@ -89,7 +89,7 @@ inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) { 
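The hunk below, like the RunAllReduceAlign16/RunAllReduceUnAlign16 switches earlier, decodes a packed tiling key by testing individual single-bit masks. A minimal, self-contained sketch of that decode pattern follows; the mask values and the BIAS_MASK name here are assumptions for illustration only, since the real SWIZZL_MASK, TRANS_A_MASK, TRANS_B_MASK and INT8_MASK constants are defined in coc_const_args.cce and may sit at different bit positions:

    #include <cstdint>
    #include <cstdio>

    // Assumed single-bit masks; the actual bit positions come from coc_const_args.cce.
    constexpr int32_t INT8_MASK    = 1 << 0;
    constexpr int32_t BIAS_MASK    = 1 << 1;  // hypothetical name for the with-bias bit
    constexpr int32_t TRANS_B_MASK = 1 << 2;
    constexpr int32_t TRANS_A_MASK = 1 << 3;
    constexpr int32_t SWIZZL_MASK  = 1 << 5;

    int main() {
        int32_t tiling_key = 0b101000;  // one combination of option bits
        bool swizzl_direct = (tiling_key & SWIZZL_MASK) != 0;
        bool trans_a = (tiling_key & TRANS_A_MASK) != 0;
        bool trans_b = (tiling_key & TRANS_B_MASK) != 0;
        bool is_int8 = (tiling_key & INT8_MASK) != 0;
        std::printf("swizzl=%d transA=%d transB=%d int8=%d\n",
                    swizzl_direct, trans_a, trans_b, is_int8);
        return 0;
    }

Each case label in the tiling-key switches is one combination of these bits, which is why the labels appear in blocks of sixteen binary constants.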
bool is_int8 = (tiling_key & INT8_MASK) != 0; QuantGranularity dequant_granularity = static_cast(quantInfo->dequantGranularity); int32_t dequant_group_size = quantInfo->dequantGroupSize; - quantGranularity quant_granularity = static_cast(quantInfo->quantGranularity); + QuantGranularity quant_granularity = static_cast(quantInfo->quantGranularity); int32_t quant_group_size = quantInfo->quantGroupSize; __gm__ TData* gm_peer_mem = buff[rank]; __gm__ TData* gm_c = gm_out; diff --git a/comm/lcal/src/kernels/coc_preprocessor.cce b/comm/lcal/src/kernels/coc_preprocessor.cce index 98a8333f..cac13c20 100644 --- a/comm/lcal/src/kernels/coc_preprocessor.cce +++ b/comm/lcal/src/kernels/coc_preprocessor.cce @@ -1,3 +1,12 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ #ifndef __COC_PREPROCESSOR__ #define __COC_PREPROCESSOR__ @@ -2592,7 +2601,7 @@ public: } LcalWorkspaceInfo workspace_info = GetLcalWorkSpaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param, - hasFormatDequantScale,is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, m * EP * TP); + hasFormatDequantScale,is_deterministic); if (this->is_int8) { switch (this->dequant_granularity) { diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index 2dd022e5..c9005dff 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -1,3 +1,12 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ #include #include #include @@ -155,7 +164,7 @@ int Lcoc::SetParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDes SetLcocParam(lcalType, paramDesc); CoCTilingFunc *pTilingFunc = CreateCoCTilingFunc(lcalType); if (pTilingFunc == nullptr) { - PrintErrorLog(lcalType, "Create CoCTilingFunc Failed!"); + PrintErrorLog(lcalType, "Create CoCTilingFunc failed!"); return LCAL_ERROR_INTERNAL; } CoCTilingData tilingData = pTilingFunc->GenerateTiling(taskParam_, tiling); @@ -217,7 +226,8 @@ bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg return true; } -int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) +int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, + aclrtStream stream) { LcalType lcalType = LcalType::MATMUL_ALL_REDUCE; if (!CheckBasic(inputPkg, outputPkg, lcalType)) { @@ -259,13 +269,12 @@ bool IsMatrixAligned(const int64_t &m, const int64_t &n, const bool &transpose, int64_t Lcoc::GetWorkspaceSize() { - LcalType lcaltype = taskParam_.lcalType; + LcalType lcalType = taskParam_.lcalType; auto cocParamDesc = taskParam_.cocParamDesc; bool isDeterministic = (GetComm()->GetCommArgs()->extraFlag & ExtraFlag::DETERMINISTIC) != 0; CoCDataTypeDesc dataType = cocParamDesc.dataTypeDesc; const MatMulInfo &mmInfo = cocParamDesc.mmInfo; const QuantInfo &quantInfo = cocParamDesc.quantInfo; - const MoeInfo& moeInfo = cocParamDesc.moeInfo; bool hasQuant = quantInfo.quantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED; bool hasDequant = quantInfo.dequantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED; int32_t eleSize = COC_TYPE2ELE_SIZE.at(dataType); @@ -273,7 +282,6 @@ int64_t Lcoc::GetWorkspaceSize() int32_t mAlign = AlignUp(mmInfo.m, nElemAlign); int32_t nAlign = AlignUp(mmInfo.n, nElemAlign); int32_t kAlign = AlignUp(mmInfo.k, nElemAlign); - int32_t maxOutputSize = moeInfo.maxOutputSize; bool hasAAlign = hasQuant || (!IsMatrixAligned(mmInfo.m, mmInfo.k, mmInfo.transA, nElemAlign) && mmInfo.m != 1); @@ -282,20 +290,16 @@ int64_t Lcoc::GetWorkspaceSize() int32_t accumRankSize = 0; - bool hasAccum = dataType == COCDataTypeDesc::INT8INT8_INT32_BF16; + bool hasAccum = dataType == CoCDataTypeDesc::INT8INT8_INT32_BF16; bool hasDequantParam = (quantInfo.dequantGranularity == QuantGranularity::PER_TOKEN || quantInfo.dequantGranularity == QuantGranularity::PER_TENSOR); bool hasFormatDequantScale = (quantInfo.dequantGranularity == QuantGranularity::PER_CHANNEL); - bool isMoe = false; - - bool isAlltoallVc = false; uint64_t dequantWorkSpaceSize = GetDequantWorkSpaceSize(lcalType, tiling_.withSerialMode, mmInfo.m, mmInfo.n, - tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim, maxOutputSize); + tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim); LcalWorkspaceInfo lcalWorkSpaceInfo = GetLcalWorkspaceInfo(0, mmInfo.batchSize, mmInfo.m, mmInfo.k, mmInfo.n, mAlign, kAlign, nAlign, mmInfo.transA, mmInfo.transB, eleSize, hasAAlign, hasBAlign, - accumRankSize, hasAccum, dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic, - isMoe, isAlltoallVc, moeInfo.EP, moeInfo.local_expert_nums, maxOutputSize); + accumRankSize, hasAccum, dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic); MKI_LOG(DEBUG) << "[Lcoc Workspace]: " << "m=" << mmInfo.m << ",k=" << mmInfo.k << ", n=" << mmInfo.n << ", mAlign=" << mAlign << ", kAlign=" 
<< kAlign << ", nAlign=" << nAlign << ", transA=" << mmInfo.transA @@ -303,10 +307,8 @@ int64_t Lcoc::GetWorkspaceSize() << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize << ", hasAccum=" << hasAccum << ", dequantWorkSpaceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic - << ", isMoe=" << isMoe << ", isAlltoallVc=" << isAlltoallVc << ", moeInfo.EP=" << static_cast(moeInfo.EP) - << ", moeInfo.local_expert_nums=" << moeInfo.local_expert_nums - << ", maxOutputSize=" << maxOutputSize << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize; - return lcalWorkSpaceInfo.workspaceSize; + + return lcalWorkspaceInfo.workspaceSize; } } -- Gitee From a1f0ca1838f4a6d0eaab82475d16d26bb40b2415 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 10:53:40 +0800 Subject: [PATCH 375/414] cmakelist fix --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 328b61a6..02cf7dac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,8 +116,10 @@ if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() -set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") +set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_{cxx_abi}/lcal") add_subdirectory(comm/lcal) +set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_ +{cxx_abi}") message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) -- Gitee From acdaf1c4cc4eacd9a93dbdd7db31c0da477bdabc Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 10:54:42 +0800 Subject: [PATCH 376/414] test case --- tests/apitest/opstest/csv/linear_parallel.csv | 43 +------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index ae51e1c4..443a1392 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -41,15 +41,6 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 40|llama_65bCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 41|llama_65bCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 42|llama_65bCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -43|NoErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
-44|NoErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|28,2,1024;8,1024|1|float16|nd|14,2,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -45|IErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,59;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||I:ERROR_INVALID_TENSOR_DIM -46|SErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM -47|NoErrorCase0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -48|NoErrorCase1AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,4,1024;8,1024|1|float16|nd|4,4,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -49|IErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,33;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM -50|SErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM -51|NoErrorCase0AllGatherLinearV2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"keepIntermediate":true}|2|float16;float16|nd;nd|2,16;32,16|2|float16;float16|nd;nd|4,32;4,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 52|NoErrorCase0MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;1,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR 53|NoErrorCase1MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|8,2;4,2;1,4;1,4|1|bf16|nd|8,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 54|NoErrorCase2MatmulAllReduceDequantWithoutBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;0;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR @@ -57,28 +48,13 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 
56|NoErrorCase4MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 57|NoErrorCase5MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|bf16;int8;bf16;bf16;bf16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|bf16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR 58|NoErrorCase6MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|float16;int8;float16;float16;float16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR -59|NoErrorCase0PureMatmul|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -60|NoErrorCase1PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR -61|NoErrorCase2PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR -62|DimCheckFailPureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1,4;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM -63|PureMatmulW8A8Fp16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -64|PureMatmulW8A8Bf16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -65|PureMatmulW8A8Fp16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 
-66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM -68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM 69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
-78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE +780|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 83|NoErrorCase1AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|2048,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR @@ -86,20 +62,3 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 85|NoErrorCase3AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|512,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 86|IErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":1}}|2|float16;float16|nd;nd|32,16;32,20|1|float16|nd|16,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 87|SErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|1024,32|customize;customize|-1,1;-1,1||||||Ascend910B|S:ERROR_INVALID_TENSOR_DIM -88|AllGatherMatmulInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 
-89|AllGatherMatmulInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -90|MatmulReducescatterInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -91|MatmulReducescatterInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -92|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -93|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -94|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -95|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -96|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM 
-97|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -98|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -99|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -100|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,48;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -101|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|32768,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM 
-104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From ab2a3cb392504c6e2eccf97edbfcd558a9337a5b Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 11:00:58 +0800 Subject: [PATCH 377/414] 4 --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02cf7dac..dc8d623e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,8 +118,7 @@ endif() set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}/lcal") add_subdirectory(comm/lcal) -set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_ -${cxx_abi}") +set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) -- Gitee From f5883a0d06de11c33f03224ed79bf7b0664c1897 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 11:02:30 +0800 Subject: [PATCH 378/414] 5 --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc8d623e..dfaffefb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,6 +124,8 @@ message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) +install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal.so DESTINATION lib) +install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal_static.a DESTINATION lib) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libasdops_aicpu_kernels.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libtbe_adapter.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libcann_ops_adapter.so DESTINATION lib OPTIONAL) -- Gitee From bba3b679186715af0962c014e71e672e497f96e8 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 11:49:33 +0800 Subject: [PATCH 379/414] fix --- comm/lcal/include/lcal_types.h | 2 +- comm/lcal/include/lcoc/lcoc.h | 2 +- comm/lcal/include/lcoc/tiling/tiling.h | 6 +-- comm/lcal/include/lcoc/tiling/tiling_91093.h | 2 +- comm/lcal/src/coc_kernel_args.cpp | 40 +++++++++---------- comm/lcal/src/coc_kernel_args.h | 8 ++-- comm/lcal/src/kernels/coc_add_bias_runner.cce | 2 +- .../kernels/coc_allgather_reducescatter.cce | 32 +++++++-------- comm/lcal/src/kernels/coc_allreduce.cce | 8 ++-- comm/lcal/src/kernels/coc_comm_base.cce | 20 +++++----- comm/lcal/src/kernels/coc_postprocessor.cce | 4 +- comm/lcal/src/kernels/coc_ppmatmul.cce | 16 ++++---- comm/lcal/src/kernels/coc_preprocessor.cce | 8 ++-- comm/lcal/src/lcoc.cpp | 1 + comm/lcal/src/lcoc_func.cpp | 4 +- 15 files changed, 76 insertions(+), 79 deletions(-) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index d67aad78..69f93b0e 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -93,7 +93,7 @@ const std::map LCAL_TYPE2NAME = { { LcalType::ALL2ALL_V_C, "LcalAll2AllVC" }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER, 
"LcalAllGatherMatmulReduceScatter" }, { LcalType::BANDWIDTH, "LcalBandwidthTest" }, - { LcalType::LOCAL_REDUCE, "LcalLcalReduce" }, + { LcalType::LOCAL_REDUCE, "LcalLoalReduce" }, { LcalType::GATHER, "LcalGather" }, { LcalType::SEND, "LcalSend" }, { LcalType::RECV, "LcalRecv" } diff --git a/comm/lcal/include/lcoc/lcoc.h b/comm/lcal/include/lcoc/lcoc.h index e5120735..95adabc5 100644 --- a/comm/lcal/include/lcoc/lcoc.h +++ b/comm/lcal/include/lcoc/lcoc.h @@ -45,4 +45,4 @@ private: bool tilingSuccess_ = false; }; } -#endif \ No newline at end of file +#endif // LCAL_LCOC_H \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/comm/lcal/include/lcoc/tiling/tiling.h index 66ec9da5..65c8d8c2 100644 --- a/comm/lcal/include/lcoc/tiling/tiling.h +++ b/comm/lcal/include/lcoc/tiling/tiling.h @@ -35,12 +35,12 @@ protected: class CoCMatmulAllReduceTilingFunc : public CoCTilingFunc { public: - CoCMatmulALlReduceTilingFunc(const CoCMatmulAllReduceTilingFunc &) = delete; + CoCMatmulAllReduceTilingFunc(const CoCMatmulAllReduceTilingFunc &) = delete; CoCMatmulAllReduceTilingFunc &operator = (const CoCMatmulAllReduceTilingFunc &) = delete; CoCMatmulAllReduceTilingFunc() {} bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; -} +}; class CoCMatmulAllReduceDeterTilingFunc : public CoCMatmulAllReduceTilingFunc { public: @@ -49,7 +49,7 @@ public: CoCMatmulAllReduceDeterTilingFunc() {} bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; -} +}; class CoCAllgatherMatmulReduceScatterTilingFunc : public CoCTilingFunc { public: diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/comm/lcal/include/lcoc/tiling/tiling_91093.h index e3848d06..9e3764c8 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_91093.h +++ b/comm/lcal/include/lcoc/tiling/tiling_91093.h @@ -19,4 +19,4 @@ namespace Lcal { void CoCAllgatherMatmulReduceScatterAgEightRsTwoTiling(CoCTilingData &cocTilingData); void CoCAllgatherMatmulReduceScatterDefaultTiling(CoCTilingData &cocTilingData, int32_t rsDim); } -#endif \ No newline at end of file +#endif // LCAL_TILING_91093_H \ No newline at end of file diff --git a/comm/lcal/src/coc_kernel_args.cpp b/comm/lcal/src/coc_kernel_args.cpp index fc4b60e0..ff307add 100644 --- a/comm/lcal/src/coc_kernel_args.cpp +++ b/comm/lcal/src/coc_kernel_args.cpp @@ -17,18 +17,18 @@ using namespace Mki; namespace Lcal { -int CoCkernelArgs::SetFFTSAddr() +int CoCKernelArgs::SetFFTSAddr() { uint32_t fftsLen; - int MkiRtGetC2cCtrlAddr(&fftsAddr, &fftsLen); + int error = MkiRtGetC2cCtrlAddr(&fftsAddr, &fftsLen); if (error != MKIRT_SUCCESS) { - MKT_LOG(ERROR) << "MkiRtGetC2cCtrlAddr err"; + MKI_LOG(ERROR) << "MkiRtGetC2cCtrlAddr err"; return LCAL_ERROR_MKIRT; } return LCAL_SUCCESS; } -void CoCkernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg) +void CoCKernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg) { matrixA = inputPkg.matrixA; matrixB = inputPkg.matrixB; @@ -40,43 +40,43 @@ void CoCkernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg) quantOffset = inputPkg.quantOffset; } -void CoCkernelArgs::SetOutputPkgArgs(CoCOutputPkg &outputPkg) +void CoCKernelArgs::SetOutputPkgArgs(CoCOutputPkg &outputPkg) { output = outputPkg.output; - minOutput = outputPkg.midOutput; + midOutput = outputPkg.midOutput; } -void CoCkernelArgs::SetWorkspacePtrArg(void *workspacePtr) +void CoCKernelArgs::SetWorkspacePtrArg(void *workspacePtr) { workspace = workspacePtr; } -void 
CoCkernelArgs::SetParamDescArgs(const CoCParamDesc &paramDesc) +void CoCKernelArgs::SetParamDescArgs(const CoCParamDesc &paramDesc) { - cockernelParm.quantInfo = paramDesc.quantInfo; - cockernelParm.twoDimTPInfo = paramDesc.twoDimTPInfo; - cockernelParm.postInfo = paramDesc.postInfo; - cockernelParm.weightNz = paramDesc.mmInfo.weightNz; + cocKernelParam.quantInfo = paramDesc.quantInfo; + cocKernelParam.twoDimTPInfo = paramDesc.twoDimTPInfo; + cocKernelParam.postInfo = paramDesc.postInfo; + cocKernelParam.weightNz = paramDesc.mmInfo.weightNz; } -void CoCkernelArgs::SetCommArgs(const LcalComm &comm) +void CoCKernelArgs::SetCommArgs(const LcalComm &comm) { commArgsPtr = comm.GetCommArgsPtr(); } -void CoCkernelArgs::SetCoCTilingDataArgs(const CoCTilingData &tilingData) +void CoCKernelArgs::SetCoCTilingDataArgs(const CoCTilingData &tilingData) { - pCocTiling = &(cockernelParm.cocTilingData); - cockernelParm.cocTilingData = tilingData; + pCocTiling = &(cocKernelParam.cocTilingData); + cocKernelParam.cocTilingData = tilingData; } -std::string CoCkernelArgs::ParamToString() +std::string CoCKernelArgs::ParamToString() { std::string quantInfoString = "[QuantInfo]: dequantGranularity=" + - std::to_string(cockernelParm.quantInfo.dequantGranularity) + "\n"; + std::to_string(cocKernelParam.quantInfo.dequantGranularity) + "\n"; std::string weightNzInfoString = "[weightNz]: weightNz=" + - std::to_string(cockernelParm.weightNz) + "\n"; + std::to_string(cocKernelParam.weightNz) + "\n"; - std::string tilingInfoString = cockernelParm.cocTilingData.ToString(); + std::string tilingInfoString = cocKernelParam.cocTilingData.ToString(); return quantInfoString + weightNzInfoString + tilingInfoString; } } \ No newline at end of file diff --git a/comm/lcal/src/coc_kernel_args.h b/comm/lcal/src/coc_kernel_args.h index 873833fc..dc2e0c35 100644 --- a/comm/lcal/src/coc_kernel_args.h +++ b/comm/lcal/src/coc_kernel_args.h @@ -17,7 +17,7 @@ #include "lcoc_args.h" namespace Lcal { - struct CoCkernelArgs { + struct CoCKernelArgs { void *matrixA = nullptr; void *matrixB = nullptr; void *bias = nullptr; @@ -33,7 +33,7 @@ namespace Lcal { uint64_t fftsAddr = 0; CoCTilingData *pCocTiling = nullptr; - CoCkernelParm cockernelParm = {}; + CoCKernelParam cocKernelParam = {}; int SetFFTSAddr(); void SetInputPkgArgs(CoCInputPkg &inputPkg); void SetOutputPkgArgs(CoCOutputPkg &outputPkg); @@ -42,7 +42,7 @@ namespace Lcal { void SetCommArgs(const LcalComm &comm); void SetCoCTilingDataArgs(const CoCTilingData &tilingData); std::string ParamToString(); - } + }; } -#endif \ No newline at end of file +#endif // LCAL_COC_KERNEL_ARGS_H \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/comm/lcal/src/kernels/coc_add_bias_runner.cce index 066f7097..41a73836 100644 --- a/comm/lcal/src/kernels/coc_add_bias_runner.cce +++ b/comm/lcal/src/kernels/coc_add_bias_runner.cce @@ -302,7 +302,7 @@ public: inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) { m = m * rank_size; - base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL); + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); } inline void __aicore__ Run() diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce index 90dec967..4533096a 100644 --- a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce +++ b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce @@ -34,10 +34,10 @@ public: this->other_rank_ag_idx = other_rank % ag_dim; this->other_rank_rs_idx = other_rank / ag_dim; }else 
{ - this->rank_ag_idx = rank % rs_dim; - this->rank_rs_idx = rank / rs_dim; - this->other_rank_ag_idx = other_rank % rs_dim; - this->other_rank_rs_idx = other_rank / rs_dim; + this->rank_ag_idx = rank / rs_dim; + this->rank_rs_idx = rank % rs_dim; + this->other_rank_ag_idx = other_rank / rs_dim; + this->other_rank_rs_idx = other_rank % rs_dim; } twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim; @@ -52,7 +52,7 @@ public: ag_comm_npu_split = comm_npu_split; ag_comm_data_split = comm_data_split; ag_len_per_loop = len_per_loop; - ag_comm_direct = ag_comm_direct; + ag_comm_direct = comm_direct; rs_comm_npu_split = extra_comm_npu_split; rs_comm_data_split = extra_comm_data_split; @@ -61,7 +61,7 @@ public: ag_core_count = ag_comm_npu_split * ag_comm_data_split; rs_core_count = rs_comm_npu_split * rs_comm_data_split; - ag_max_ub_ping_pong_size = (max_ub_ping_pong_size / 2) / n0 * n0; + ag_max_ub_ping_pong_size = (max_ub_single_dma_size / 2) / n0 * n0; rs_max_ub_ping_pong_size = (extra_ub_move_num / 2) / n0 * n0; } @@ -70,7 +70,7 @@ public: auto ub1 = output_UB_T[1]; int32_t interm_offset = 0; for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { - uint32_t data_size = interm_offset + max_ub_ping_pong_size < copy_size ? ag_max_ub_ping_pong_size : copy_size - interm_offset; + uint32_t data_size = interm_offset + ag_max_ub_ping_pong_size < copy_size ? ag_max_ub_ping_pong_size : copy_size - interm_offset; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub = (move_idx & 1) ? ub0 : ub1; WaitFlag(event_id); @@ -105,11 +105,11 @@ public: int32_t real_rank; if (inner_dim_is_Ag) { real_rank = dst_rank + rank / ag_dim * ag_dim; - } esle { + } else { real_rank = dst_rank * rs_dim + rank % rs_dim; } if (real_rank != rank && dst_rank < ag_dim) { - CopyUbufToGM(buff[real_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0) + CopyUbufToGm(buff[real_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0) } dst_rank = (dst_rank + skip_num) % ag_dim; } @@ -169,7 +169,7 @@ public: return real_core_idx; } - FORCE_INLINE_AICORE void GetLenPerCore(int32_t rank_total, int32_t loop_index, int32_t &m_in_core, int32_t &rank_buff_offset) + FORCE_INLINE_AICORE void GetLenPerCore(int32_t rank_total, int32_t loop_index, int32_t &m_in_core, int32_t &buff_offset) { int32_t core_index = core_idx - ag_core_count; int32_t before_core_offset = rs_len_per_loop * rs_comm_data_split * loop_index; @@ -199,7 +199,7 @@ public: for (int32_t rank_idx = 0; rank_idx < rank_per_core; rank_idx++) { int32_t real_rank_idx_tmp = GetRealCoreIdx(rank_idx, rank_per_core); - int32_t real_core_idx; + int32_t real_rank_idx; if (inner_dim_is_Ag) { real_rank_idx = real_rank_idx_tmp * ag_dim + rank % ag_dim; } else { @@ -220,7 +220,7 @@ public: PipeBarrier(); } - FORCE_INLINE_AICORE void FirstStepInOut(int32_t mat_blocks_size, __gm__ T *input, int32_t gm_ofset, int32_t offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) { + FORCE_INLINE_AICORE void FirstStepInOut(int32_t mat_blocks_size, __gm__ T *input, int32_t gm_offset, int32_t offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) { int32_t ping_pong_move_count = DivCeil(mat_blocks_size, rs_max_ub_ping_pong_size); for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { int32_t actual_move_size = rs_max_ub_ping_pong_size; @@ -229,7 +229,7 @@ public: } auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub_buff_st = (move_idx & 1) ? 
output_UB_T[0] : output_UB_T[1]; - WaitFlag(event_id); + WaitFlag(event_id); CopyGmToUbuf(ub_buff_st, input + gm_offset + offset + move_idx * rs_max_ub_ping_pong_size, 1, actual_move_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); WaitFlag(event_id); @@ -362,15 +362,10 @@ public: using CocCommBase::trans_a; using CocCommBase::trans_b; using CocCommBase::is_int8; - using CocCommBase::is_91093; using CocCommBase::p_value; using CocCommBase::aiv_idx; using CocCommBase::other_rank; using CocCommBase::max_ub_single_dma_size; - using CocCommBase::withSerialMode; - using CocCommBase::tag; - using CocCommBase::loop_num_per_comm; - using CocCommBase::gm_c_pingpong_size; using CocCommBase::dequant_granularity; using CocCommBase::dequant_group_size; using CocCommBase::quant_granularity; @@ -378,6 +373,7 @@ public: using CocCommBase::workspace_info; using CocCommBase::ag_dim; using CocCommBase::rs_dim; + using CocCommBase::inner_dim_is_Ag; using CocCommBase::comm_npu_split; using CocCommBase::comm_data_split; using CocCommBase::comm_direct; diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/comm/lcal/src/kernels/coc_allreduce.cce index 07ca30a2..c787a6dd 100644 --- a/comm/lcal/src/kernels/coc_allreduce.cce +++ b/comm/lcal/src/kernels/coc_allreduce.cce @@ -44,7 +44,7 @@ public: if (dequant_granularity == QuantGranularity::PER_TOKEN) { fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]), reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, - m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size); + m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size); serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, m0, n0); } total_core_idx = aiv_idx * core_num + core_idx; @@ -147,7 +147,7 @@ public: SetFlag(event_id); WaitFlag(event_id); int64_t move_num_offset = other_rank_offset + move_idx * max_ub_ping_pong_size; - CopyUbufToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset + cal_idx * gm_c_pingpong_size); + CopyUbToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset + cal_idx * gm_c_pingpong_size); SetFlag(event_id); } } @@ -177,7 +177,7 @@ public: CopyGmToUbuf(ub, input + offset, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); WaitFlag(event_id); - CopyUbufToGM(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); + CopyUbufToGm(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); data_size_remain -= max_ub_ping_pong_size; offset += max_ub_ping_pong_size; @@ -190,7 +190,7 @@ public: SetFlag(EVENT_ID0); WaitFlag(EVENT_ID0); if (ALIGN) { - CopyUbufToGM(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); + CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); } else { CopyUbufToGmAlignB16(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T), 0, 0); } diff --git a/comm/lcal/src/kernels/coc_comm_base.cce b/comm/lcal/src/kernels/coc_comm_base.cce index fea6333f..789ea110 100644 --- a/comm/lcal/src/kernels/coc_comm_base.cce +++ b/comm/lcal/src/kernels/coc_comm_base.cce @@ -54,7 +54,7 @@ public: FORCE_INLINE_AICORE void SetFromParam(__gm__ uint8_t *para_gm) { - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; auto quantInfo = 
&para->quantInfo; auto twoDimTPInfo = &para->twoDimTPInfo; @@ -232,7 +232,7 @@ public: if (core_idx != rank) { SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data); } - CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, rank_size * flag_data * rank_size); + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, flag_data * rank_size); } } @@ -294,7 +294,7 @@ public: } } - FORCE_INLINE_AICORE void CopyGmToGm(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { + FORCE_INLINE_AICORE void CopyGMToGM(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { auto ub0 = output_UB_T[0]; auto ub1 = output_UB_T[1]; int32_t interm_offset = 0; @@ -330,7 +330,7 @@ public: if (is_91093 && (m_rank_idx % 2) != (rank % 2)) { continue; } - CopyGmToGm(buff[m_rank_idx] + core_buff_offset, buff[rank] + core_buff_offset, data_size_remain); + CopyGMToGM(buff[m_rank_idx] + core_buff_offset, buff[rank] + core_buff_offset, data_size_remain); } } @@ -364,15 +364,15 @@ public: gm_interm = buff[rank] + core_buff_offset; } auto gm_peer = buff[rank_idx_rot] + core_buff_offset; - CopyGmToGm(gm_peer, gm_interm, data_size_remain); + CopyGMToGM(gm_peer, gm_interm, data_size_remain); rank_idx++; } if (rank_size == 8) { - CopyGmToGm(gm_reducebuf + 1 * len_per_loop, buff[rank] + core_buff_offset, data_size_remain); - CopyGmToGm(gm_reducebuf + 2 * len_per_loop, gm_reducebuf, data_size_remain); + CopyGMToGM(gm_reducebuf + 1 * len_per_loop, buff[rank] + core_buff_offset, data_size_remain); + CopyGMToGM(gm_reducebuf + 2 * len_per_loop, gm_reducebuf, data_size_remain); } if (rank_size >= 4) { - CopyGmToGm(gm_reducebuf, buff[rank] + core_buff_offset, data_size_remain); + CopyGMToGM(gm_reducebuf, buff[rank] + core_buff_offset, data_size_remain); } } @@ -387,12 +387,12 @@ public: int32_t offset = 0; SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); - CopyGmToGm(input, output, data_size_remain); + CopyGMToGM(input, output, data_size_remain); WaitFlag(EVENT_ID0); WaitFlag(EVENT_ID1); if (atomic_add) { SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); SetAtomicNone(); PipeBarrier(); } diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/comm/lcal/src/kernels/coc_postprocessor.cce index 526a1d3b..faccedf4 100644 --- a/comm/lcal/src/kernels/coc_postprocessor.cce +++ b/comm/lcal/src/kernels/coc_postprocessor.cce @@ -173,8 +173,8 @@ public: FORCE_INLINE_AICORE void SetArgs(PP_MATMUL_AIV_POST_ARGS_FUN()) { - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); - auto cocTilingData = &para-.cocTilingData; + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = &para->cocTilingData; this->with_rms_norm = para->postInfo.withRmsNorm; if (this->with_rms_norm) { uint32_t m = cocTilingData->m; diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index 4666e809..2c4e2f44 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -78,7 +78,7 @@ inline __aicore__ void LoadCbufToCaTranspose(__ca__ T *dst, __cbuf__ T *src, uin addrmode ); uint32_t src_buffer_offset = reinterpret_cast(src); - uint32_t dst_buffer_offset = reinterpret_cast(dst>); + uint32_t dst_buffer_offset = reinterpret_cast(dst); uint8_t src_logicpos = static_cast(TPosition::C1); uint8_t dst_logicpos = static_cast(TPosition::A2); LocalTensor srcTensor; @@ -100,7 +100,7 @@ inline __aicore__ void LoadCbufToCbTranspose(__cb__ T *dst, __cbuf__ 
T *src, uin addrmode ); uint32_t src_buffer_offset = reinterpret_cast(src); - uint32_t dst_buffer_offset = reinterpret_cast(dst>); + uint32_t dst_buffer_offset = reinterpret_cast(dst); uint8_t src_logicpos = static_cast(TPosition::C1); uint8_t dst_logicpos = static_cast(TPosition::B2); LocalTensor srcTensor; @@ -123,7 +123,7 @@ inline __aicore__ void LoadCbufToCa(__ca__ T *dst, __cbuf__ T *src, uint16_t bas addr_cal_mode ); uint32_t src_buffer_offset = reinterpret_cast(src); - uint32_t dst_buffer_offset = reinterpret_cast(dst>); + uint32_t dst_buffer_offset = reinterpret_cast(dst); uint8_t src_logicpos = static_cast(TPosition::C1); uint8_t dst_logicpos = static_cast(TPosition::A2); LocalTensor srcTensor; @@ -146,7 +146,7 @@ inline __aicore__ void LoadCbufToCb(__cb__ T *dst, __cbuf__ T *src, uint16_t bas addr_cal_mode ); uint32_t src_buffer_offset = reinterpret_cast(src); - uint32_t dst_buffer_offset = reinterpret_cast(dst>); + uint32_t dst_buffer_offset = reinterpret_cast(dst); uint8_t src_logicpos = static_cast(TPosition::C1); uint8_t dst_logicpos = static_cast(TPosition::B2); LocalTensor srcTensor; @@ -279,7 +279,7 @@ public: block_size = 32 / sizeof(MmadDtype); L1_PINGPONG_BUFFER_LEN = ((m0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size + - (n0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size * (IS_INT8 ? 2 : 1)); + (n0 * k0 + cube_matrix_size - 1) / cube_matrix_size * cube_matrix_size) * (IS_INT8 ? 2 : 1); L0AB_PINGPONG_BUFFER_LEN = L0AB_PINGPONG_BUFFER_SIZE / sizeof(MmadDtype); int32_t a_l1_size = m0 * k0 * sizeof(MmadDtype); @@ -505,7 +505,7 @@ public: } } else { if (aligned_b == 1) { - offset_b_next = batch_idx * k * n_align + n_idx * n0 + (k_idx + 1) * k0 * n_align + n_idx * n0; + offset_b_next = batch_idx * k * n_align + (k_idx + 1) * k0 * n_align + n_idx * n0; } else { if (weight_nz) { offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * block_size + n_idx * n0 * k_align16; @@ -532,7 +532,7 @@ public: gm_src_a, 0, 1, - k_round_next, + k_round_next / block_size, 0, 0, PAD_NONE @@ -688,7 +688,7 @@ public: k_round / BLOCK_SIZE_32, 1, 0, - 0, + 0 ); } } else { diff --git a/comm/lcal/src/kernels/coc_preprocessor.cce b/comm/lcal/src/kernels/coc_preprocessor.cce index cac13c20..a7cd1b57 100644 --- a/comm/lcal/src/kernels/coc_preprocessor.cce +++ b/comm/lcal/src/kernels/coc_preprocessor.cce @@ -317,7 +317,7 @@ private: }; -template<> +template <> class Padder : public BasePadder { public: __aicore__ explicit Padder() = default; @@ -541,7 +541,7 @@ private: bool has_offset{ false }; }; -template<> +template <> class DequantPadder : public BasePadder { public: __aicore__ explicit DequantPadder() = default; @@ -700,7 +700,7 @@ private: }; -template<> +template <> class DequantPadder : public BasePadder { public: __aicore__ explicit DequantPadder() = default; @@ -2599,7 +2599,7 @@ public: aligned_b = 0; has_b_align = false; } - LcalWorkspaceInfo workspace_info = GetLcalWorkSpaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, + LcalWorkspaceInfo workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, trans_a, trans_b, is_int8 ? 
1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param, hasFormatDequantScale,is_deterministic); diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index c9005dff..53a5f4da 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -307,6 +307,7 @@ int64_t Lcoc::GetWorkspaceSize() << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize << ", hasAccum=" << hasAccum << ", dequantWorkSpaceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic + << ", workspaceSize=" << lcalWorkSpaceInfo.workspaceSize; return lcalWorkspaceInfo.workspaceSize; } diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index eda5326e..638c4090 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -17,8 +17,8 @@ bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max) { - if (value < min || (max != PARA_CHECK_MAX_VALUE && value > max)) { - if (max == PARA_CHECK_MAX_VALUE) { + if (value < min || (max != PARM_CHECK_MAX_VALUE && value > max)) { + if (max == PARM_CHECK_MAX_VALUE) { MKI_LOG(ERROR) << "The " << name << ":" << value << " must equal or greater than " << min << "!"; } else { MKI_LOG(ERROR) << "The " << name << ":" << value << " must be in [" << min << "," << max << "]!"; -- Gitee From c5aa3949ec0b8aff6dabce2cd17aae238881ae6f Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 14:26:31 +0800 Subject: [PATCH 380/414] fix --- comm/lcal/src/lcoc_func.cpp | 3 +- tests/apitest/opstest/csv/linear_parallel.csv | 43 ++++++++++++++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index 638c4090..ac954966 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -7,7 +7,6 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ - #include "lcoc_func.h" #include "lcoc_args.h" #include "mki/utils/log/log.h" @@ -73,7 +72,7 @@ } int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, - int nElemAlign) + int nElemAlign) { int64_t nRow = transpose ? n : m; int64_t nCol = transpose ? 
m : n; diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index 443a1392..ae51e1c4 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -41,6 +41,15 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 40|llama_65bCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 41|llama_65bCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 42|llama_65bCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +43|NoErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +44|NoErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|28,2,1024;8,1024|1|float16|nd|14,2,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +45|IErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,59;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||I:ERROR_INVALID_TENSOR_DIM +46|SErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM +47|NoErrorCase0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +48|NoErrorCase1AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,4,1024;8,1024|1|float16|nd|4,4,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +49|IErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,33;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +50|SErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM 
+51|NoErrorCase0AllGatherLinearV2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"keepIntermediate":true}|2|float16;float16|nd;nd|2,16;32,16|2|float16;float16|nd;nd|4,32;4,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 52|NoErrorCase0MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;1,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR 53|NoErrorCase1MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|8,2;4,2;1,4;1,4|1|bf16|nd|8,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 54|NoErrorCase2MatmulAllReduceDequantWithoutBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;0;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR @@ -48,13 +57,28 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 56|NoErrorCase4MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 57|NoErrorCase5MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|bf16;int8;bf16;bf16;bf16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|bf16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR 58|NoErrorCase6MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|float16;int8;float16;float16;float16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR +59|NoErrorCase0PureMatmul|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +60|NoErrorCase1PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR +61|NoErrorCase2PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 
+62|DimCheckFailPureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1,4;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +63|PureMatmulW8A8Fp16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +64|PureMatmulW8A8Bf16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +65|PureMatmulW8A8Fp16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM +68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM 69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -780|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE +75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 83|NoErrorCase1AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|2048,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR @@ -62,3 +86,20 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 
85|NoErrorCase3AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|512,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 86|IErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":1}}|2|float16;float16|nd;nd|32,16;32,20|1|float16|nd|16,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 87|SErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|1024,32|customize;customize|-1,1;-1,1||||||Ascend910B|S:ERROR_INVALID_TENSOR_DIM +88|AllGatherMatmulInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +89|AllGatherMatmulInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +90|MatmulReducescatterInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +91|MatmulReducescatterInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +92|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +93|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH 
+94|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +95|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +96|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +97|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +98|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +99|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +100|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,48;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM 
+101|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|32768,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From 2aa332817a3ba1ed60e1accf5f272b220a514fb4 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 14:59:20 +0800 Subject: [PATCH 381/414] fix --- comm/lcal/include/lcoc/lcoc_workspace.h | 2 +- comm/lcal/include/lcoc/tiling/tiling_args.h | 2 +- comm/lcal/src/lcoc_func.cpp | 15 +++--- .../tiling/allgather_reducescatter_tiling.cpp | 27 +++++----- comm/lcal/src/tiling/tiling_func.cpp | 52 +++++++++---------- 5 files changed, 48 insertions(+), 50 deletions(-) diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h index bb0b8fa1..56f9f5fb 100644 --- a/comm/lcal/include/lcoc/lcoc_workspace.h +++ b/comm/lcal/include/lcoc/lcoc_workspace.h @@ -78,7 +78,7 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, in if (hasBAlign) { lcalWorkspaceInfo.gm_b_align = workspaceOffset; workspaceOffset += static_cast(batchSize) * (transb ? n * kAlign : k * nAlign) * mmadSize * - (expertPerRank <= 0 ? 1 : expertPerRank); + (expertPerRank <= 0 ? 1 : expertPerRank); } if (!isMoe && hasDequantParam) { diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/comm/lcal/include/lcoc/tiling/tiling_args.h index e2222b7a..66bfe661 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_args.h +++ b/comm/lcal/include/lcoc/tiling/tiling_args.h @@ -112,7 +112,7 @@ namespace Lcal { int32_t withSerialMode = -1; int32_t is91093 = -1; - int32_t bufferSize = -1; + int32_t bufferSize = -1; }; struct CoCTilingData : CoCTiling { diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index ac954966..85945012 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -7,13 +7,12 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
* See LICENSE in the root of the software repository for the full text of the License. */ - #include "lcoc_func.h" - #include "lcoc_args.h" - #include "mki/utils/log/log.h" - - using namespace std; - namespace Lcal { +#include "lcoc_func.h" +#include "lcoc_args.h" +#include "mki/utils/log/log.h" +using namespace std; +namespace Lcal { bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max) { if (value < min || (max != PARM_CHECK_MAX_VALUE && value > max)) { @@ -72,11 +71,11 @@ } int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, - int nElemAlign) + int nElemAlign) { int64_t nRow = transpose ? n : m; int64_t nCol = transpose ? m : n; int64_t nColAlign = (nCol + nElemAlign - 1) / nElemAlign * nElemAlign; return batchSize * nRow * nColAlign; } - } \ No newline at end of file +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp index ae92b66a..2d18f29d 100644 --- a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -13,7 +13,7 @@ #include "tiling_func.h" #include "lcoc_func.h" -#define TILING_MAP std::>> +#define TILING_MAP std::map>> namespace Lcal { constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT = 11; static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap = { @@ -223,7 +223,7 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMDATASPLI static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap = { {8, {{3072, 2147483647, 2176, 2147483647, -1, 5120}, {768, 2147483647, -1, 2147483647, 5120, 2147483647}}}}; -const int PVALE_ONE = 1; +const int PVALUE_ONE = 1; const int M0_DEFAULT = 128; const int K0_DEFAULT = 256; const int N0_DEFAULT = 256; @@ -231,7 +231,7 @@ const int SWIZZLEDIRECT_ONE = 1; void AG8RS2FalseFP16Tiling(CoCTilingData &cocTilingData) { - std::map tilingParamMap = { + std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT, g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap}}, @@ -248,11 +248,11 @@ void AG8RS2FalseFP16Tiling(CoCTilingData &cocTilingData) {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT, g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap}}, {&cocTilingData.extraUbMoveNum, - {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT. 
+ {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap}} + g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap}}, {&cocTilingData.extraCommNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; @@ -262,7 +262,7 @@ void AG8RS2FalseFP16Tiling(CoCTilingData &cocTilingData) void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) { - std::map tilingParamMap = { + std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_SWIZZLECOUNT_DEFAULT, g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap}}, @@ -283,7 +283,7 @@ void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap}} + g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap}}, {&cocTilingData.extraCommNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; @@ -293,7 +293,7 @@ void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) void AG2RS8TrueFP16Tiling(CoCTilingData &cocTilingData) { - std::map tilingParamMap = { + std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAULT, g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap}}, @@ -307,9 +307,8 @@ void AG2RS8TrueFP16Tiling(CoCTilingData &cocTilingData) {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT}}, {&cocTilingData.commDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDATASPLIT_DEFAULT}}, - {&cocTilingData.commDirect, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDIRECT_DEFAULT}}, {&cocTilingData.extraUbMoveNum, - {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT. + {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT, g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, @@ -325,7 +324,7 @@ void AG2RS8TrueFP16Tiling(CoCTilingData &cocTilingData) void AG2RS8FalseFP16Tiling(CoCTilingData &cocTilingData) { - std::map tilingParamMap = { + std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFAULT, g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap}}, @@ -339,7 +338,7 @@ void AG2RS8FalseFP16Tiling(CoCTilingData &cocTilingData) {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAULT, g_allgatherTwoReducescatterEightFalseFP16CommdirectMap}}, {&cocTilingData.extraUbMoveNum, - {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT. 
+ {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT}}, {&cocTilingData.extraCommNpuSplit, @@ -356,7 +355,7 @@ void CoCAllgatherMatmulReduceScatterTilingFunc::GetDefaultTiling(const TaskParam { CoCTilingFunc::GetDefaultTiling(taskParam); - cocTilingData.swizzleDirect = SWIZZLEDIRECT_ONE; + cocTilingData.swizzlDirect = SWIZZLEDIRECT_ONE; cocTilingData.m0 = M0_DEFAULT; cocTilingData.k0 = K0_DEFAULT; @@ -364,7 +363,7 @@ void CoCAllgatherMatmulReduceScatterTilingFunc::GetDefaultTiling(const TaskParam cocTilingData.withSerialMode = 0; cocTilingData.is91093 = 0; - cocTilingData.pValue = PVALE_ONE; + cocTilingData.pValue = PVALUE_ONE; cocTilingData.commDirect = 0; auto rsDim = taskParam.cocParamDesc.twoDimTPInfo.rsDim; diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/comm/lcal/src/tiling/tiling_func.cpp index 75623fcb..f939eeec 100644 --- a/comm/lcal/src/tiling/tiling_func.cpp +++ b/comm/lcal/src/tiling/tiling_func.cpp @@ -8,12 +8,12 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - #include "lcoc_func.h" - #include "lcoc_args.h" - #include "tiling_args.h" - #include "tiling_func.h" +#include "lcoc_func.h" +#include "lcoc_args.h" +#include "tiling_args.h" +#include "tiling_func.h" - namespace Lcal { +namespace Lcal { int32_t CeilDev(int32_t num, int32_t div) { if (div == 0) { @@ -51,8 +51,8 @@ for (auto &condition : iter->second) { bool inRange = m > condition[CONDITION_M_ST] && m <= condition[CONDITION_M_END] && - k > condition[CONDITION_K_ST] && m <= condition[CONDITION_K_END] && - n > condition[CONDITION_N_ST] && m <= condition[CONDITION_N_END]; + k > condition[CONDITION_K_ST] && k <= condition[CONDITION_K_END] && + n > condition[CONDITION_N_ST] && n <= condition[CONDITION_N_END]; if (inRange) { return iter->first; } @@ -78,7 +78,7 @@ tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.transB); tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.isInt8); tilingKey = (static_cast(tilingKey) << 1) + static_cast(mmInfo.withBias); - tilingKey = (static_cast(tilingKey) << 1) + static_cast(tilingData.splitk); + tilingKey = (static_cast(tilingKey) << 1) + static_cast(tilingData.splitK); return tilingKey; } @@ -166,30 +166,30 @@ {"ubMoveNum", HALF_KBYTE}, {"lenPerLoop", HALF_KBYTE}, {"extraUbMoveNum", HALF_KBYTE}, - {"extraLenPerLoop", HALF_KBYTE}, + {"extraLenPerLoop", HALF_KBYTE} }; return alignParamMap; } - std::vector> GetCoCTingparamCheckList(const CoCTiling &tiling) + std::vector> GetCoCTilingParamCheckList(const CoCTiling &tiling) { std::vector> paramCheckList = { {"m0", tiling.m0, BLOCK_SIZE, CUBE_BLOCK_SIZE}, {"n0", tiling.n0, BLOCK_SIZE, CUBE_BLOCK_SIZE}, {"k0", tiling.k0, CUBE_BLOCK_SIZE, AXES_ALIGN_SIZE}, - {"swizzlCount", tiling.swizzlCount, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"swizzleDirect", tiling.swizzlDirect, SWIZZLE_DIRECT_ZERO, SWIZZLE_DIRECT_ONE}, + {"swizzlCount", tiling.swizzlCount, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"swizzlDirect", tiling.swizzlDirect, SWIZZLE_DIRECT_ZERO, SWIZZLE_DIRECT_ONE}, {"ubMoveNum", tiling.ubMoveNum, HALF_KBYTE, MAX_UB_NUM}, - {"commNpuSplit", tiling.commNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"commDataSplit", tiling.commDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"commNpuSplit", tiling.commNpuSplit, 
PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"commDataSplit", tiling.commDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, {"commDirect", tiling.commDirect, COMM_DATA_DIRECT, COMM_NPU_DIRECT}, - {"lenPerLoop", tiling.lenPerLoop, HALF_KBYTE, PARA_CHECK_MAX_VALUE}, + {"lenPerLoop", tiling.lenPerLoop, HALF_KBYTE, PARAM_CHECK_MAX_VALUE}, {"extraUbMoveNum", tiling.extraUbMoveNum, HALF_KBYTE, MAX_UB_NUM}, - {"extraCommNpuSplit", tiling.extraCommNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"extraCommDataSplit", tiling.extraCommDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"extraCommNpuSplit", tiling.extraCommNpuSplit, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"extraCommDataSplit", tiling.extraCommDataSplit, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, {"extraCommDirect", tiling.extraCommDirect, COMM_DATA_DIRECT, COMM_NPU_DIRECT}, - {"extraLenPerLoop", tiling.extraLenPerLoop, HALF_KBYTE, PARA_CHECK_MAX_VALUE}, - {"splitK", tiling.splitK, PARAM_CHECK_MIN_VALUE_ZERO, PARA_CHECK_MAX_VALUE}, + {"extraLenPerLoop", tiling.extraLenPerLoop, HALF_KBYTE, PARAM_CHECK_MAX_VALUE}, + {"splitK", tiling.splitK, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MAX_VALUE}, {"write2OtherRank", tiling.write2OtherRank, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE}, {"withSerialMode", tiling.withSerialMode, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE}, {"is91093", tiling.is91093, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MIN_VALUE_ONE} @@ -201,7 +201,7 @@ { auto powerOfTwoParamMap = GetCoCTilingPowerOfTwoParamMap(); auto alignParamMap = GetCoCTilingAlignParamMap(); - auto paramCheckList = GetCoCTingparamCheckList(tiling); + auto paramCheckList = GetCoCTilingParamCheckList(tiling); for (auto ¶m : paramCheckList) { auto name = std::get<0>(param); auto value = std::get<1>(param); @@ -231,11 +231,11 @@ return false; } std::vector> paramCheckList = { - {"mLoop", tiling.mLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"kLoop", tiling.kLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"nLoop", tiling.nLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"coreLoop", tiling.coreLoop, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, - {"tilingKey", tiling.tilingKey, PARAM_CHECK_MIN_VALUE_ONE, PARA_CHECK_MAX_VALUE}, + {"mLoop", tilingData.mLoop, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"kLoop", tilingData.kLoop, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"nLoop", tilingData.nLoop, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"coreLoop", tilingData.coreLoop, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE}, + {"tilingKey", tilingData.tilingKey, PARAM_CHECK_MIN_VALUE_ZERO, PARAM_CHECK_MAX_VALUE}, }; return CheckParamScopeList(paramCheckList); } @@ -281,4 +281,4 @@ TransformCoCTiling(tiling, tilingData); CalTilingParam(taskParam.cocParamDesc.mmInfo, tilingData); } - } \ No newline at end of file +} \ No newline at end of file -- Gitee From fc27043720a55d846770dc24738ada9e3667f0c5 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 15:20:35 +0800 Subject: [PATCH 382/414] fix --- comm/lcal/include/lcoc/tiling/tiling_args.h | 4 ++-- comm/lcal/src/coc_kernel_args.cpp | 2 +- .../src/kernels/coc_allgather_reducescatter.cce | 9 ++++----- comm/lcal/src/kernels/coc_allreduce.cce | 2 +- comm/lcal/src/kernels/coc_ppmatmul.cce | 2 +- comm/lcal/src/lcoc.cpp | 16 ++++++++-------- comm/lcal/src/lcoc_func.cpp | 6 +++--- 7 files changed, 20 insertions(+), 21 deletions(-) diff --git 
a/comm/lcal/include/lcoc/tiling/tiling_args.h b/comm/lcal/include/lcoc/tiling/tiling_args.h index 66bfe661..8d08c6d0 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_args.h +++ b/comm/lcal/include/lcoc/tiling/tiling_args.h @@ -50,7 +50,7 @@ namespace Lcal { constexpr int32_t AXES_ALIGN_SIZE = 512; constexpr int32_t BASE_BLOCK_STEP = 2; constexpr int32_t INPUT_DTYPE = 2; - constexpr int32_t MAX_BLOCK_COUNT =2; + constexpr int32_t MAX_BLOCK_COUNT = 2; constexpr int32_t BLOCK_COUNT_3 = 3; constexpr int32_t FP16_SIZE = 2; constexpr int32_t FP32_SIZE = 4; @@ -133,7 +133,7 @@ namespace Lcal { uint32_t tilingKey = -1; const char* ToString() const; - void SetDefaultValue(); + void SetDefaultValue(); }; struct CoCKernelParam { diff --git a/comm/lcal/src/coc_kernel_args.cpp b/comm/lcal/src/coc_kernel_args.cpp index ff307add..f0ae9e65 100644 --- a/comm/lcal/src/coc_kernel_args.cpp +++ b/comm/lcal/src/coc_kernel_args.cpp @@ -25,7 +25,7 @@ int CoCKernelArgs::SetFFTSAddr() MKI_LOG(ERROR) << "MkiRtGetC2cCtrlAddr err"; return LCAL_ERROR_MKIRT; } - return LCAL_SUCCESS; + return LCAL_SUCCESS; } void CoCKernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg) diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce index 4533096a..3655cc9a 100644 --- a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce +++ b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce @@ -65,7 +65,7 @@ public: rs_max_ub_ping_pong_size = (extra_ub_move_num / 2) / n0 * n0; } - FORCE_INLINE_AICORE void CopyGmToGm(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { + FORCE_INLINE_AICORE void CopyGMToGM(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { auto ub0 = output_UB_T[0]; auto ub1 = output_UB_T[1]; int32_t interm_offset = 0; @@ -109,7 +109,7 @@ public: real_rank = dst_rank * rs_dim + rank % rs_dim; } if (real_rank != rank && dst_rank < ag_dim) { - CopyUbufToGm(buff[real_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0) + CopyUbufToGm(buff[real_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0); } dst_rank = (dst_rank + skip_num) % ag_dim; } @@ -150,7 +150,7 @@ public: real_rank = dst_rank * rs_dim + rank % rs_dim; } if (real_rank != rank && dst_rank < ag_dim) { - CopyGmToGm(gm_src + data_src, buff[real_rank] + rank_offset + data_src, data_len); + CopyGMToGM(gm_src + data_src, buff[real_rank] + rank_offset + data_src, data_len); } dst_rank = (dst_rank + ag_comm_npu_split) % ag_dim; } @@ -353,7 +353,6 @@ public: using CocCommBase::k_loop; using CocCommBase::core_loop; using CocCommBase::core_idx; - using CocCommBase::core_num; using CocCommBase::rank; using CocCommBase::rank_size; using CocCommBase::tiling_key; @@ -436,7 +435,7 @@ inline __aicore__ void CocAllGatherMatmulReduceScatterAiv(COC_ARGS_FUN(T)) { SetMaskNormImpl(); SetSyncBaseAddr((uint64_t)ffts_addr); SetVectorMask((uint64_t)-1, (uint64_t)-1); - auto para = reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; int32_t tiling_key = cocTilingData->tilingKey; switch (tiling_key) { diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/comm/lcal/src/kernels/coc_allreduce.cce index c787a6dd..c2243edb 100644 --- a/comm/lcal/src/kernels/coc_allreduce.cce +++ b/comm/lcal/src/kernels/coc_allreduce.cce @@ -554,7 +554,7 @@ inline __aicore__ void CocMatmulAllReduceAiv(COC_ARGS_FUN(T)) SetSyncBaseAddr((uint64_t)ffts_addr); SetVectorMask((uint64_t)-1, (uint64_t)-1); - auto para = 
reinterpret_cast<__gm__ Lcal::CoCkernelParam *>(para_gm); + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; int64_t batch_size = cocTilingData->batchSize; int32_t m = cocTilingData->m; diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index 2c4e2f44..b104a545 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -642,7 +642,7 @@ public: } } } else { - for (int32_t i = 0; i < k0_round / BLOCK_SIZE_16; i++) { + for (int32_t i = 0; i < k0_round / block_size; i++) { LoadCbufToCa( l0a_buf + i * cube_matrix_size, l1_buf_a + k_part_idx * k_part_len * m_round + diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index 53a5f4da..911db3e5 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -48,7 +48,7 @@ bool CheckLcalType(LcalType lcalType) { if (lcalType < LcalType::PURE_MATMUL || lcalType >= LcalType::LCAL_TYPE_MAX) { MKI_LOG(ERROR) << "The lcalType:" << int(lcalType) - << "must be in [" << int(LcalType::PURE_MATMUL) << ", " << int(LcalType::LCAL_TYPE_MAX) << ")!"; + << " must be in [" << int(LcalType::PURE_MATMUL) << ", " << int(LcalType::LCAL_TYPE_MAX) << ")!"; return false; } return true; @@ -66,7 +66,7 @@ bool CheckCoCParamDesc(LcalType lcalType, const CoCParamDesc &paramDesc) return false; } if (paramDesc.op != HCCL_REDUCE_SUM) { - MKI_LOG(ERROR) << "The ReduceOp:" << paramDesc.op << "is not support yet!"; + MKI_LOG(ERROR) << "The ReduceOp:" << paramDesc.op << " is not support yet!"; return false; } @@ -226,8 +226,7 @@ bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg return true; } -int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, - aclrtStream stream) +int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) { LcalType lcalType = LcalType::MATMUL_ALL_REDUCE; if (!CheckBasic(inputPkg, outputPkg, lcalType)) { @@ -237,7 +236,8 @@ int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *wo return LaunchOperator(inputPkg, outputPkg, workspace, stream); } -int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) +int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, + aclrtStream stream) { LcalType lcalType = LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; if (!CheckBasic(inputPkg, outputPkg, lcalType)) { @@ -297,17 +297,17 @@ int64_t Lcoc::GetWorkspaceSize() uint64_t dequantWorkSpaceSize = GetDequantWorkSpaceSize(lcalType, tiling_.withSerialMode, mmInfo.m, mmInfo.n, tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim); - LcalWorkspaceInfo lcalWorkSpaceInfo = GetLcalWorkspaceInfo(0, mmInfo.batchSize, mmInfo.m, mmInfo.k, + LcalWorkspaceInfo lcalWorkspaceInfo = GetLcalWorkspaceInfo(0, mmInfo.batchSize, mmInfo.m, mmInfo.k, mmInfo.n, mAlign, kAlign, nAlign, mmInfo.transA, mmInfo.transB, eleSize, hasAAlign, hasBAlign, accumRankSize, hasAccum, dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic); - MKI_LOG(DEBUG) << "[Lcoc Workspace]: " << "m=" << mmInfo.m << ",k=" << mmInfo.k << ", n=" << mmInfo.n + MKI_LOG(DEBUG) << "[Lcoc Workspace]: " << "m=" << mmInfo.m << ", k=" << mmInfo.k << ", n=" << mmInfo.n << ", mAlign=" << mAlign << ", kAlign=" << kAlign << ", nAlign=" << nAlign << ", transA=" << mmInfo.transA << ", 
transB=" << mmInfo.transB << ", eleSize=" << eleSize << ", hasAAlign=" << hasAAlign << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize << ", hasAccum=" << hasAccum << ", dequantWorkSpaceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic - << ", workspaceSize=" << lcalWorkSpaceInfo.workspaceSize; + << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize; return lcalWorkspaceInfo.workspaceSize; } diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index 85945012..9915cfa8 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -15,11 +15,11 @@ using namespace std; namespace Lcal { bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max) { - if (value < min || (max != PARM_CHECK_MAX_VALUE && value > max)) { - if (max == PARM_CHECK_MAX_VALUE) { + if (value < min || (max != PARAM_CHECK_MAX_VALUE && value > max)) { + if (max == PARAM_CHECK_MAX_VALUE) { MKI_LOG(ERROR) << "The " << name << ":" << value << " must equal or greater than " << min << "!"; } else { - MKI_LOG(ERROR) << "The " << name << ":" << value << " must be in [" << min << "," << max << "]!"; + MKI_LOG(ERROR) << "The " << name << ":" << value << " must be in [" << min << ", " << max << "]!"; } return false; } -- Gitee From 5a0178b383b03d30c8d50198b0ca22d0510c2504 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 15:25:27 +0800 Subject: [PATCH 383/414] fix --- comm/lcal/src/kernels/coc_ppmatmul.cce | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index b104a545..51ce3a31 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -421,7 +421,7 @@ public: gm_src_a, 0, 1, - k_round, + k_round / block_size, 0, 0, PAD_NONE -- Gitee From 782df623fd2fbde4cd919e8e57b53eddb2b3feb4 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 15:34:45 +0800 Subject: [PATCH 384/414] fix --- comm/lcal/src/kernels/coc_ppmatmul.cce | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index 51ce3a31..a3963de5 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -618,11 +618,11 @@ public: l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_32 + i * k_round * BLOCK_SIZE_32, 0, - k0_round, + k0_round / BLOCK_SIZE_32, 1, 0, 0, - k0_round + k0_round / BLOCK_SIZE_32 - 1 ); } } else { -- Gitee From 431cbbebda86ea458fa2c77a13c95af24c7393c2 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 15:56:42 +0800 Subject: [PATCH 385/414] fix --- comm/lcal/include/lcoc/lcoc.h | 2 +- comm/lcal/include/lcoc/lcoc_workspace.h | 2 +- comm/lcal/src/coc_kernel_args.cpp | 4 +- comm/lcal/src/lcoc.cpp | 2 +- comm/lcal/src/lcoc_func.cpp | 2 +- .../tiling/allgather_reducescatter_tiling.cpp | 130 +++++++++--------- comm/lcal/src/tiling/tiling_func.cpp | 2 +- 7 files changed, 72 insertions(+), 72 deletions(-) diff --git a/comm/lcal/include/lcoc/lcoc.h b/comm/lcal/include/lcoc/lcoc.h index 95adabc5..2bb01072 100644 --- a/comm/lcal/include/lcoc/lcoc.h +++ b/comm/lcal/include/lcoc/lcoc.h @@ -16,7 +16,7 @@ #include "tiling_args.h" namespace Lcal { - class Lcoc { +class Lcoc { public: Lcoc() = delete; explicit Lcoc(LcalComm &comm); diff --git 
a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h index 56f9f5fb..8385f413 100644 --- a/comm/lcal/include/lcoc/lcoc_workspace.h +++ b/comm/lcal/include/lcoc/lcoc_workspace.h @@ -78,7 +78,7 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, in if (hasBAlign) { lcalWorkspaceInfo.gm_b_align = workspaceOffset; workspaceOffset += static_cast(batchSize) * (transb ? n * kAlign : k * nAlign) * mmadSize * - (expertPerRank <= 0 ? 1 : expertPerRank); + (expertPerRank <= 0 ? 1 : expertPerRank); } if (!isMoe && hasDequantParam) { diff --git a/comm/lcal/src/coc_kernel_args.cpp b/comm/lcal/src/coc_kernel_args.cpp index f0ae9e65..a0e3850d 100644 --- a/comm/lcal/src/coc_kernel_args.cpp +++ b/comm/lcal/src/coc_kernel_args.cpp @@ -72,9 +72,9 @@ void CoCKernelArgs::SetCoCTilingDataArgs(const CoCTilingData &tilingData) std::string CoCKernelArgs::ParamToString() { - std::string quantInfoString = "[QuantInfo]: dequantGranularity=" + + std::string quantInfoString = "[QuantInfo]: dequantGranularity=" + std::to_string(cocKernelParam.quantInfo.dequantGranularity) + "\n"; - std::string weightNzInfoString = "[weightNz]: weightNz=" + + std::string weightNzInfoString = "[weightNz]: weightNz=" + std::to_string(cocKernelParam.weightNz) + "\n"; std::string tilingInfoString = cocKernelParam.cocTilingData.ToString(); return quantInfoString + weightNzInfoString + tilingInfoString; diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index 911db3e5..7a6dcfc1 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -236,7 +236,7 @@ int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *wo return LaunchOperator(inputPkg, outputPkg, workspace, stream); } -int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, +int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream) { LcalType lcalType = LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index 9915cfa8..8d5e6604 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -71,7 +71,7 @@ namespace Lcal { } int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, - int nElemAlign) + int nElemAlign) { int64_t nRow = transpose ? n : m; int64_t nCol = transpose ? 
m : n; diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp index 2d18f29d..2375fe47 100644 --- a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -19,16 +19,16 @@ constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFA static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap = { {9, {{768, 1536, -1, 2147483647, -1, 7168}, - {1536, 3072, -1, 5120, -1, 14848}, - {1536, 2147483647, 5120, 2147483647, -1, 10752}}}, + {1536, 3072, -1, 5120, -1, 14848}, + {1536, 2147483647, 5120, 2147483647, -1, 10752}}}, {14, {{768, 1536, -1, 5120, 10752, 2147483647}, {1536, 2147483647, -1, 5120, 14848, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_UBMOVENUM_DEFAULT = 40; static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap = { {24, {{768, 1536, -1, 2147483647, 3072, 10752}, - {1536, 3072, -1, 7168, 3072, 2147483647}, - {3072, 2147483647, -1, 7168, 3072, 2147483647}}}, + {1536, 3072, -1, 7168, 3072, 2147483647}, + {3072, 2147483647, -1, 7168, 3072, 2147483647}}}, {30, {{3072, 2147483647, 7168, 2147483647, 3072, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_LENPERLOOPMULT_DEFAULT = 400; @@ -40,23 +40,23 @@ constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMNPUSPLIT_DEFA static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap = { {1, {{768, 1536, 5120, 2147483647, -1, 3072}, - {1536, 3072, 14848, 2147483647, -1, 7168}, - {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; + {1536, 3072, 14848, 2147483647, -1, 7168}, + {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT = 1; static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap = { {8, {{768, 1536, 5120, 2147483647, -1, 3072}, - {1536, 3072, 14848, 2147483647, -1, 7168}, - {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; + {1536, 3072, 14848, 2147483647, -1, 7168}, + {3072, 2147483647, 14848, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 12; static TILING_MAP g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap = { {10, {{-1, 768, -1, 2147483647, 5120, 10752}, - {768, 1536, -1, 2147483647, 5120, 2147483647}, - {1536, 2147483647, -1, 10752, 5120, 2147483647}, - {1536, 2147483647, 10752, 14848, -1, 10752}}}, + {768, 1536, -1, 2147483647, 5120, 2147483647}, + {1536, 2147483647, -1, 10752, 5120, 2147483647}, + {1536, 2147483647, 10752, 14848, -1, 10752}}}, {20, {{1536, 2147483647, 14848, 2147483647, 10752, 2147483647}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 4; @@ -76,8 +76,8 @@ static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap = { {17, {{768, 2147483647, 7168, 2147483647, -1, 7936}}}, {13, {{-1, 192, 5120, 2147483647, -1, 12800}, - {-1, 192, -1, 2147483647, 15360, 2147483647}, - {768, 2147483647, -1, 7168, -1, 9088}}}}; + {-1, 192, -1, 2147483647, 15360, 2147483647}, + {768, 2147483647, -1, 7168, -1, 9088}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_UBMOVENUM_DEFAULT = 60; static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap = { @@ -100,8 +100,8 @@ static TILING_MAP g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap = { {60, {{-1, 192, 
-1, 5120, -1, 6912}, {-1, 192, -1, 2147483647, 10368, 2147483647}}}, {40, {{-1, 192, 5120, 2147483647, -1, 6912}, - {-1, 192, -1, 2147483647, 6912, 10368}, - {192, 384, -1, 2147483647, 1600, 4608}}}, + {-1, 192, -1, 2147483647, 6912, 10368}, + {192, 384, -1, 2147483647, 1600, 4608}}}, {30, {{192, 384, -1, 2147483647, 4608, 2147483647}, {768, 2147483647, -1, 5120, -1, 3968}}}}; constexpr int32_t ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; @@ -117,12 +117,12 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAU static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap = { {9, {{3072, 6144, -1, 2147483647, -1, 10752}, - {12288, 2147483647, -1, 7168, -1, 10752}, - {12288, 2147483647, 10752, 2147483647, -1, 5120}}}, + {12288, 2147483647, -1, 7168, -1, 10752}, + {12288, 2147483647, 10752, 2147483647, -1, 5120}}}, {14, {{-1, 3072, -1, 7168, -1, 14848}, - {-1, 3072, -1, 10752, 14848, 2147483647}, - {12288, 2147483647, 7168, 10752, -1, 5120}}}}; + {-1, 3072, -1, 10752, 14848, 2147483647}, + {12288, 2147483647, 7168, 10752, -1, 5120}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_UBMOVENUM_DEFAULT = 10; static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap = { @@ -135,8 +135,8 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_LENPERLOOPMULT_DEF static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap = { {4, {{3072, 6144, -1, 2147483647, 3072, 10752}, - {12288, 2147483647, -1, 10752, -1, 3072}, - {6144, 2147483647, -1, 2147483647, 3072, 2147483647}}}}; + {12288, 2147483647, -1, 10752, -1, 3072}, + {6144, 2147483647, -1, 2147483647, 3072, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT = 1; @@ -153,8 +153,8 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMUL static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap = { {4, {{-1, 3072, -1, 10752, 14848, 2147483647}, - {12288, 2147483647, 3072, 5120, -1, 2147483647}, - {6144, 2147483647, 5120, 7168, -1, 2147483647}}}}; + {12288, 2147483647, 3072, 5120, -1, 2147483647}, + {6144, 2147483647, 5120, 7168, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT = 8; static TILING_MAP g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap = { @@ -169,22 +169,22 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFA static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap = { {13, {{-1, 768, 1280, 2147483647, -1, 7168}, - {1536, 3072, -1, 2147483647, -1, 7168}, - {3072, 2147483647, 5184, 2147483647, -1, 2147483647}}}, + {1536, 3072, -1, 2147483647, -1, 7168}, + {3072, 2147483647, 5184, 2147483647, -1, 2147483647}}}, {17, {{-1, 768, -1, 2147483647, 7168, 2147483647}, - {3072, 2147483647, -1, 4544, 7168, 2147483647}, - {3072, 2147483647, 4544, 5184, -1, 2147483647}}}, + {3072, 2147483647, -1, 4544, 7168, 2147483647}, + {3072, 2147483647, 4544, 5184, -1, 2147483647}}}, {5, {{768, 1536, -1, 2147483647, 5120, 2147483647}, - {3072, 2147483647, -1, 4544, -1, 7168}, - {3072, 2147483647, 7680, 2147483647, -1, 2147483647}}}}; + {3072, 2147483647, -1, 4544, -1, 7168}, + {3072, 2147483647, 7680, 2147483647, -1, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_UBMOVENUM_DEFAULT = 40; static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap = { {30, {{-1, 768, 2176, 3840, -1, 
5120}, {-1, 768, 2560, 2147483647, 5120, 7168}, - {-1, 768, -1, 7680, 7168, 2147483647}, {1536, 3072, -1, 6400, -1, 2147483647}}}, + {-1, 768, -1, 7680, 7168, 2147483647}, {1536, 3072, -1, 6400, -1, 2147483647}}}, {60, {{-1, 768, 7680, 2147483647, 7168, 2147483647}, {768, 1536, -1, 1280, -1, 7168}}}, {20, {{768, 1536, -1, 4352, 7168, 2147483647}, {3072, 2147483647, -1, 6400, -1, 2147483647}}}}; @@ -198,19 +198,19 @@ constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAUL static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16CommdirectMap = { {1, {{-1, 768, 3456, 2147483647, -1, 5120}, - {-1, 768, 2560, 2147483647, 5120, 7168}, - {-1, 768, 4352, 7680, 7168, 2147483647}, - {768, 1536, -1, 2147483647, -1, 7168}, - {1536, 3072, 1280, 2147483647, -1, 2147483647}, - {3072, 2147483647, -1, 7680, 5120, 2147483647}}}}; + {-1, 768, 2560, 2147483647, 5120, 7168}, + {-1, 768, 4352, 7680, 7168, 2147483647}, + {768, 1536, -1, 2147483647, -1, 7168}, + {1536, 3072, 1280, 2147483647, -1, 2147483647}, + {3072, 2147483647, -1, 7680, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT = 60; static TILING_MAP g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap = { {40, {{768, 2147483647, -1, 2176, -1, 5120}}}, {30, {{768, 1536, 2176, 2147483647, -1, 5120}, - {768, 1536, -1, 2147483647, 5120, 2147483647}, - {1536, 2147483647, -1, 1792, 5120, 2147483647}}}, + {768, 1536, -1, 2147483647, 5120, 2147483647}, + {1536, 2147483647, -1, 1792, 5120, 2147483647}}}, {20, {{1536, 2147483647, 2176, 2147483647, -1, 5120}, {1536, 2147483647, 1792, 2147483647, 5120, 2147483647}}}}; constexpr int32_t ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT = 2; @@ -234,25 +234,25 @@ void AG8RS2FalseFP16Tiling(CoCTilingData &cocTilingData) std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_SWIZZLECOUNT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap}}, + g_allgatherEightReducescatterTwoFalseFP16SwizzlecountMap}}, {&cocTilingData.ubMoveNum, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_UBMOVENUM_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap}}, + g_allgatherEightReducescatterTwoFalseFP16UbmovenumMap}}, {&cocTilingData.lenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_LENPERLOOPMULT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap}}, + g_allgatherEightReducescatterTwoFalseFP16LenperloopmultMap}}, {&cocTilingData.commNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMNPUSPLIT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap}}, + g_allgatherEightReducescatterTwoFalseFP16CommnpusplitMap}}, {&cocTilingData.commDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_COMMDATASPLIT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap}}, + g_allgatherEightReducescatterTwoFalseFP16CommdatasplitMap}}, {&cocTilingData.extraUbMoveNum, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap}}, + g_allgatherEightReducescatterTwoFalseFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT, - g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap}}, + g_allgatherEightReducescatterTwoFalseFP16ExtralenperloopmultMap}}, {&cocTilingData.extraCommNpuSplit, 
{ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; @@ -265,25 +265,25 @@ void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_SWIZZLECOUNT_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap}}, + g_allgatherEightReducescatterTwoTrueFP16SwizzlecountMap}}, {&cocTilingData.ubMoveNum, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_UBMOVENUM_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap}}, + g_allgatherEightReducescatterTwoTrueFP16UbmovenumMap}}, {&cocTilingData.lenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_LENPERLOOPMULT_DEFAULT}}, {&cocTilingData.commNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMNPUSPLIT_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap}}, + g_allgatherEightReducescatterTwoTrueFP16CommnpusplitMap}}, {&cocTilingData.commDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDATASPLIT_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap}}, + g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap}}, {&cocTilingData.commDirect, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDIRECT_DEFAULT}}, {&cocTilingData.extraUbMoveNum, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT. - g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap}}, + g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, - g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap}}, + g_allgatherEightReducescatterTwoTrueFP16ExtralenperloopmultMap}}, {&cocTilingData.extraCommNpuSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMNPUSPLIT_DEFAULT}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT}}}; @@ -296,28 +296,28 @@ void AG2RS8TrueFP16Tiling(CoCTilingData &cocTilingData) std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_SWIZZLECOUNT_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap}}, + g_allgatherTwoReducescatterEightTrueFP16SwizzlecountMap}}, {&cocTilingData.ubMoveNum, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_UBMOVENUM_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap}}, + g_allgatherTwoReducescatterEightTrueFP16UbmovenumMap}}, {&cocTilingData.lenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_LENPERLOOPMULT_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap}}, + g_allgatherTwoReducescatterEightTrueFP16LenperloopmultMap}}, {&cocTilingData.commNpuSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMNPUSPLIT_DEFAULT}}, {&cocTilingData.commDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_COMMDATASPLIT_DEFAULT}}, {&cocTilingData.extraUbMoveNum, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap}}, + g_allgatherTwoReducescatterEightTrueFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap}}, - {&cocTilingData.extraCommNpuSplit, + g_allgatherTwoReducescatterEightTrueFP16ExtralenperloopmultMap}}, + {&cocTilingData.extraCommNpuSplit, 
{DIM_EIGHT, g_allgatherTwoReducescatterEightTrueFP16ExtracommnpusplitMap}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_TRUE_FP16_EXTRACOMMDATASPLIT_DEFAULT, - g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap}}}; + g_allgatherTwoReducescatterEightTrueFP16ExtracommdatasplitMap}}}; SetTilingParam2D(cocTilingData, tilingParamMap); return; } @@ -327,26 +327,26 @@ void AG2RS8FalseFP16Tiling(CoCTilingData &cocTilingData) std::map tilingParamMap = { {&cocTilingData.swizzlCount, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_SWIZZLECOUNT_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap}}, + g_allgatherTwoReducescatterEightFalseFP16SwizzlecountMap}}, {&cocTilingData.ubMoveNum, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_UBMOVENUM_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap}}, + g_allgatherTwoReducescatterEightFalseFP16UbmovenumMap}}, {&cocTilingData.lenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_LENPERLOOPMULT_DEFAULT}}, {&cocTilingData.commNpuSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMNPUSPLIT_DEFAULT}}, {&cocTilingData.commDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDATASPLIT_DEFAULT}}, - {&cocTilingData.commDirect, + {&cocTilingData.commDirect, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_COMMDIRECT_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16CommdirectMap}}, + g_allgatherTwoReducescatterEightFalseFP16CommdirectMap}}, {&cocTilingData.extraUbMoveNum, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRAUBMOVENUM_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap}}, + g_allgatherTwoReducescatterEightFalseFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRALENPERLOOPMULT_DEFAULT}}, - {&cocTilingData.extraCommNpuSplit, + {&cocTilingData.extraCommNpuSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMNPUSPLIT_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap}}, + g_allgatherTwoReducescatterEightFalseFP16ExtracommnpusplitMap}}, {&cocTilingData.extraCommDataSplit, {ALLGATHER_TWO_REDUCESCATTER_EIGHT_FALSE_FP16_EXTRACOMMDATASPLIT_DEFAULT, - g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap}}}; + g_allgatherTwoReducescatterEightFalseFP16ExtracommdatasplitMap}}}; SetTilingParam2D(cocTilingData, tilingParamMap); return; } diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/comm/lcal/src/tiling/tiling_func.cpp index f939eeec..593736a0 100644 --- a/comm/lcal/src/tiling/tiling_func.cpp +++ b/comm/lcal/src/tiling/tiling_func.cpp @@ -222,7 +222,7 @@ namespace Lcal { return false; } } - return true; + return true; } bool CheckCoCTilingData(const CoCTilingData &tilingData) -- Gitee From 2191398238fd601a15f8ebd7d37f488527b279bd Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 16:15:31 +0800 Subject: [PATCH 386/414] fix --- comm/lcal/include/lcal_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index 69f93b0e..77a20f24 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -93,7 +93,7 @@ const std::map LCAL_TYPE2NAME = { { LcalType::ALL2ALL_V_C, "LcalAll2AllVC" }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER, "LcalAllGatherMatmulReduceScatter" }, { LcalType::BANDWIDTH, "LcalBandwidthTest" }, - { LcalType::LOCAL_REDUCE, "LcalLoalReduce" }, + { LcalType::LOCAL_REDUCE, "LcalLocalReduce" }, { LcalType::GATHER, "LcalGather" }, { LcalType::SEND, 
"LcalSend" }, { LcalType::RECV, "LcalRecv" } -- Gitee From 08e7715188960aa57e27d3ad8f30b2a1c695e6e4 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 16:31:14 +0800 Subject: [PATCH 387/414] 5 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfaffefb..9cac9a5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,7 +83,7 @@ include_directories( ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src/include ${PROJECT_SOURCE_DIR}/src/kernels/include - ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcal + ${PROJECT_SOURCE_DIR}/comm/lcal/include ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc/tiling ${PROJECT_SOURCE_DIR}/3rdparty/mki/include -- Gitee From 772f4bcfc4afa1368c1ba377b4776d2ba93c346d Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 16:43:41 +0800 Subject: [PATCH 388/414] fix --- comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp index 2375fe47..c6bc6b33 100644 --- a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -279,7 +279,7 @@ void AG8RS2TrueFP16Tiling(CoCTilingData &cocTilingData) g_allgatherEightReducescatterTwoTrueFP16CommdatasplitMap}}, {&cocTilingData.commDirect, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_COMMDIRECT_DEFAULT}}, {&cocTilingData.extraUbMoveNum, - {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT. + {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRAUBMOVENUM_DEFAULT, g_allgatherEightReducescatterTwoTrueFP16ExtraubmovenumMap}}, {&cocTilingData.extraLenPerLoop, {ALLGATHER_EIGHT_REDUCESCATTER_TWO_TRUE_FP16_EXTRALENPERLOOPMULT_DEFAULT, -- Gitee From 105fa2647a84e8e35f8c730200fed03dd3d3ddd1 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 16:53:28 +0800 Subject: [PATCH 389/414] fix --- comm/lcal/include/lcal_types.h | 35 ++++++++---- comm/lcal/include/lcoc/lcoc.h | 15 ++++- comm/lcal/include/lcoc/lcoc_args.h | 52 ++++++++++-------- comm/lcal/include/lcoc/lcoc_base.h | 21 +++++-- comm/lcal/include/lcoc/lcoc_func.h | 3 +- comm/lcal/include/lcoc/lcoc_workspace.h | 54 +++++++++++++++--- comm/lcal/include/lcoc/tiling/tiling.h | 58 +++++++++++++++++++- comm/lcal/include/lcoc/tiling/tiling_91093.h | 17 +++++- comm/lcal/include/lcoc/tiling/tiling_910B.h | 10 +++- comm/lcal/include/lcoc/tiling/tiling_args.h | 48 ++++++++++------ comm/lcal/include/lcoc/tiling/tiling_func.h | 3 +- 11 files changed, 242 insertions(+), 74 deletions(-) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index 77a20f24..f9125c3c 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -27,7 +27,6 @@ constexpr int64_t LCAL_INVALID_VALUE = -1; // shared buffer size,这里要和collectives.cce文件中的常量联动修改!!! 
constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; -constexpr int LCAL_FLAG_BUFF_BYTES = 4 * 1024 * 1024; constexpr int LCAL_COMM_BUFFER_SIZE = 200; // 单位MB enum class ChipName { @@ -68,17 +67,26 @@ enum class LcalType { ALL_GATHER = 3, BROADCAST = 4, ALL2ALL = 5, - ALL2ALL_V_C = 6, - GATHER = 7, + ALL_REDUCE_910B2C = 6, + ALL_GATHER_910B2C = 7, LOCAL_REDUCE = 8, SEND = 9, RECV = 10, + ALL2ALL_V_C = 11, + GATHER = 12, PURE_MATMUL = 101, MATMUL_ALL_REDUCE = 102, - MTE2_TEST = 108, + MATMUL_REDUCE_SCATTER = 103, + ALL_GATHER_MATMUL = 104, + ALL_GATHER_MATMUL_V2 = 105, + ALL2ALL_MATMUL = 106, + MATMUL_ALL2ALL = 107, ALL_GATHER_MATMUL_REDUCE_SCATTER = 111, BANDWIDTH = 201, - LCAL_TYPE_MAX = 311 + ALLTOALLV_ALLGATHER_MATMUL = 305, + ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN = 309, + MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN = 310, + LCAL_TYPE_MAX = 311, }; const std::map LCAL_TYPE2NAME = { @@ -88,17 +96,22 @@ const std::map LCAL_TYPE2NAME = { { LcalType::BROADCAST, "LcalBroadcast" }, { LcalType::PURE_MATMUL, "LcalPureMatmul" }, { LcalType::MATMUL_ALL_REDUCE, "LcalMatmulAllReduce" }, - { LcalType::MTE2_TEST, "LcalMTE2Test" }, + { LcalType::MATMUL_REDUCE_SCATTER, "LcalMatmulReduceScatter" }, + { LcalType::ALL_GATHER_MATMUL, "LcalAllGatherMatmul" }, + { LcalType::ALL_GATHER_MATMUL_V2, "LcalAllGatherMatmulV2" }, + { LcalType::ALL2ALL_MATMUL, "LcalAll2AllMatmul" }, + { LcalType::MATMUL_ALL2ALL, "LcalMatmulAll2All" }, { LcalType::ALL2ALL, "LcalAll2All" }, { LcalType::ALL2ALL_V_C, "LcalAll2AllVC" }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER, "LcalAllGatherMatmulReduceScatter" }, { LcalType::BANDWIDTH, "LcalBandwidthTest" }, - { LcalType::LOCAL_REDUCE, "LcalLocalReduce" }, - { LcalType::GATHER, "LcalGather" }, - { LcalType::SEND, "LcalSend" }, - { LcalType::RECV, "LcalRecv" } + { LcalType::ALL_REDUCE_910B2C, "LcalAllReduce910B2C" }, + { LcalType::ALL_GATHER_910B2C, "LcalAllGather910B2C" }, + { LcalType::ALLTOALLV_ALLGATHER_MATMUL, "LcalAllToAllVAllGatherMatmul" }, + { LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN, "LcalAllToAllVAllGatherMatmulHidden" }, + { LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN, "LcalMatmulReduceScatterAllToAllVHidden" } }; } // namespace Lcal -#endif // LCAL_TYPES_H \ No newline at end of file +#endif // LCAL_TYPES_H diff --git a/comm/lcal/include/lcoc/lcoc.h b/comm/lcal/include/lcoc/lcoc.h index 2bb01072..ea8b413e 100644 --- a/comm/lcal/include/lcoc/lcoc.h +++ b/comm/lcal/include/lcoc/lcoc.h @@ -23,9 +23,20 @@ public: explicit Lcoc(LcalComm *comm); ~Lcoc(); int SetParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc ¶mDesc); + int AllGatherMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream = nullptr); + int AllGatherMatmulV2(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream = nullptr); + int MatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, + aclrtStream stream = nullptr); int MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream = nullptr); + int PureMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream = nullptr); int AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, - void *workspace, aclrtStream stream = nullptr); + void *workspace, aclrtStream stream = nullptr); + int AllToAllVAllGatherMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, + aclrtStream stream = nullptr); + int AllToAllVAllGatherMatmulHidden(CoCInputPkg inputPkg, CoCOutputPkg 
outputPkg, void *workspace, + aclrtStream stream = nullptr); + int MatmulReduceScatterAllToAllVHidden(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, + aclrtStream stream = nullptr); int64_t GetWorkspaceSize(); LcalComm *GetComm(); MatMulInfo &GetMatMulInfo(); @@ -45,4 +56,4 @@ private: bool tilingSuccess_ = false; } } -#endif // LCAL_LCOC_H \ No newline at end of file +#endif // LCAL_LCOC_H diff --git a/comm/lcal/include/lcoc/lcoc_args.h b/comm/lcal/include/lcoc/lcoc_args.h index 6a7775d9..c62b169e 100644 --- a/comm/lcal/include/lcoc/lcoc_args.h +++ b/comm/lcal/include/lcoc/lcoc_args.h @@ -30,33 +30,33 @@ namespace Lcal { enum CoCDataTypeDesc : int { COC_DATA_TYPE_UNDEFINED = -1, - FP16FP16_FP32_FP16 = 0, - BF16BF16_FP32_BF16 = 1, - INT8INT8_INT32_FP16 = 2, - INT8INT8_INT32_BF16 = 3, - FP16INT8_INT32_FP16 = 4, - BF16INT8_INT32_BF16 = 5, - FP16INT8_FP32_FP16 = 6, - BF16INT8_FP32_BF16 = 7, - FP16INT4_FP32_FP16 = 8, - BF16INT4_FP32_BF16 = 9, + FP16FP16_FP32_FP16 = 0, // no quantization, no dequantization + BF16BF16_FP32_BF16 = 1, // no quantization, no dequantization + INT8INT8_INT32_FP16 = 2, // W8A8, quantization not fused, on-the-fly dequantization + INT8INT8_INT32_BF16 = 3, // W8A8, quantization not fused, AIV dequantization + FP16INT8_INT32_FP16 = 4, // W8A8, fused quantization, on-the-fly dequantization + BF16INT8_INT32_BF16 = 5, // W8A8, fused quantization, AIV dequantization + FP16INT8_FP32_FP16 = 6, // W8A16, fused fake quantization, no dequantization + BF16INT8_FP32_BF16 = 7, // W8A16, fused fake quantization, no dequantization + FP16INT4_FP32_FP16 = 8, // W4A16, fused fake quantization, no dequantization + BF16INT4_FP32_BF16 = 9, // W4A16, fused fake quantization, no dequantization COC_DATA_TYPE_DESC_MAX = 10, }; const std::map COC_TYPE2ELE_SIZE = { { FP16FP16_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16BF16_FP32_BF16, FP_BF_16_ELE_SIZE }, - { INT8INT8_INT32_FP16, INT8_ELE_SIZE }, { INT8INT8_INT32_BF16, INT8_ELE_SIZE }, - { FP16INT8_INT32_FP16, INT8_ELE_SIZE }, { BF16INT8_INT32_BF16, INT8_ELE_SIZE }, + { INT8INT8_INT32_FP16, INT8_ELE_SIZE }, { INT8INT8_INT32_BF16, INT8_ELE_SIZE }, + { FP16INT8_INT32_FP16, INT8_ELE_SIZE }, { BF16INT8_INT32_BF16, INT8_ELE_SIZE }, { FP16INT8_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16INT8_FP32_BF16, FP_BF_16_ELE_SIZE }, { FP16INT4_FP32_FP16, FP_BF_16_ELE_SIZE }, { BF16INT4_FP32_BF16, FP_BF_16_ELE_SIZE } }; const std::map COC_TYPE2HCCL_TYPE = { - { FP16FP16_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16BF16_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, + { FP16FP16_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16BF16_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, { INT8INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { INT8INT8_INT32_BF16, HCCL_DATA_TYPE_BFP16 }, { FP16INT8_INT32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_INT32_BF16, HCCL_DATA_TYPE_BFP16 }, - { FP16INT8_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, - { FP16INT4_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT4_FP32_BF16, HCCL_DATA_TYPE_BFP16 } + { FP16INT8_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT8_FP32_BF16, HCCL_DATA_TYPE_BFP16 }, + { FP16INT4_FP32_FP16, HCCL_DATA_TYPE_FP16 }, { BF16INT4_FP32_BF16, HCCL_DATA_TYPE_BFP16 } }; struct CoCParamDesc { @@ -64,8 +64,9 @@ namespace Lcal { MatMulInfo mmInfo = {}; QuantInfo quantInfo = {}; PostInfo postInfo = {}; - HcclReduceOp op = HCCL_REDUCE_SUM; + HcclReduceOp op = HCCL_REDUCE_SUM; // other values are not supported yet TwoDimTPInfo twoDimTPInfo = {}; + MoeInfo moeInfo = {}; }; struct CoCInputPkg { @@ -73,26 +74,33 @@ namespace Lcal { void *matrixB = nullptr; void *bias = nullptr; void *gamma = nullptr; - void *dequantScale = nullptr; - void *dequantOffset = nullptr; + void *dequantScale = nullptr; // dequantization parameters, required when pre-matmul fake quantization or post-matmul dequantization is fused + void *dequantOffset = nullptr; // optional; pass nullptr when there is no offset (e.g. symmetric quantization) - void *quantScale = nullptr; - void *quantOffset = nullptr; + void *quantScale = nullptr; 
// quantization parameters, required when the quantization step is fused + void *quantOffset = nullptr; // optional; pass nullptr when there is no offset (e.g. symmetric quantization) + void *num_local_tokens_per_expert = nullptr; + void *num_global_tokens_per_local_expert = nullptr; + void *global_tokens_per_expert_matrix = nullptr; }; struct CoCOutputPkg { void *output = nullptr; - void *midOutput = nullptr; + void *midOutput = nullptr; // intermediate communication result when communication runs before computation }; struct TaskParam { + // hardware info int32_t rank = -1; int32_t rankSize = -1; int32_t blockDim = -1; int32_t bufferSize = -1; ChipName chipName = ChipName::CHIP_910B3; + // param info CoCParamDesc cocParamDesc = {}; + + // type LcalType lcalType = LcalType::ALL_REDUCE; }; } -#endif \ No newline at end of file +#endif // LCAL_LCOC_ARGS_H diff --git a/comm/lcal/include/lcoc/lcoc_base.h b/comm/lcal/include/lcoc/lcoc_base.h index 7979501d..7bf4321c 100644 --- a/comm/lcal/include/lcoc/lcoc_base.h +++ b/comm/lcal/include/lcoc/lcoc_base.h @@ -37,22 +37,31 @@ struct MatMulInfo { bool weightNz = false; }; -struct TwoDimTPInfo { - int32_t agDim = -1; - int32_t rsDim = -1; - bool innerDimIsAg = true; +struct TwoDimTPInfo { // 2D-TP, covering communication along both the x axis and the y axis + int32_t agDim = -1; // number of cards on the ag axis; the x direction uses non-contiguous card IDs + int32_t rsDim = -1; // number of cards on the rs axis; the y direction uses contiguous card IDs + bool innerDimIsAg = true; // whether allgather communication runs along the inner axis }; struct QuantInfo { + // dequantization granularity (covers pre-matmul fake quantization and post-matmul dequantization) QuantGranularity dequantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED; int32_t dequantGroupSize = -1; - QuantGranularity quantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED; + QuantGranularity quantGranularity = QuantGranularity::QUANT_GRANULARITY_UNDEFINED; // quantization granularity int32_t quantGroupSize = -1; }; struct PostInfo { int32_t withRmsNorm = 0; }; + +struct MoeInfo { + int16_t local_expert_nums = 0; + int8_t EP = 0; + int8_t TP = 0; + int32_t maxOutputSize = -1; + int8_t isMoe = 0; +}; } -#endif \ No newline at end of file +#endif // LCAL_LCOC_BASE_H diff --git a/comm/lcal/include/lcoc/lcoc_func.h b/comm/lcal/include/lcoc/lcoc_func.h index b0c52dd1..35a95c97 100644 --- a/comm/lcal/include/lcoc/lcoc_func.h +++ b/comm/lcal/include/lcoc/lcoc_func.h @@ -19,6 +19,7 @@ #pragma once namespace Lcal { + // checks that the parameter lies in [min, max]; when max = -1, the valid range is [min, +∞) bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max); bool CheckParamScopeList(std::vector> paramCheckList); bool CheckParamAlign(const std::string &name, const int &value, const int &align); @@ -26,4 +27,4 @@ namespace Lcal { bool CheckParamPowerOfTwo(const std::string &name, int value); } -#endif \ No newline at end of file +#endif // LCAL_LCOC_FUNC_H \ No newline at end of file diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h index 8385f413..90f69e54 100644 --- a/comm/lcal/include/lcoc/lcoc_workspace.h +++ b/comm/lcal/include/lcoc/lcoc_workspace.h @@ -15,7 +15,7 @@ #define GM_ADDR int64_t #endif -struct LcalWorkspaceInfo { +struct LcalWorkspaceInfo { // records offsets; the host side starts at 0, the device side at gm_workspace GM_ADDR gm_reducebuf{ 0 }; GM_ADDR gm_a_align{ 0 }; GM_ADDR gm_b_align{ 0 }; @@ -23,12 +23,26 @@ struct LcalWorkspaceInfo { GM_ADDR gm_formate_dequant_scale{ 0 }; GM_ADDR gm_dequant_param{ 0 }; - GM_ADDR workspaceSize {0}; + // moe + GM_ADDR gm_out_loop_per_expert{ 0 }; + GM_ADDR gm_in_loop_per_expert{ 0 }; + GM_ADDR gm_out_loop_per_EP{ 0 }; + GM_ADDR gm_in_loop_per_EP{ 0 }; + GM_ADDR gm_sum_num_local_tokens_per_expert{ 0 }; + GM_ADDR gm_sum_num_global_tokens_per_local_expert{ 0 }; + GM_ADDR gm_in_expert_comm_count_accum{ 0 }; + GM_ADDR 
gm_out_expert_comm_count_accum{ 0 }; + + GM_ADDR gm_num_local_tokens_per_expert{ 0 }; + GM_ADDR gm_num_global_tokens_per_local_expert{ 0 }; + GM_ADDR comm_matrix_trunc{ 0 }; + + GM_ADDR workspaceSize {0}; // total size }; inline __aicore__ int32_t AlignUp(int32_t len, int32_t size) { - return (len + size -1) & ~(size - 1); + return (len + size - 1) & ~(size - 1); } #if !defined(__DAV_C220_VEC__) && !defined(__DAV_M200_VEC__) && !defined(__DAV_C220_CUBE__) && !defined(__DAV__C310__) @@ -40,24 +54,44 @@ inline uint64_t GetDequantWorkSpaceSize(Lcal::LcalType lcalType, int32_t withSer uint64_t dequantWorkSpaceSize = 0; if (withSerialMode > 0) { dequantWorkSpaceSize = (maxOutputSize == -1 ? m : maxOutputSize) * n * sizeof(int32_t); + if (lcalType == Lcal::LcalType::ALL_GATHER_MATMUL) { + dequantWorkSpaceSize *= rankSize; + } } else { - if (lcalType == Lcal::LcalType::MATMUL_ALL_REDUCE) { + if (lcalType == Lcal::LcalType::MATMUL_ALL_REDUCE || lcalType == Lcal::LcalType::MATMUL_REDUCE_SCATTER) { dequantWorkSpaceSize = pValue * blockDim * m0 * n0 * TWO * sizeof(int32_t); } else { dequantWorkSpaceSize = (maxOutputSize == -1 ? m : maxOutputSize) * n * sizeof(int32_t); + if (lcalType == Lcal::LcalType::ALL_GATHER_MATMUL) { + dequantWorkSpaceSize *= rankSize; + } } } return dequantWorkSpaceSize; } #endif +inline __aicore__ void GetLcalMoeWorkspaceInfo(LcalWorkspaceInfo& lcalWorkspaceInfo, GM_ADDR& workspaceOffset, + int32_t m, bool hasDequantParam = false, int32_t is_alltoallvc = false, + int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1) +{ + constexpr int32_t ALIGN8 = 8; + if (hasDequantParam) { + lcalWorkspaceInfo.gm_dequant_param = workspaceOffset; + workspaceOffset += sizeof(float) * AlignUp(m * EP, ALIGN8); + } + lcalWorkspaceInfo.comm_matrix_trunc = workspaceOffset; + workspaceOffset += sizeof(int32_t) * EP * EP * expertPerRank; +} + inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, int32_t batchSize, int32_t m, int32_t k, int32_t n, int32_t mAlign, int32_t kAlign, int32_t nAlign, bool transa, bool transb, int32_t mmadSize, bool hasAAlign, bool hasBAlign, int32_t accumRankSize, bool hasAccum = false, uint64_t dequantWorkSpaceSize = 0, bool hasDequantParam = false, bool hasFormatDequantScale = false, bool isDeterministic = false, int32_t isMoe = false, int32_t is_alltoallvc = false, - int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1) + int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1 +) { if (outputSize == -1) { outputSize = m; @@ -78,14 +112,19 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, in if (hasBAlign) { lcalWorkspaceInfo.gm_b_align = workspaceOffset; workspaceOffset += static_cast(batchSize) * (transb ? n * kAlign : k * nAlign) * mmadSize * - (expertPerRank <= 0 ? 1 : expertPerRank); + (expertPerRank <= 0 ? 
1 : expertPerRank); + } + + if (isMoe) { + GetLcalMoeWorkspaceInfo(lcalWorkspaceInfo, workspaceOffset, m, hasDequantParam, is_alltoallvc, EP, + expertPerRank, outputSize); } if (!isMoe && hasDequantParam) { lcalWorkspaceInfo.gm_dequant_param = workspaceOffset; workspaceOffset += sizeof(int32_t) * AlignUp(n, ALIGN8); } - + if (hasFormatDequantScale) { lcalWorkspaceInfo.gm_formate_dequant_scale = workspaceOffset; workspaceOffset += sizeof(float) * AlignUp(n, ALIGN8); @@ -99,4 +138,5 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, in return lcalWorkspaceInfo; } + #endif \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/comm/lcal/include/lcoc/tiling/tiling.h index 65c8d8c2..d6aa177b 100644 --- a/comm/lcal/include/lcoc/tiling/tiling.h +++ b/comm/lcal/include/lcoc/tiling/tiling.h @@ -51,6 +51,33 @@ public: void GetDefaultTiling(const TaskParam &taskParam) override; }; +class CoCMatmulReduceScatterTilingFunc : public CoCMatmulAllReduceTilingFunc { +public: + CoCMatmulReduceScatterTilingFunc(const CoCMatmulReduceScatterTilingFunc &) = delete; + CoCMatmulReduceScatterTilingFunc &operator = (const CoCMatmulReduceScatterTilingFunc &) = delete; + CoCMatmulReduceScatterTilingFunc() {} + bool CheckTiling(const TaskParam &taskParam) override; + void GetDefaultTiling(const TaskParam &taskParam) override; +}; + +class CoCAllGatherMatmulTilingFunc : public CoCTilingFunc { +public: + CoCAllGatherMatmulTilingFunc(const CoCAllGatherMatmulTilingFunc &) = delete; + CoCAllGatherMatmulTilingFunc &operator = (const CoCAllGatherMatmulTilingFunc &) = delete; + CoCAllGatherMatmulTilingFunc() {} + bool CheckTiling(const TaskParam &taskParam) override; + void GetDefaultTiling(const TaskParam &taskParam) override; +}; + +class CoCAllGatherMatmulV2TilingFunc : public CoCTilingFunc { +public: + CoCAllGatherMatmulV2TilingFunc(const CoCAllGatherMatmulV2TilingFunc &) = delete; + CoCAllGatherMatmulV2TilingFunc &operator = (const CoCAllGatherMatmulV2TilingFunc &) = delete; + CoCAllGatherMatmulV2TilingFunc() {} + bool CheckTiling(const TaskParam &taskParam) override; + void GetDefaultTiling(const TaskParam &taskParam) override; +}; + class CoCAllgatherMatmulReduceScatterTilingFunc : public CoCTilingFunc { public: CoCAllgatherMatmulReduceScatterTilingFunc(const CoCAllgatherMatmulReduceScatterTilingFunc &) = delete; @@ -59,6 +86,33 @@ public: bool CheckTiling(const TaskParam &taskParam) override; void GetDefaultTiling(const TaskParam &taskParam) override; }; -} +class CoCAllToAllAllGatherMatmulTilingFunc : public CoCAllGatherMatmulTilingFunc { +public: + CoCAllToAllAllGatherMatmulTilingFunc(const CoCAllToAllAllGatherMatmulTilingFunc &) = delete; + CoCAllToAllAllGatherMatmulTilingFunc &operator = (const CoCAllToAllAllGatherMatmulTilingFunc &) = delete; + CoCAllToAllAllGatherMatmulTilingFunc() {} + bool CheckTiling(const TaskParam &tilingInfo) override; + void GetDefaultTiling(const TaskParam &tilingInfo) override; +}; +class CoCAllToAllAllGatherMatmulHiddenTilingFunc : public CoCAllGatherMatmulTilingFunc { +public: + CoCAllToAllAllGatherMatmulHiddenTilingFunc(const CoCAllToAllAllGatherMatmulHiddenTilingFunc &) = delete; + CoCAllToAllAllGatherMatmulHiddenTilingFunc &operator = ( + const CoCAllToAllAllGatherMatmulHiddenTilingFunc &) = delete; + CoCAllToAllAllGatherMatmulHiddenTilingFunc() {} + bool CheckTiling(const TaskParam &tilingInfo) override; + void GetDefaultTiling(const TaskParam &tilingInfo) override; +}; + +class CoCMatmulReduceScatterAllToAllHiddenTilingFunc 
: public CoCMatmulReduceScatterTilingFunc { +public: + CoCMatmulReduceScatterAllToAllHiddenTilingFunc(const CoCMatmulReduceScatterAllToAllHiddenTilingFunc &) = delete; + CoCMatmulReduceScatterAllToAllHiddenTilingFunc &operator = ( + const CoCMatmulReduceScatterAllToAllHiddenTilingFunc &) = delete; + CoCMatmulReduceScatterAllToAllHiddenTilingFunc() {} + bool CheckTiling(const TaskParam &tilingInfo) override; + void GetDefaultTiling(const TaskParam &tilingInfo) override; +}; -#endif // LCAL_TILING_H \ No newline at end of file +} +#endif // LCAL_TILING_H diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/comm/lcal/include/lcoc/tiling/tiling_91093.h index 9e3764c8..d977d253 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_91093.h +++ b/comm/lcal/include/lcoc/tiling/tiling_91093.h @@ -13,10 +13,25 @@ #include "tiling_args.h" namespace Lcal { + void AllGatherNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData); + void AllGatherNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); + void AllGatherNPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData); + void AllGatherNPU91093TwoRankINT8Tiling(CoCTilingData &cocTilingData); + + void AllGatherV2NPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData); + void AllGatherV2NPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); + void AllGatherV2NPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData); + void AllReduceNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData); void AllReduceNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); + void ReduceScatterNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData); + void ReduceScatterNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData); + void ReduceScatterNPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData); + void ReduceScatterNPU91093TwoRankINT8Tiling(CoCTilingData &cocTilingData); + void ReduceScatterNPU91093FourRankFP16Tiling(CoCTilingData &cocTilingData); + void CoCAllgatherMatmulReduceScatterAgEightRsTwoTiling(CoCTilingData &cocTilingData); void CoCAllgatherMatmulReduceScatterDefaultTiling(CoCTilingData &cocTilingData, int32_t rsDim); } -#endif // LCAL_TILING_91093_H \ No newline at end of file +#endif // LCAL_TILING_91093_H diff --git a/comm/lcal/include/lcoc/tiling/tiling_910B.h b/comm/lcal/include/lcoc/tiling/tiling_910B.h index 59e684f4..ca6efab2 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_910B.h +++ b/comm/lcal/include/lcoc/tiling/tiling_910B.h @@ -13,6 +13,13 @@ #include "tiling_args.h" namespace Lcal { + void AllGatherGetDefaultTiling(CoCTilingData &cocTilingData); + void AllGatherEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); + void AllGatherFourRankINT8Tiling(CoCTilingData &cocTilingData); + + void AllGatherV2EightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); + void AllGatherV2EightRankFP16Core16GetDefaultTiling(CoCTilingData &cocTilingData); + void AllReduceGetDefaultTiling(CoCTilingData &cocTilingData); void AllReduceFourRankInt8GetDefaultTiling(CoCTilingData &cocTilingData); void AllReduceFourRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); @@ -22,6 +29,5 @@ namespace Lcal { void ReduceScatterEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData); void ReduceScatterFourRankINT8Tiling(CoCTilingData &cocTilingData); - } -#endif \ No newline at end of file +#endif // LCAL_TILING_910B_H \ No newline at end of file diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/comm/lcal/include/lcoc/tiling/tiling_args.h index 8d08c6d0..f46e0477 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_args.h 
+++ b/comm/lcal/include/lcoc/tiling/tiling_args.h @@ -20,9 +20,9 @@ namespace Lcal { constexpr int32_t HBM_BM = 1; constexpr int32_t L2_BW = 5; constexpr int32_t BYTE_512 = 512; - constexpr int32_t MAX_UB_NUM = 97280; + constexpr int32_t MAX_UB_NUM = 97280; // 190 * 1024 / 2 constexpr int32_t MIN_UB_NUM = 256; - constexpr int32_t A3_DIE_NUM = 2; + constexpr int32_t A3_DIE_NUM = 2; // one card has two dies constexpr int32_t DEFAULT_P_VALUE = 1; constexpr int32_t MIN_P_VALUE = 1; constexpr int32_t MAX_P_VALUE = 15; @@ -43,8 +43,8 @@ namespace Lcal { constexpr int32_t COMMDATASPLIT_FOUR = 4; constexpr int32_t COMMDATASPLIT_EIGHT = 8; constexpr int32_t COMMDATASPLIT_SIXTEEN = 16; - constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; - constexpr int32_t AXES_ALIGN_SIZE_INT8 = 128; + constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; // 2.5MB + constexpr int32_t AXES_ALIGN_SIZE_INT8 = 128; constexpr int32_t DEFAULT_ROW = 128; constexpr int32_t DEFAULT_COL = 256; constexpr int32_t AXES_ALIGN_SIZE = 512; @@ -62,7 +62,7 @@ namespace Lcal { constexpr int32_t CUBE_BLOCK_SIZE = 256; constexpr int32_t MIN_UB_MOVE_NUM = 5120; constexpr int32_t VALID_UB_MOVE_NUM = 20480; - constexpr int32_t L1AB_PINGPONG_BUFFER_LEN_FP16 = 131072; + constexpr int32_t L1AB_PINGPONG_BUFFER_LEN_FP16 = 131072; // 128 KB constexpr int32_t HALF_KBYTE = 512; constexpr int32_t SECOND_TO_MS = 1e3; constexpr int64_t MATMUL_BASE_100US = static_cast(1024) * 8192 * 1024; @@ -83,14 +83,22 @@ namespace Lcal { constexpr int32_t RANKSIZE_SIXTEEN = 16; constexpr int32_t DIV_TWO = 2; constexpr int32_t LENPERLOOP_DEFAULT = 5120; - constexpr int32_t ALLREDUCE_LENPERLOOP_DEFAULT = 5120; + constexpr int32_t ALLGATHERV2_CORENUM_SIXTEEN = 16; + constexpr int32_t ALLREDUCE_LENPERLOOP_DEFAULT = 5120; // value used when 16 cores are in use constexpr int32_t TREE_LEN_PER_LOOP = 20480; constexpr int32_t DIM_EIGHT = 8; constexpr int32_t DIM_TWO = 2; constexpr int32_t DEFAULT_SPLIT_K = 0; constexpr int32_t NUM_TWO = 2; + // Todo: tmp hard code, need tiling func for moe + constexpr int32_t AllTOAll_HIDDEN_UBMOVENUM = 28672; + + + // all default values are -1 struct CoCTiling { + // tiling parameters that control the fused operator's execution strategy + // may be passed in externally or computed internally int32_t m0 = -1; int32_t k0 = -1; int32_t n0 = -1; @@ -103,46 +111,50 @@ namespace Lcal { int32_t commDirect = -1; int32_t lenPerLoop = -1; int32_t extraUbMoveNum = -1; - int32_t extraCommNpuSplit = -1; - int32_t extraCommDataSplit = -1; - int32_t extraCommDirect = -1; - int32_t extraLenPerLoop = -1; + int32_t extraCommNpuSplit = -1; // used by 2D-TP + int32_t extraCommDataSplit = -1; // used by 2D-TP + int32_t extraCommDirect = -1; // used by 2D-TP + int32_t extraLenPerLoop = -1; // used by 2D-TP int32_t splitK = -1; int32_t write2OtherRank = -1; int32_t withSerialMode = -1; - + // parameters that control the fused operator's implementation int32_t is91093 = -1; int32_t bufferSize = -1; }; struct CoCTilingData : CoCTiling { + // parameters passed in externally int64_t m = -1; int64_t k = -1; int64_t n = -1; int64_t batchSize = -1; + // NPU-related parameters int32_t blockDim = -1; int32_t rank = -1; int32_t rankSize = -1; - int32_t tag = -1; + int32_t tag = -1; // default value is 0 + // parameters computed internally int32_t mLoop = -1; int32_t kLoop = -1; int32_t nLoop = -1; int32_t coreLoop = -1; uint32_t tilingKey = -1; + // Tiling Func const char* ToString() const; - void SetDefaultValue(); + void SetDefaultValue(); // set default values }; struct CoCKernelParam { CoCTilingData cocTilingData = {}; - QuantInfo quantInfo = {}; - TwoDimTPInfo twoDimTPInfo = {}; - PostInfo postInfo = {}; + QuantInfo quantInfo = {}; // corresponds to fields 23-26 on the device side + TwoDimTPInfo twoDimTPInfo = {}; // corresponds to fields 27-29 on the device side + PostInfo postInfo = {}; // corresponds to field 30 on the device side + MoeInfo 
moeInfo = {}; // corresponds to field 31 on the device side bool weightNz = false; }; - } -#endif \ No newline at end of file +#endif // LCAL_TILING_ARGS_H diff --git a/comm/lcal/include/lcoc/tiling/tiling_func.h b/comm/lcal/include/lcoc/tiling/tiling_func.h index ee10892b..111e2b02 100644 --- a/comm/lcal/include/lcoc/tiling/tiling_func.h +++ b/comm/lcal/include/lcoc/tiling/tiling_func.h @@ -45,7 +45,6 @@ namespace Lcal { void CalTilingParam(const MatMulInfo &mmInfo, CoCTilingData &tilingData); void SetTilingInputParam(const TaskParam &taskParam, CoCTilingData &tilingData); void SetTilingData(const TaskParam &taskParam, const CoCTiling &tiling, CoCTilingData &tilingData); - } -#endif \ No newline at end of file +#endif // LCAL_TILING_FUNC_H -- Gitee From 0a960f9582e2f6bc6bee0fc1b5fe1490c98e4624 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 16:55:06 +0800 Subject: [PATCH 390/414] 9 --- src/include/atb/runner/lcal_runner.h | 2 +- src/include/atb/runner/lccl_runner.h | 2 +- src/include/atb/runner/lcoc_runner.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/atb/runner/lcal_runner.h b/src/include/atb/runner/lcal_runner.h index f27f09f7..f684ebd5 100644 --- a/src/include/atb/runner/lcal_runner.h +++ b/src/include/atb/runner/lcal_runner.h @@ -10,7 +10,7 @@ #ifndef ATB_LCAL_RUNNER_H #define ATB_LCAL_RUNNER_H -#include +#include #include #include #include "atb/runner/runner.h" diff --git a/src/include/atb/runner/lccl_runner.h b/src/include/atb/runner/lccl_runner.h index c96188cd..2f2adaaa 100644 --- a/src/include/atb/runner/lccl_runner.h +++ b/src/include/atb/runner/lccl_runner.h @@ -10,7 +10,7 @@ #ifndef ATB_LCCL_RUNNER_H #define ATB_LCCL_RUNNER_H -#include +#include #include #include "atb/runner/runner.h" #include "atb/runner/lcal_runner.h" diff --git a/src/include/atb/runner/lcoc_runner.h b/src/include/atb/runner/lcoc_runner.h index 0dba0e6e..b181bcfd 100644 --- a/src/include/atb/runner/lcoc_runner.h +++ b/src/include/atb/runner/lcoc_runner.h @@ -10,7 +10,7 @@ #ifndef ATB_LCOC_RUNNER_H #define ATB_LCOC_RUNNER_H -#include +#include #include #include "atb/runner/runner.h" #include "atb/runner/lcal_runner.h" -- Gitee From 4023a67fa748c43a082498368190040ef0043c43 Mon Sep 17 00:00:00 2001 From: LiuHaoyu Date: Sat, 30 Aug 2025 17:10:30 +0800 Subject: [PATCH 391/414] 1 --- comm/lcal/include/lcal_types.h | 1 + 1 file changed, 1 insertion(+) diff --git a/comm/lcal/include/lcal_types.h b/comm/lcal/include/lcal_types.h index f9125c3c..104b3622 100644 --- a/comm/lcal/include/lcal_types.h +++ b/comm/lcal/include/lcal_types.h @@ -27,6 +27,7 @@ constexpr int64_t LCAL_INVALID_VALUE = -1; // shared buffer size,这里要和collectives.cce文件中的常量联动修改!!! 
constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; +constexpr int LCAL_FLAG_BUFF_BYTES = 4 * 1024 * 1024; constexpr int LCAL_COMM_BUFFER_SIZE = 200; // 单位MB enum class ChipName { -- Gitee From 594fd22a07148d63e3a970289ebea340ca0d7068 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Sat, 30 Aug 2025 18:05:20 +0800 Subject: [PATCH 392/414] fix --- CMakeLists.txt | 3 +-- comm/lcal/include/comm_args.h | 2 +- comm/lcal/src/CMakeLists.txt | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cac9a5d..a3c55e35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,9 +116,8 @@ if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() -set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_{cxx_abi}/lcal") +set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") add_subdirectory(comm/lcal) -set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_{cxx_abi}") message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index c8be9c47..1dbeae00 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -7,7 +7,7 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ - +#pragma GCC diagnostic ignored "-Wunused-parameter" #ifndef LCCL_COMM_ARGS_H #define LCCL_COMM_ARGS_H #include diff --git a/comm/lcal/src/CMakeLists.txt b/comm/lcal/src/CMakeLists.txt index 400edecd..5f326236 100644 --- a/comm/lcal/src/CMakeLists.txt +++ b/comm/lcal/src/CMakeLists.txt @@ -51,8 +51,8 @@ set_source_files_properties( PROPERTIES OBJECT_DEPENDS ${LCAL_CCE_PATH} ) -install(TARGETS lcal LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -install(TARGETS lcal_static DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(TARGETS lcal LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/lcal) +install(TARGETS lcal_static DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/lcal) -- Gitee From 5a6127e02adb474684edf71fe6121d1f1924205d Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 16:03:34 +0800 Subject: [PATCH 393/414] fix --- tests/apitest/opstest/csv/linear_parallel.csv | 43 +------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index ae51e1c4..443a1392 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -41,15 +41,6 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 40|llama_65bCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 41|llama_65bCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
42|llama_65bCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -43|NoErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -44|NoErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|28,2,1024;8,1024|1|float16|nd|14,2,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -45|IErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,59;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||I:ERROR_INVALID_TENSOR_DIM -46|SErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM -47|NoErrorCase0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -48|NoErrorCase1AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,4,1024;8,1024|1|float16|nd|4,4,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -49|IErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,33;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM -50|SErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM -51|NoErrorCase0AllGatherLinearV2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"keepIntermediate":true}|2|float16;float16|nd;nd|2,16;32,16|2|float16;float16|nd;nd|4,32;4,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 52|NoErrorCase0MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;1,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR 53|NoErrorCase1MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|8,2;4,2;1,4;1,4|1|bf16|nd|8,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 
54|NoErrorCase2MatmulAllReduceDequantWithoutBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;0;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR @@ -57,28 +48,13 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 56|NoErrorCase4MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 57|NoErrorCase5MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|bf16;int8;bf16;bf16;bf16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|bf16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR 58|NoErrorCase6MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|float16;int8;float16;float16;float16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR -59|NoErrorCase0PureMatmul|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -60|NoErrorCase1PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR -61|NoErrorCase2PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR -62|DimCheckFailPureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1,4;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM -63|PureMatmulW8A8Fp16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -64|PureMatmulW8A8Bf16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 
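Cases 63 and 64 above (and 65/66 below) select the output element type via outDataType; the codes line up with the aclDataType numbering, which the OutDType column corroborates (1 next to float16, 27 next to bf16). A hedged decode covering only the two codes these rows exercise:

    #include <cstdio>

    // Assumption: outDataType follows aclDataType numbering; only the codes
    // visible in these rows are handled.
    const char *OutDTypeName(int outDataType) {
        switch (outDataType) {
            case 1:  return "float16";
            case 27: return "bf16";
            default: return "unknown (not exercised by these cases)";
        }
    }

    int main() {
        std::printf("%s %s\n", OutDTypeName(1), OutDTypeName(27));
        return 0;
    }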
-65|PureMatmulW8A8Fp16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM -68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM 69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
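In the W8A8 per-channel rows (65/66), the two trailing inputs of shape [4] apparently carry per-output-channel dequant parameters for an n = 4 matmul. As a rough sketch of per-channel dequantization — with hypothetical fp32 scales, since the rows pass an int64 input that is presumably a packed scale format not described in this file — each output column gets its own multiplier:

    #include <cstdint>
    #include <vector>

    // Per-channel dequant sketch: scale the int32 accumulators column-wise.
    std::vector<float> DequantPerChannel(const std::vector<int32_t> &acc,
                                         const std::vector<float> &scale,
                                         int64_t m, int64_t n) {
        std::vector<float> out(m * n);
        for (int64_t i = 0; i < m; ++i) {
            for (int64_t j = 0; j < n; ++j) {
                out[i * n + j] = static_cast<float>(acc[i * n + j]) * scale[j];
            }
        }
        return out;
    }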
-77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE +780|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 83|NoErrorCase1AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|2048,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR @@ -86,20 +62,3 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 85|NoErrorCase3AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|512,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 86|IErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":1}}|2|float16;float16|nd;nd|32,16;32,20|1|float16|nd|16,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 87|SErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|1024,32|customize;customize|-1,1;-1,1||||||Ascend910B|S:ERROR_INVALID_TENSOR_DIM 
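The type 4 (AllGatherLinearReduceScatter) cases above encode the 2D-TP row rule: the gather multiplies the first dimension by agDim and the scatter divides it by rsDim, so out_rows = m * agDim / rsDim. A quick check against the visible rows (an illustration of the CSV expectations, not the operator's actual shape inference):

    #include <cassert>
    #include <cstdint>

    // Expected first output dim for type 4: gather by agDim, scatter by rsDim.
    int64_t OutRows(int64_t m, int64_t agDim, int64_t rsDim) {
        return m * agDim / rsDim;
    }

    int main() {
        assert(OutRows(2, 4, 2) == 4);       // case 82: 2,16 -> 4,32
        assert(OutRows(1024, 4, 2) == 2048); // case 83: 1024,64 -> 2048,32
        assert(OutRows(1024, 2, 4) == 512);  // case 85: 1024,64 -> 512,32
        return 0;
    }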
-88|AllGatherMatmulInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -89|AllGatherMatmulInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -90|MatmulReducescatterInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -91|MatmulReducescatterInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR -92|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -93|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -94|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -95|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH 
-96|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -97|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -98|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM -99|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -100|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,48;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -101|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|32768,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM 
-103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM -104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From e2b8f2d5eb5952f8e2f8f227404dc8bea9c21aaf Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 17:34:52 +0800 Subject: [PATCH 394/414] fix --- comm/lcal/include/comm_args.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index c8be9c47..1dbeae00 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -7,7 +7,7 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ - +#pragma GCC diagnostic ignored "-Wunused-parameter" #ifndef LCCL_COMM_ARGS_H #define LCCL_COMM_ARGS_H #include -- Gitee From 21075cd4d081dd2c23debe85eae11af626c94fd6 Mon Sep 17 00:00:00 2001 From: guanguan Date: Sat, 30 Aug 2025 18:15:53 +0800 Subject: [PATCH 395/414] fix --- CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cac9a5d..f71a4d3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,16 +116,13 @@ if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() -set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_{cxx_abi}/lcal") +set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") add_subdirectory(comm/lcal) -set(CMAKE_INSTALL_PREFIX "{CMAKE_SOURCE_DIR}/output/atb/cxx_abi_{cxx_abi}") message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) 
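PATCH 396 below adds zero-divisor guards ahead of '%' and '/' on caller-supplied alignments in lcoc.cpp and lcoc_func.cpp. A standalone sketch of the pattern with hypothetical names; one detail worth making explicit is that in the int64_t-returning helper the hunk's `return false;` is legal C++ and converts to 0, so the sketch states that failure value directly:

    #include <cstdint>

    bool IsAlignedSafe(int64_t value, int64_t align) {
        if (align == 0) {
            return false; // fail the check rather than divide by zero
        }
        return value % align == 0;
    }

    int64_t AlignUpSafe(int64_t value, int64_t align) {
        if (align == 0) {
            return 0; // explicit failure value for a size-typed result
        }
        return (value + align - 1) / align * align;
    }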
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal.so DESTINATION lib) -install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/liblcal_static.a DESTINATION lib) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libasdops_aicpu_kernels.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libtbe_adapter.so DESTINATION lib OPTIONAL) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/asdops/lib/libcann_ops_adapter.so DESTINATION lib OPTIONAL) -- Gitee From bff40aaeac1de8247336ee1371127d3dfc98d2b5 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 10:06:38 +0800 Subject: [PATCH 396/414] cleancode --- comm/lcal/src/lcoc.cpp | 3 +++ comm/lcal/src/lcoc_func.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index 7a6dcfc1..a4b3188d 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -264,6 +264,9 @@ void Lcoc::GetTiling(CoCTiling &tiling) bool IsMatrixAligned(const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign) { + if (nElemAlign == 0) { + return false; + } return (transpose ? m : n) % nElemAlign == 0; } diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index 8d5e6604..ad7927b3 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -45,6 +45,9 @@ namespace Lcal { bool CheckParamAlign(const std::string &name, const int &value, const int &align) { + if (align == 0) { + return false; + } if (value % align != 0) { MKI_LOG(ERROR) << "The " << name << ":" << value << " must be aligned by " << align << "!"; return false; @@ -73,6 +76,9 @@ namespace Lcal { int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign) { + if (nElemAlign == 0) { + return false; + } int64_t nRow = transpose ? n : m; int64_t nCol = transpose ? 
m : n; int64_t nColAlign = (nCol + nElemAlign - 1) / nElemAlign * nElemAlign; -- Gitee From 9296730b8aca5c48c6c0df101d18223bc9bfd5d6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 1 Sep 2025 10:35:10 +0800 Subject: [PATCH 397/414] revert --- comm/lcal/src/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/CMakeLists.txt b/comm/lcal/src/CMakeLists.txt index 5f326236..400edecd 100644 --- a/comm/lcal/src/CMakeLists.txt +++ b/comm/lcal/src/CMakeLists.txt @@ -51,8 +51,8 @@ set_source_files_properties( PROPERTIES OBJECT_DEPENDS ${LCAL_CCE_PATH} ) -install(TARGETS lcal LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/lcal) -install(TARGETS lcal_static DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/lcal) +install(TARGETS lcal LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(TARGETS lcal_static DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -- Gitee From 7a2c2573c61323cfd99e5e87917886b562c5cde5 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 10:58:16 +0800 Subject: [PATCH 398/414] fix --- .../linear_parallel_lcoc_runner.cpp | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp index 3d3ebdc8..38d5c51d 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -29,35 +29,35 @@ LinearParallelLcocRunner::LinearParallelLcocRunner(const infer::LinearParallelPa isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; break; - case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: - lcalType_ = Lcal::LcalType::MATMUL_REDUCE_SCATTER; - isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - break; - case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: - lcalType_ = - param_.keepIntermediate ? Lcal::LcalType::ALL_GATHER_MATMUL_V2 : Lcal::LcalType::ALL_GATHER_MATMUL; - isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - break; + // case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: + // lcalType_ = Lcal::LcalType::MATMUL_REDUCE_SCATTER; + // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + // break; + // case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: + // lcalType_ = + // param_.keepIntermediate ? 
Lcal::LcalType::ALL_GATHER_MATMUL_V2 : Lcal::LcalType::ALL_GATHER_MATMUL; + // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + // break; case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR_REDUCE_SCATTER: lcalType_ = Lcal::LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; break; - case infer::LinearParallelParam::ParallelType::PURE_LINEAR: - lcalType_ = Lcal::LcalType::PURE_MATMUL; - isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - break; - case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: - lcalType_ = Lcal::LcalType::ALLTOALLV_ALLGATHER_MATMUL; - isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && - param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - break; - case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: - lcalType_ = Lcal::LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN; - isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && - param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - break; + // case infer::LinearParallelParam::ParallelType::PURE_LINEAR: + // lcalType_ = Lcal::LcalType::PURE_MATMUL; + // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + // break; + // case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: + // lcalType_ = Lcal::LcalType::ALLTOALLV_ALLGATHER_MATMUL; + // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && + // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + // break; + // case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: + // lcalType_ = Lcal::LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN; + // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && + // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + // break; default: ATB_LOG(ERROR) << GetLogPrefix() << "UnSupported type: " << param_.type; } -- Gitee From 8f3bf406efd74c4986af8c6e339860c62d64f847 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 11:21:03 +0800 Subject: [PATCH 399/414] fix --- .../linear_parallel_lcoc_runner.cpp | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp index 38d5c51d..0a7e2c46 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -137,21 +137,21 @@ Status LinearParallelLcocRunner::SetupImpl(RunnerVariantPack &runnerVariantPack) } } } - if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || - param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - Lcal::MoeInfo moeInfo{.local_expert_nums = param_.moeInfo.localExpertNums, - .EP = param_.moeInfo.epSize, - .TP = param_.moeInfo.tpSize, - .maxOutputSize = -1, - .isMoe = 1}; - coCParamDesc.moeInfo = moeInfo; - if (param_.type == 
infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM) { - coCParamDesc.moeInfo.maxOutputSize = - runnerVariantPack.inTensors.at(runnerVariantPack.inTensors.size() - 1).desc.shape.dims[0]; - } else if (param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - coCParamDesc.moeInfo.maxOutputSize = runnerVariantPack.inTensors.at(0).desc.shape.dims[0]; - } - } + // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || + // param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + // Lcal::MoeInfo moeInfo{.local_expert_nums = param_.moeInfo.localExpertNums, + // .EP = param_.moeInfo.epSize, + // .TP = param_.moeInfo.tpSize, + // .maxOutputSize = -1, + // .isMoe = 1}; + // coCParamDesc.moeInfo = moeInfo; + // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM) { + // coCParamDesc.moeInfo.maxOutputSize = + // runnerVariantPack.inTensors.at(runnerVariantPack.inTensors.size() - 1).desc.shape.dims[0]; + // } else if (param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + // coCParamDesc.moeInfo.maxOutputSize = runnerVariantPack.inTensors.at(0).desc.shape.dims[0]; + // } + // } int ret = lcoc_->SetParam(lcalType_, {}, coCParamDesc); if (ret != 0) { ATB_LOG(ERROR) << GetLogPrefix() << "SetParam failed, ret : " << ret; @@ -181,35 +181,35 @@ Status LinearParallelLcocRunner::LaunchKernel(Lcal::CoCInputPkg inputPkg, Lcal:: ret = lcoc_->MatmulAllReduce(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; - case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: - ret = lcoc_->MatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - GetExecuteStream(runnerVariantPack.context)); - break; - case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: - if (param_.keepIntermediate) { - ret = lcoc_->AllGatherMatmulV2(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - GetExecuteStream(runnerVariantPack.context)); - break; - } - ret = lcoc_->AllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - GetExecuteStream(runnerVariantPack.context)); - break; + // case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: + // ret = lcoc_->MatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // GetExecuteStream(runnerVariantPack.context)); + // break; + // case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: + // if (param_.keepIntermediate) { + // ret = lcoc_->AllGatherMatmulV2(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // GetExecuteStream(runnerVariantPack.context)); + // break; + // } + // ret = lcoc_->AllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // GetExecuteStream(runnerVariantPack.context)); + // break; case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR_REDUCE_SCATTER: ret = lcoc_->AllGatherMatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; - case infer::LinearParallelParam::ParallelType::PURE_LINEAR: - ret = lcoc_->PureMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - GetExecuteStream(runnerVariantPack.context)); - break; - case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: - ret = lcoc_->AllToAllVAllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - 
runnerVariantPack.context->GetExecuteStream()); - break; - case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: - ret = lcoc_->MatmulReduceScatterAllToAllVHidden(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - runnerVariantPack.context->GetExecuteStream()); - break; + // case infer::LinearParallelParam::ParallelType::PURE_LINEAR: + // ret = lcoc_->PureMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // GetExecuteStream(runnerVariantPack.context)); + // break; + // case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: + // ret = lcoc_->AllToAllVAllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // runnerVariantPack.context->GetExecuteStream()); + // break; + // case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: + // ret = lcoc_->MatmulReduceScatterAllToAllVHidden(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + // runnerVariantPack.context->GetExecuteStream()); + // break; default: ATB_LOG(ERROR) << GetLogPrefix() << "UnSupported type: " << param_.type; return ERROR_INVALID_PARAM; @@ -228,10 +228,10 @@ Status LinearParallelLcocRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPac return ERROR_COMM_EMPTY; } bool isMoe = false; - if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || - param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - isMoe = true; - } + // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || + // param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + // isMoe = true; + // } size_t inTensorId = 0; const SVector &inTensors = runnerVariantPack.inTensors; Lcal::CoCInputPkg inputPkg; -- Gitee From 0ee04314528a146ff4076b8836b572f6ac4a75a4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 1 Sep 2025 11:34:37 +0800 Subject: [PATCH 400/414] clean code --- comm/lcal/include/comm_args.h | 2 +- comm/lcal/src/ascendc_kernels/lccl_op.h | 4 ++++ comm/lcal/src/ascendc_kernels/op_def.h | 6 +++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index 1dbeae00..5500106a 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -7,9 +7,9 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ -#pragma GCC diagnostic ignored "-Wunused-parameter" #ifndef LCCL_COMM_ARGS_H #define LCCL_COMM_ARGS_H +#pragma GCC diagnostic ignored "-Wunused-parameter" #include #if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_C220_CUBE__) diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index d2e31517..bf54ce2b 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -7,6 +7,9 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ +#ifndef LCCL_OP_H +#define LCCL_OP_H + #if defined(__DAV_C220_VEC__) || defined(__DAV_C220_CUBE__) #include "op_def.h" @@ -244,3 +247,4 @@ extern "C" __global__ __aicore__ void LcalReduceScatter_##type##suffix(KERNELS_A } \ } #endif +#endif \ No newline at end of file diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/comm/lcal/src/ascendc_kernels/op_def.h index c4f323b8..45086dae 100644 --- a/comm/lcal/src/ascendc_kernels/op_def.h +++ b/comm/lcal/src/ascendc_kernels/op_def.h @@ -7,6 +7,8 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ +#ifndef LCCL_OP_DEF_H +#define LCCL_OP_DEF_H #define GET_COMM_ARGS \ GlobalTensor commArgsGm; \ commArgsGm.SetGlobalBuffer(reinterpret_cast<__gm__ int *>(commArgs), 5); \ @@ -112,4 +114,6 @@ do { \ fun(float);fun(float16_t) #define LCCL_QUANT_LOW_TYPE_FUNC(fun) \ - fun(int8_t) \ No newline at end of file + fun(int8_t) + +#endif \ No newline at end of file -- Gitee From b352944bc681ce4b4470f3c4c242dcb670d84fc7 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 11:44:50 +0800 Subject: [PATCH 401/414] fix --- comm/lcal/src/lcal_comm.cpp | 2 +- comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 14a8608e..9cc7b8d0 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -132,7 +132,7 @@ bool SkipUnusedChannel910B2C(int curRank, int peerRank, ChipName chipName) if (chipName == ChipName::CHIP_910B2C) { constexpr int rankSizePerNode = 8; if ((curRank / rankSizePerNode != peerRank / rankSizePerNode) - && (std::abs(curRank - peerRank) != rankSizePerNode)) { + && (std::abs(curRank - peerRank) != rankSizePerNode)) { return true; } } diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp index c6bc6b33..f25363b3 100644 --- a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp +++ b/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp @@ -403,7 +403,7 @@ bool CoCAllgatherMatmulReduceScatterTilingFunc::CheckTiling(const TaskParam &tas {"k", cocTilingData.k, PARAM_CHECK_MIN_VALUE_ONE, maxKValue}, {"n", cocTilingData.n, PARAM_CHECK_MIN_VALUE_ONE, maxNValue}, {"commNpuSplit * commDataSplit + extraCommNpuSplit * extraCommDataSplit", - useCoreCount, PARAM_CHECK_MIN_VALUE_ONE, coreNum}, + useCoreCount, PARAM_CHECK_MIN_VALUE_ONE, coreNum}, }; return CheckParamScopeList(paramCheckList); } -- Gitee From a371e2d43c0e25ee1269b83ceeab2f9363d190a5 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 11:47:31 +0800 Subject: [PATCH 402/414] fix --- .../linear_parallel_lcoc_runner.cpp | 115 +++++++----------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp index 0a7e2c46..0e8c4965 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -29,35 +29,35 @@ LinearParallelLcocRunner::LinearParallelLcocRunner(const infer::LinearParallelPa isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; break; - // case 
infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: - // lcalType_ = Lcal::LcalType::MATMUL_REDUCE_SCATTER; - // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - // break; - // case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: - // lcalType_ = - // param_.keepIntermediate ? Lcal::LcalType::ALL_GATHER_MATMUL_V2 : Lcal::LcalType::ALL_GATHER_MATMUL; - // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - // break; + case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: + lcalType_ = Lcal::LcalType::MATMUL_REDUCE_SCATTER; + isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + break; + case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: + lcalType_ = + param_.keepIntermediate ? Lcal::LcalType::ALL_GATHER_MATMUL_V2 : Lcal::LcalType::ALL_GATHER_MATMUL; + isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + break; case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR_REDUCE_SCATTER: lcalType_ = Lcal::LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER; break; - // case infer::LinearParallelParam::ParallelType::PURE_LINEAR: - // lcalType_ = Lcal::LcalType::PURE_MATMUL; - // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && - // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - // break; - // case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: - // lcalType_ = Lcal::LcalType::ALLTOALLV_ALLGATHER_MATMUL; - // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && - // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - // break; - // case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: - // lcalType_ = Lcal::LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN; - // isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && - // param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; - // break; + case infer::LinearParallelParam::ParallelType::PURE_LINEAR: + lcalType_ = Lcal::LcalType::PURE_MATMUL; + isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && + param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + break; + case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: + lcalType_ = Lcal::LcalType::ALLTOALLV_ALLGATHER_MATMUL; + isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && + param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + break; + case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: + lcalType_ = Lcal::LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN; + isQuant_ = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED && + param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; + break; default: ATB_LOG(ERROR) << GetLogPrefix() << "UnSupported type: " << param_.type; } @@ -137,21 +137,21 @@ Status 
LinearParallelLcocRunner::SetupImpl(RunnerVariantPack &runnerVariantPack) } } } - // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || - // param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - // Lcal::MoeInfo moeInfo{.local_expert_nums = param_.moeInfo.localExpertNums, - // .EP = param_.moeInfo.epSize, - // .TP = param_.moeInfo.tpSize, - // .maxOutputSize = -1, - // .isMoe = 1}; - // coCParamDesc.moeInfo = moeInfo; - // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM) { - // coCParamDesc.moeInfo.maxOutputSize = - // runnerVariantPack.inTensors.at(runnerVariantPack.inTensors.size() - 1).desc.shape.dims[0]; - // } else if (param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - // coCParamDesc.moeInfo.maxOutputSize = runnerVariantPack.inTensors.at(0).desc.shape.dims[0]; - // } - // } + if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || + param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + Lcal::MoeInfo moeInfo{.local_expert_nums = param_.moeInfo.localExpertNums, + .EP = param_.moeInfo.epSize, + .TP = param_.moeInfo.tpSize, + .maxOutputSize = -1, + .isMoe = 1}; + coCParamDesc.moeInfo = moeInfo; + if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM) { + coCParamDesc.moeInfo.maxOutputSize = + runnerVariantPack.inTensors.at(runnerVariantPack.inTensors.size() - 1).desc.shape.dims[0]; + } else if (param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + coCParamDesc.moeInfo.maxOutputSize = runnerVariantPack.inTensors.at(0).desc.shape.dims[0]; + } + } int ret = lcoc_->SetParam(lcalType_, {}, coCParamDesc); if (ret != 0) { ATB_LOG(ERROR) << GetLogPrefix() << "SetParam failed, ret : " << ret; @@ -181,35 +181,10 @@ Status LinearParallelLcocRunner::LaunchKernel(Lcal::CoCInputPkg inputPkg, Lcal:: ret = lcoc_->MatmulAllReduce(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; - // case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: - // ret = lcoc_->MatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // GetExecuteStream(runnerVariantPack.context)); - // break; - // case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: - // if (param_.keepIntermediate) { - // ret = lcoc_->AllGatherMatmulV2(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // GetExecuteStream(runnerVariantPack.context)); - // break; - // } - // ret = lcoc_->AllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // GetExecuteStream(runnerVariantPack.context)); - // break; case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR_REDUCE_SCATTER: ret = lcoc_->AllGatherMatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; - // case infer::LinearParallelParam::ParallelType::PURE_LINEAR: - // ret = lcoc_->PureMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // GetExecuteStream(runnerVariantPack.context)); - // break; - // case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: - // ret = lcoc_->AllToAllVAllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // runnerVariantPack.context->GetExecuteStream()); - // break; - // case 
infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: - // ret = lcoc_->MatmulReduceScatterAllToAllVHidden(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, - // runnerVariantPack.context->GetExecuteStream()); - // break; default: ATB_LOG(ERROR) << GetLogPrefix() << "UnSupported type: " << param_.type; return ERROR_INVALID_PARAM; @@ -228,10 +203,10 @@ Status LinearParallelLcocRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPac return ERROR_COMM_EMPTY; } bool isMoe = false; - // if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || - // param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { - // isMoe = true; - // } + if (param_.type == infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM || + param_.type == infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC) { + isMoe = true; + } size_t inTensorId = 0; const SVector &inTensors = runnerVariantPack.inTensors; Lcal::CoCInputPkg inputPkg; -- Gitee From 11508748735ef2faa4c824c570d36b5322219fea Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 16:24:24 +0800 Subject: [PATCH 403/414] add --- comm/lcal/src/kernels/coc_allgather.cce | 435 ++++++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 comm/lcal/src/kernels/coc_allgather.cce diff --git a/comm/lcal/src/kernels/coc_allgather.cce b/comm/lcal/src/kernels/coc_allgather.cce new file mode 100644 index 00000000..e63f160d --- /dev/null +++ b/comm/lcal/src/kernels/coc_allgather.cce @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __DAV_C220_VEC__ +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template // T: allgather type; MatType: matmul type +class AllGather : public CocCommBase { +public: + __aicore__ explicit AllGather() {}; + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) + { + CocCommBase::SetArgs(COC_ARGS_CALL()); + preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL()); + if constexpr (HAVE_BIAS) { + add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); + } + need_dequant = workspace_info.gm_accum; + if (need_dequant) { + serial_dequant_runner.SetArgs(reinterpret_cast<__gm__ bfloat16_t *>(gm_out), workspace_info, + reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale), + reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset), dequant_granularity, 1, m * rank_size, n); + } + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); + this->gm_out = aligned_a ? 
reinterpret_cast<__gm__ T *>(workspace_info.gm_a_align) : gm_a;
+        gm_a_pingpong_size = m0 * k_align * p_value * rank_size;
+        cal_count = DivCeil(m_loop, p_value);
+    }
+
+    FORCE_INLINE_AICORE void EndFlagsAndBias()
+    {
+        ResetIpcFlags(2);
+
+        if (aiv_idx == 1 && core_idx < rank_size) {
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0);
+        }
+        PipeBarrier();
+
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.Run();
+        }
+    }
+
+    FORCE_INLINE_AICORE void MoveResultFromSrcToDst(__gm__ T *gm_src, __gm__ T *gm_dst,
+        int32_t len)
+    {
+        SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+        SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+        MoveResultToDst(gm_src, gm_dst, len);
+        WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+        WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+    }
+
+    FORCE_INLINE_AICORE void MoveResultToDst(__gm__ T *gm_src, __gm__ T *gm_dst,
+        int32_t len)
+    {
+        int32_t ping_pong_move_count = (len + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_size = max_ub_ping_pong_size;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_size = len - move_idx * max_ub_ping_pong_size;
+            }
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            CopyGmToUbuf(ub_buff_st, gm_src, 1, actual_move_size * sizeof(T) / 32, 0, 0);
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            CopyUbufToGm(gm_dst, ub_buff_st, 1, actual_move_size * sizeof(T) / 32, 0, 0);
+            gm_src += max_ub_ping_pong_size;
+            gm_dst += max_ub_ping_pong_size;
+            SetFlag(event_id);
+        }
+    }
+
+    FORCE_INLINE_AICORE
+    void MoveToOtherRankWithSkip(__gm__ T *gm_src, int32_t rank_offset, int32_t len,
+        int32_t rank_st, int32_t skip_num, int32_t group_num, int32_t rank_scope)
+    {
+        int32_t ping_pong_move_count = (len + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_size = max_ub_ping_pong_size;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_size = len - move_idx * max_ub_ping_pong_size;
+            }
+            int32_t block_len = actual_move_size * sizeof(T) / 32;
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            CopyGmToUbuf(ub_buff_st, gm_src, 1, block_len, 0, 0);
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            int32_t dst_rank = rank_st % rank_scope;
+            for (int32_t cycle_idx = 0; cycle_idx < group_num; ++cycle_idx) {
+                if (dst_rank != rank && dst_rank < rank_size) {
+                    CopyUbufToGm(buff[dst_rank] + rank_offset, ub_buff_st, 1, block_len, 0, 0);
+                }
+                dst_rank = (dst_rank + skip_num) % rank_scope;
+            }
+            gm_src += max_ub_ping_pong_size;
+            rank_offset += max_ub_ping_pong_size;
+            SetFlag(event_id);
+        }
+    }
+
+    FORCE_INLINE_AICORE
+    void MoveWithSplit(__gm__ T *gm_src, int32_t rank_offset, int32_t len)
+    {
+        int32_t data_split = DivCeil(len, len_per_loop);
+        int32_t data_block = len_per_loop; // data volume per chunk
+        int32_t rank_st = core_idx;
+        int32_t skip_num = comm_npu_split;
+        int32_t group_num = DivCeil(rank_size, comm_npu_split);
+        int32_t scope = comm_npu_split * group_num;
+        int32_t data_offset = -data_block; // start offset of the current chunk
+
+        if (is_91093) { // inter-card traffic: on 91093, only copy between cards with the same parity
+            rank_st = rank_st * A3_DIE_NUM + (rank % A3_DIE_NUM);
+            group_num = DivCeil(group_num, A3_DIE_NUM);
+            skip_num = skip_num * A3_DIE_NUM;
+        }
+
+        SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+        SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+        for (int32_t data_block_idx = 0; data_block_idx < data_split; ++data_block_idx) {
+            data_offset += data_block; // start offset of the current chunk
+            data_block = data_block_idx == data_split - 1 ? len - data_offset : data_block; // volume of the current chunk
+            int32_t num_per_core = DivCeil(data_block, comm_data_split);
+
+            int32_t data_src = data_offset + (core_idx / comm_npu_split) * num_per_core;
+            int32_t data_len = data_block + data_offset - data_src;
+            data_len = data_len >= num_per_core ? num_per_core : data_len;
+            // NPU direction: send one chunk to every target card before moving on to the next chunk
+            if (comm_direct) {
+                MoveToOtherRankWithSkip(gm_src + data_src, rank_offset + data_src, data_len,
+                    rank_st, comm_npu_split, group_num, scope);
+                continue;
+            }
+            // data-length direction: send all data to target card 0 first, then to target card 1, and so on
+            int32_t dst_rank = rank_st % scope;
+            for (int32_t rank_group_idx = 0; rank_group_idx < group_num; ++rank_group_idx) {
+                if (dst_rank != rank && dst_rank < rank_size) {
+                    MoveResultToDst(gm_src + data_src, buff[dst_rank] + rank_offset + data_src, data_len);
+                }
+                dst_rank = (dst_rank + comm_npu_split) % scope;
+            }
+        }
+        WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+        WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+    }
+
+    FORCE_INLINE_AICORE void RunWithSplit()
+    {
+        // Padding
+        preprocessor.Run();
+
+        ResetIpcFlags(2);
+
+        int64_t data_len = static_cast<int64_t>(m) * k_align; // total data volume
+        int32_t num_per_rank_move = m0 * k_align * p_value; // data volume moved to the other cards per round
+        int64_t src_offset = 0; // start offset of the current chunk
+        int64_t rank_offset = rank * num_per_rank_move;
+        for (int32_t cal_idx = 0; cal_idx < cal_count + MAX_BLOCK_COUNT; ++cal_idx) {
+            uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT;
+
+            if (cal_idx == cal_count - 1) {
+                num_per_rank_move = data_len - src_offset;
+            }
+
+            // wait aic
+            if (cal_idx >= MAX_BLOCK_COUNT) {
+                WaitEvent(flag_idx);
+            }
+            // Step 1: AIV sync
+            SetAndWaitAivSync(flag_idx);
+
+            if (cal_idx < cal_count) {
+                // Step 2: Rank sync
+                CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1);
+                // Step 3: AIV sync
+                SetAndWaitAivSync(flag_idx);
+                // Step 4: Move
+                if (aiv_idx == 0 && core_idx < core_count) {
+                    int64_t gm_rank_offset = flag_idx * gm_a_pingpong_size + rank_offset;
+                    MoveWithSplit(gm_out + src_offset, gm_rank_offset, num_per_rank_move);
+                    src_offset += num_per_rank_move;
+                }
+                // Step 5: AIV Sync
+                SetAndWaitAivSync(flag_idx);
+                // Step 6: Rank Sync
+                CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1);
+            }
+            // sync between AIVs
+            SetAndWaitAivSync(flag_idx); // AIV sync after the communication
+            // signal AIC sync
+            SetAicSync(flag_idx);
+        }
+        if (need_dequant) {
+            serial_dequant_runner.Run();
+        }
+        EndFlagsAndBias();
+    }
+
+    FORCE_INLINE_AICORE void DataCopySio(int32_t cal_idx_sio, int32_t copy_len_sio)
+    {
+        if (cal_idx_sio < 0 || cal_idx_sio >= cal_count) {
+            return;
+        }
+        int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_3;
+        int32_t len_per_core = copy_len_sio / SIO_TOTAL_CORE_NUM;
+        int32_t sio_core_idx = aiv_idx * core_num + core_idx - core_count;
+        int32_t core_offset = sio_core_idx * len_per_core;
+        int64_t src_offset_sio = cal_idx_sio * p_value * m0 * k_align;
+
+        if (sio_core_idx >= 0 && sio_core_idx < SIO_TOTAL_CORE_NUM) {
+            for (int32_t src_rank = rank % 2; src_rank < rank_size; src_rank += 2) {
+                int32_t sio_rank_offset = flag_idx_sio * gm_a_pingpong_size + src_rank * p_value * m0 * k_align;
+                __gm__ T *src_addr = buff[rank] + sio_rank_offset + core_offset;
+                if (src_rank == rank) {
+                    src_addr = gm_out + src_offset_sio + core_offset;
+                }
+                MoveResultFromSrcToDst(src_addr, buff[rank ^ 1] + sio_rank_offset + core_offset, len_per_core);
+            }
+        }
+    }
+
+    FORCE_INLINE_AICORE void RunWithSio()
+    {
+        // Padding
+        preprocessor.Run();
+
+        ResetIpcFlags(2);
+        int32_t copy_len_hccs = p_value * m0 * k_align;
+        int32_t copy_len_sio = p_value * m0 * k_align;
+
+        for (int32_t cal_idx = 0; cal_idx < cal_count + BLOCK_COUNT_3; ++cal_idx) {
+            int32_t cal_idx_sio = cal_idx - 1;
+            uint64_t flag_idx = cal_idx % BLOCK_COUNT_3;
+            uint64_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_3;
+            int64_t src_offset = cal_idx * p_value * m0 * k_align;
+            int32_t rank_offset = flag_idx * gm_a_pingpong_size + rank * p_value * m0 * k_align;
+
+            // copy p_value * m0 rows at a time
+            if (cal_idx == cal_count - 1) {
+                copy_len_hccs = (m - cal_idx * p_value * m0) * k_align;
+            }
+
+            if (cal_idx_sio == cal_count - 1) {
+                copy_len_sio = (m - cal_idx_sio * p_value * m0) * k_align;
+            }
+
+            // wait aic
+            if (cal_idx >= BLOCK_COUNT_3) {
+                WaitEvent(flag_idx);
+            }
+            // Step 1: AIV sync
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+
+            if (cal_idx < cal_count + 1) {
+                // Step 2: Rank sync
+                CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1);
+                SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+            }
+
+            // HCCS part
+            if (cal_idx < cal_count && core_idx < core_count) {
+                // Step 4: Move Hccs
+                MoveWithSplit(gm_out + src_offset, rank_offset, copy_len_hccs);
+            }
+            // SIO part
+            DataCopySio(cal_idx_sio, copy_len_sio);
+
+            if (cal_idx < cal_count + 1) {
+                // Step 5: AIV Sync
+                SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+                // Step 6: Rank Sync
+                CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1);
+            }
+            // sync between AIVs
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+
+            // signal AIC sync
+            if (cal_idx >= 1) {
+                SetAicSync(flag_idx_sio);
+            }
+        }
+        EndFlagsAndBias();
+    }
+
+    FORCE_INLINE_AICORE void Run()
+    {
+        if (is_91093) {
+            RunWithSio();
+        } else {
+            RunWithSplit();
+        }
+    }
+
+public:
+    using CocCommBase::SetAicSync;
+    using CocCommBase::SetAndWaitAivSync;
+    using CocCommBase::SetBuffFlag;
+    using CocCommBase::SetBuffFlagByAdd;
+    using CocCommBase::CheckBuffFlag;
+    using CocCommBase::ResetIpcFlags;
+    using CocCommBase::CrossRankSyncV1;
+    using CocCommBase::CrossRankSyncV2;
+    using CocCommBase::buff;
+    using CocCommBase::gm_out;
+    using CocCommBase::ctrl_flags_UB;
+    using CocCommBase::output_UB_T;
+    using CocCommBase::batch_size;
+    using CocCommBase::m;
+    using CocCommBase::k;
+    using CocCommBase::n;
+    using CocCommBase::m0;
+    using CocCommBase::k0;
+    using CocCommBase::n0;
+    using CocCommBase::m_loop;
+    using
CocCommBase::n_loop; + using CocCommBase::k_loop; + using CocCommBase::core_idx; + using CocCommBase::core_num; + using CocCommBase::rank; + using CocCommBase::rank_size; + using CocCommBase::tiling_key; + using CocCommBase::swizzl_direct; + using CocCommBase::swizzl_count; + using CocCommBase::trans_a; + using CocCommBase::trans_b; + using CocCommBase::is_int8; + using CocCommBase::is_91093; + using CocCommBase::p_value; + using CocCommBase::aiv_idx; + using CocCommBase::other_rank; + using CocCommBase::max_ub_single_dma_size; + using CocCommBase::max_ub_ping_pong_size; + using CocCommBase::dequant_granularity; + using CocCommBase::dequant_group_size; + using CocCommBase::quant_granularity; + using CocCommBase::quant_group_size; + using CocCommBase::workspace_info; + using CocCommBase::comm_npu_split; + using CocCommBase::comm_data_split; + using CocCommBase::comm_direct; + using CocCommBase::len_per_loop; + using CocCommBase::core_count; + using CocCommBase::weight_nz; + using CocCommBase::local_expert_nums; + using CocCommBase::is_moe; + using CocCommBase::is_moe_averaged; + using CocCommBase::is_alltoallvc; + using CocCommBase::is_deterministic; + using CocCommBase::EP; + using CocCommBase::TP; + using CocCommBase::flag_offset; + int32_t m_align; + int32_t k_align; + int32_t n_align; + int32_t aligned_a; + int32_t aligned_b; + int32_t cal_count; + int32_t gm_a_pingpong_size; + bool need_dequant; + Preprocessor preprocessor; + AllGatherMatmulBiasAdder add_bias_runner; + SerialDequantRunner serial_dequant_runner; +}; + +constexpr int32_t NO_BIAS_MASK4 = 0b000000 | 0b100000 | 0b010000 | 0b110000 | 0b001000 | 0b101000 | 0b011000 | 0b111000; +constexpr int32_t BIAS_MASK4 = 0b000010 | 0b100010 | 0b010010 | 0b110010 | 0b001010 | 0b101010 | 0b011010 | 0b111010; + +template +inline __aicore__ void CocAllGatherMatmulAiv(COC_ARGS_FUN(T)) +{ + // write + + + AllGather allgather_write_without_bias; + AllGather allgather_write_with_bias; + AllGather allgather_int8_write_without_bias; + AllGather allgather_int8_write_with_bias; + SetAtomicNone(); + SetMaskNorm(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t write_to_other_rank = cocTilingData->write2OtherRank; + // swizzl = 0 transa = 0 transb = 0 splitk = 0 bias = 0 int8 = 0 + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + allgather_write_without_bias.SetArgs(COC_ARGS_CALL()); + allgather_write_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + allgather_write_with_bias.SetArgs(COC_ARGS_CALL()); + allgather_write_with_bias.Run(); + break; + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + allgather_int8_write_without_bias.SetArgs(COC_ARGS_CALL_INT8()); + allgather_int8_write_without_bias.Run(); + break; + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + allgather_int8_write_with_bias.SetArgs(COC_ARGS_CALL_INT8()); + allgather_int8_write_with_bias.Run(); + break; + default : + break; + } + PipeBarrier(); +} + +#endif \ No newline at end of file -- Gitee From 
cb819607d26b9e5430d5dc9fdfae062eccc5a215 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 1 Sep 2025 19:46:46 +0800 Subject: [PATCH 404/414] fix build warning as error --- comm/lcal/include/comm_args.h | 1 - comm/lcal/include/lcoc/lcoc_workspace.h | 6 +++++- comm/lcal/src/lcoc.cpp | 1 + comm/lcal/src/tiling/tiling.cpp | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/comm/lcal/include/comm_args.h b/comm/lcal/include/comm_args.h index 5500106a..ff4f3297 100644 --- a/comm/lcal/include/comm_args.h +++ b/comm/lcal/include/comm_args.h @@ -9,7 +9,6 @@ */ #ifndef LCCL_COMM_ARGS_H #define LCCL_COMM_ARGS_H -#pragma GCC diagnostic ignored "-Wunused-parameter" #include #if !defined(__DAV_C220_VEC__) && !defined(__DAV_C310__) && !defined(__DAV_C220_CUBE__) diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/comm/lcal/include/lcoc/lcoc_workspace.h index 90f69e54..0b9e40fe 100644 --- a/comm/lcal/include/lcoc/lcoc_workspace.h +++ b/comm/lcal/include/lcoc/lcoc_workspace.h @@ -49,7 +49,8 @@ inline __aicore__ int32_t AlignUp(int32_t len, int32_t size) inline uint64_t GetDequantWorkSpaceSize(Lcal::LcalType lcalType, int32_t withSerialMode, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t pValue, int32_t nLoop, int32_t rankSize, int32_t blockDim, int32_t maxOutputSize = -1) - { +{ + (void) nLoop; constexpr int32_t TWO = 2; uint64_t dequantWorkSpaceSize = 0; if (withSerialMode > 0) { @@ -75,6 +76,8 @@ inline __aicore__ void GetLcalMoeWorkspaceInfo(LcalWorkspaceInfo& lcalWorkspaceI int32_t m, bool hasDequantParam = false, int32_t is_alltoallvc = false, int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1) { + (void) is_alltoallvc; + (void) outputSize; constexpr int32_t ALIGN8 = 8; if (hasDequantParam) { lcalWorkspaceInfo.gm_dequant_param = workspaceOffset; @@ -93,6 +96,7 @@ inline __aicore__ LcalWorkspaceInfo GetLcalWorkspaceInfo(GM_ADDR gmWorkSpace, in int32_t EP = 1, int32_t expertPerRank = 1, int32_t outputSize = -1 ) { + (void) accumRankSize; if (outputSize == -1) { outputSize = m; } diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index a4b3188d..de47fa58 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -202,6 +202,7 @@ int Lcoc::LaunchOperator(CoCInputPkg &inputPkg, CoCOutputPkg &outputPkg, void *w bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg, LcalType lcalType) const { + (void) outputPkg; if (!tilingSuccess_) { std::string str = "Tiling error. 
Please check whether the 'Lcoc::SetParam' method has been called, " "or verify if the tiling parameter is valid."; diff --git a/comm/lcal/src/tiling/tiling.cpp b/comm/lcal/src/tiling/tiling.cpp index 9e439491..a05b7eb8 100644 --- a/comm/lcal/src/tiling/tiling.cpp +++ b/comm/lcal/src/tiling/tiling.cpp @@ -27,11 +27,13 @@ CoCTilingData CoCTilingFunc::GenerateTiling(const TaskParam &taskParam, const Co bool CoCTilingFunc::CheckTiling(const TaskParam &taskParam) { + (void) taskParam; return CheckCoCTilingData(cocTilingData); } void CoCTilingFunc::GetDefaultTiling(const TaskParam &taskParam) { + (void) taskParam; cocTilingData.ubMoveNum = VALID_UB_MOVE_NUM; cocTilingData.commNpuSplit = cocTilingData.rankSize; cocTilingData.commDataSplit = COMMDATASPLIT_ONE; -- Gitee From ad60bac5501685a16cb9cefb1bc9a7c2cfa467cb Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 1 Sep 2025 19:52:32 +0800 Subject: [PATCH 405/414] add --- comm/lcal/src/coc_kernel_args.cpp | 21 +- comm/lcal/src/coc_kernel_args.h | 56 +- comm/lcal/src/kernels/coc_add_bias_runner.cce | 32 +- .../lcal/src/kernels/coc_allgather_matmul.cce | 54 ++ comm/lcal/src/kernels/coc_allgather_v2.cce | 367 +++++++++ comm/lcal/src/kernels/coc_comm_base.cce | 169 ++-- comm/lcal/src/kernels/coc_const_args.cce | 33 +- comm/lcal/src/kernels/coc_dequant_runner.cce | 756 ++++++++++++++---- comm/lcal/src/kernels/coc_internal.cce | 197 +++-- .../src/kernels/coc_matmul_reduce_scatter.cce | 5 +- comm/lcal/src/kernels/coc_postprocessor.cce | 24 +- comm/lcal/src/kernels/coc_ppmatmul.cce | 736 ++++++++++++----- comm/lcal/src/kernels/coc_ppmatmul_switch.cce | 90 ++- comm/lcal/src/kernels/coc_preprocessor.cce | 123 ++- comm/lcal/src/kernels/coc_pure_matmul.cce | 38 + comm/lcal/src/kernels/coc_reduce_scatter.cce | 526 ++++++++++++ comm/lcal/src/lcoc.cpp | 151 +++- comm/lcal/src/lcoc_func.cpp | 9 +- comm/lcal/src/tiling/allgather_tiling.cpp | 129 +++ .../src/tiling/allgather_tiling_91093.cpp | 474 +++++++++++ .../lcal/src/tiling/allgather_tiling_910B.cpp | 368 +++++++++ .../src/tiling/allgatherv2_tiling_91093.cpp | 357 +++++++++ .../src/tiling/allgatherv2_tiling_910B.cpp | 227 ++++++ .../alltoall_allgather_hidden_tiling.cpp | 100 +++ .../src/tiling/alltoall_allgather_tiling.cpp | 74 ++ .../reducescatter_alltoall_hidden_tiling.cpp | 98 +++ comm/lcal/src/tiling/reducescatter_tiling.cpp | 63 ++ .../src/tiling/reducescatter_tiling_91093.cpp | 516 ++++++++++++ .../src/tiling/reducescatter_tiling_910B.cpp | 198 +++++ comm/lcal/src/tiling/tiling.cpp | 3 +- comm/lcal/src/tiling/tiling_args.cpp | 2 +- comm/lcal/src/tiling/tiling_func.cpp | 20 +- 32 files changed, 5360 insertions(+), 656 deletions(-) create mode 100644 comm/lcal/src/kernels/coc_allgather_matmul.cce create mode 100644 comm/lcal/src/kernels/coc_allgather_v2.cce create mode 100644 comm/lcal/src/kernels/coc_pure_matmul.cce create mode 100644 comm/lcal/src/kernels/coc_reduce_scatter.cce create mode 100644 comm/lcal/src/tiling/allgather_tiling.cpp create mode 100644 comm/lcal/src/tiling/allgather_tiling_91093.cpp create mode 100644 comm/lcal/src/tiling/allgather_tiling_910B.cpp create mode 100644 comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp create mode 100644 comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp create mode 100644 comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp create mode 100644 comm/lcal/src/tiling/alltoall_allgather_tiling.cpp create mode 100644 comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp create mode 100644 comm/lcal/src/tiling/reducescatter_tiling.cpp create mode 
100644 comm/lcal/src/tiling/reducescatter_tiling_91093.cpp create mode 100644 comm/lcal/src/tiling/reducescatter_tiling_910B.cpp diff --git a/comm/lcal/src/coc_kernel_args.cpp b/comm/lcal/src/coc_kernel_args.cpp index a0e3850d..89a30187 100644 --- a/comm/lcal/src/coc_kernel_args.cpp +++ b/comm/lcal/src/coc_kernel_args.cpp @@ -33,11 +33,14 @@ void CoCKernelArgs::SetInputPkgArgs(CoCInputPkg &inputPkg) matrixA = inputPkg.matrixA; matrixB = inputPkg.matrixB; bias = inputPkg.bias; - gamma= inputPkg.gamma; + gamma = inputPkg.gamma; dequantScale = inputPkg.dequantScale; dequantOffset = inputPkg.dequantOffset; quantScale = inputPkg.quantScale; quantOffset = inputPkg.quantOffset; + numLocalTokensPerExpertPtr = inputPkg.num_local_tokens_per_expert; + numGlobalTokensPerLocalExpertPtr = inputPkg.num_global_tokens_per_local_expert; + globalTokensPerLocalExpertMatrixPtr = inputPkg.global_tokens_per_expert_matrix; } void CoCKernelArgs::SetOutputPkgArgs(CoCOutputPkg &outputPkg) @@ -57,6 +60,7 @@ void CoCKernelArgs::SetParamDescArgs(const CoCParamDesc ¶mDesc) cocKernelParam.twoDimTPInfo = paramDesc.twoDimTPInfo; cocKernelParam.postInfo = paramDesc.postInfo; cocKernelParam.weightNz = paramDesc.mmInfo.weightNz; + cocKernelParam.moeInfo = paramDesc.moeInfo; } void CoCKernelArgs::SetCommArgs(const LcalComm &comm) @@ -73,10 +77,17 @@ void CoCKernelArgs::SetCoCTilingDataArgs(const CoCTilingData &tilingData) std::string CoCKernelArgs::ParamToString() { std::string quantInfoString = "[QuantInfo]: dequantGranularity=" + - std::to_string(cocKernelParam.quantInfo.dequantGranularity) + "\n"; + std::to_string(cocKernelParam.quantInfo.dequantGranularity) + "\n"; + auto moeInfo = cocKernelParam.moeInfo; + std::string moeInfoString = + std::string("[MoeInfo]: local_expert_nums=") + std::to_string(moeInfo.local_expert_nums) + + ", EP=" + std::to_string(static_cast(moeInfo.EP)) + + ", TP=" + std::to_string(static_cast(moeInfo.TP)) + + ", maxOutputSize=" + std::to_string(moeInfo.maxOutputSize) + + ", isMoe=" + std::to_string(static_cast(moeInfo.isMoe)) + "\n"; std::string weightNzInfoString = "[weightNz]: weightNz=" + - std::to_string(cocKernelParam.weightNz) + "\n"; + std::to_string(cocKernelParam.weightNz) + "\n"; std::string tilingInfoString = cocKernelParam.cocTilingData.ToString(); - return quantInfoString + weightNzInfoString + tilingInfoString; + return quantInfoString + moeInfoString + weightNzInfoString + tilingInfoString; +} } -} \ No newline at end of file diff --git a/comm/lcal/src/coc_kernel_args.h b/comm/lcal/src/coc_kernel_args.h index dc2e0c35..91ce2cd4 100644 --- a/comm/lcal/src/coc_kernel_args.h +++ b/comm/lcal/src/coc_kernel_args.h @@ -17,32 +17,36 @@ #include "lcoc_args.h" namespace Lcal { - struct CoCKernelArgs { - void *matrixA = nullptr; - void *matrixB = nullptr; - void *bias = nullptr; - void *gamma = nullptr; - void *output = nullptr; - void *midOutput = nullptr; - void *workspace = nullptr; - void *dequantScale = nullptr; - void *dequantOffset = nullptr; - void *quantScale = nullptr; - void *quantOffset = nullptr; - void *commArgsPtr = nullptr; - uint64_t fftsAddr = 0; +struct CoCKernelArgs { + void *matrixA = nullptr; + void *matrixB = nullptr; + void *bias = nullptr; + void *gamma = nullptr; + void *output = nullptr; + void *midOutput = nullptr; + void *workspace = nullptr; + void *dequantScale = nullptr; + void *dequantOffset = nullptr; + void *quantScale = nullptr; + void *quantOffset = nullptr; + void *commArgsPtr = nullptr; + uint64_t fftsAddr = 0; + + void *numLocalTokensPerExpertPtr = 
nullptr; + void *numGlobalTokensPerLocalExpertPtr = nullptr; + void *globalTokensPerLocalExpertMatrixPtr = nullptr; + CoCTilingData *pCocTiling = nullptr; + CoCKernelParam cocKernelParam = {}; + int SetFFTSAddr(); + void SetInputPkgArgs(CoCInputPkg &inputPkg); + void SetOutputPkgArgs(CoCOutputPkg &outputPkg); + void SetWorkspacePtrArg(void *workspacePtr); + void SetParamDescArgs(const CoCParamDesc ¶mDesc); + void SetCommArgs(const LcalComm &comm); + void SetCoCTilingDataArgs(const CoCTilingData &tilingData); + std::string ParamToString(); +}; - CoCTilingData *pCocTiling = nullptr; - CoCKernelParam cocKernelParam = {}; - int SetFFTSAddr(); - void SetInputPkgArgs(CoCInputPkg &inputPkg); - void SetOutputPkgArgs(CoCOutputPkg &outputPkg); - void SetWorkspacePtrArg(void *workspacePtr); - void SetParamDescArgs(const CoCParamDesc ¶mDesc); - void SetCommArgs(const LcalComm &comm); - void SetCoCTilingDataArgs(const CoCTilingData &tilingData); - std::string ParamToString(); - }; } -#endif // LCAL_COC_KERNEL_ARGS_H \ No newline at end of file +#endif // LCAL_COC_KERNEL_ARGS_H diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/comm/lcal/src/kernels/coc_add_bias_runner.cce index 41a73836..f79db13d 100644 --- a/comm/lcal/src/kernels/coc_add_bias_runner.cce +++ b/comm/lcal/src/kernels/coc_add_bias_runner.cce @@ -147,7 +147,7 @@ private: // V: ub_out <- ub_out + ub_bias AddBiasToOutput(ub_out, ub_bias, m_this_loop, n_this_loop); - + SetFlag(event_id); WaitFlag(event_id); @@ -159,7 +159,7 @@ private: } inline __aicore__ void AddBiasToOutput(__ubuf__ OutputDtype *ub_out, __ubuf__ OutputDtype *ub_bias, - int32_t m_this_loop, int32_t n_this_loop) + int32_t m_this_loop, int32_t n_this_loop) { int32_t n_blocks = m_this_loop * Block32B::Count(n_this_loop); int32_t repeat_times = DivCeil(n_blocks, VEC_BLOCK_PER_REPEAT); @@ -212,7 +212,7 @@ private: WaitFlag(EVENT_ID0); for (int32_t row_idx = 1; row_idx < m_per_loop; ++row_idx) { - CopyUB2UB(ub_base + row_idx *Block32B::AlignUp(n), ub_base, 0, 1, + CopyUB2UB(ub_base + row_idx * Block32B::AlignUp(n), ub_base, 0, 1, Block32B::Count(n_this_loop), 0, 0); } } @@ -247,7 +247,7 @@ private: int32_t n_per_loop; int32_t max_len; - int32_t repeat_per_loop; + int32_t repeat_per_loop; }; template @@ -292,6 +292,29 @@ private: BaseSerialBiasAdder base_adder; }; +template +class MatmulReduceScatterBiasAdder { + static constexpr auto MODE = std::is_same::value ? BiasMode::ADD : BiasMode::ATOMIC_ADD; + +public: + __aicore__ explicit MatmulReduceScatterBiasAdder() = default; + + inline void __aicore__ SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN()) + { + m = m / rank_size; + base_adder.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); + } + + inline void __aicore__ Run() + { + base_adder.Run(); + base_adder.Barrier(); + } + +private: + BaseSerialBiasAdder base_adder; +}; + template class AllGatherMatmulBiasAdder { static constexpr auto MODE = std::is_same::value ? BiasMode::ADD : BiasMode::ATOMIC_ADD; @@ -316,4 +339,5 @@ private: }; #endif + #endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_allgather_matmul.cce b/comm/lcal/src/kernels/coc_allgather_matmul.cce new file mode 100644 index 00000000..8e781281 --- /dev/null +++ b/comm/lcal/src/kernels/coc_allgather_matmul.cce @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __CCE_KT_TEST__ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + +#include "coc_ppmatmul_switch.cce" +#include "coc_allgather.cce" +#include "coc_allgather_v2.cce" + +#ifdef __DAV_C220_CUBE__ +// Matmul in LcalAllGatherMatmul +#define COC_ALL_GATHER_MATMUL_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmul_##type##_mix_aic(COC_ARGS_FUN(type)) { \ + CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +// Matmul in LcalAllGatherMatmulV2 +#define COC_ALL_GATHER_MATMUL_V2_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmulV2_##type##_mix_aic(COC_ARGS_FUN(type)) { \ + CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +#elif __DAV_C220_VEC__ +// AllGather in LcalAllGatherMatmul +#define COC_ALL_GATHER_MATMUL_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmul_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ + CocAllGatherMatmulAiv(COC_ARGS_CALL()); \ +} + +// AllGather in LcalAllGatherMatmul +#define COC_ALL_GATHER_MATMUL_V2_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllGatherMatmulV2_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ + CocAllGatherMatmulV2Aiv(COC_ARGS_CALL()); \ +} + +#endif + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B support bf16 +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_ALL_GATHER_MATMUL_FUNC_AUTO_DEF); +COC_TYPE_FUNC(COC_ALL_GATHER_MATMUL_V2_FUNC_AUTO_DEF); + +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_allgather_v2.cce b/comm/lcal/src/kernels/coc_allgather_v2.cce new file mode 100644 index 00000000..76e8f424 --- /dev/null +++ b/comm/lcal/src/kernels/coc_allgather_v2.cce @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifdef __DAV_C220_VEC__ +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class AllGatherV2 : public AllGather { +public: + __aicore__ explicit AllGatherV2(){}; + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) + { + AllGather::SetArgs(COC_ARGS_CALL()); + max_move_m = max_ub_ping_pong_size > max_move_k ? 
max_ub_ping_pong_size / max_move_k : 1; + gm_allgather = gm_allgather_out; + } + + FORCE_INLINE_AICORE void MoveResultFromPeerMemToOut(__gm__ T *gm_src, __gm__ T *gm_dst, int32_t actual_m) + { + int32_t ping_pong_move_count = (actual_m + max_move_m - 1) / max_move_m; + SetFlag(EVENT_ID0); // MTE2等MTE3 + SetFlag(EVENT_ID1); // MTE2等MTE3 + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_m = max_move_m; + if (move_idx == ping_pong_move_count - 1) { + actual_move_m = actual_m - move_idx * max_move_m; + } + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; + int32_t k_move_count = (k_align + max_move_k - 1) / max_move_k; + for (int32_t k_move_idx = 0; k_move_idx < k_move_count; ++k_move_idx) { + int32_t actual_k_move_num_in_peer_mem = max_move_k; + int32_t actual_k_move_num_in_out = max_move_k; + if (k_move_idx == k_move_count - 1) { + actual_k_move_num_in_peer_mem = k_align - k_move_idx * max_move_k; + actual_k_move_num_in_out = k - k_move_idx * max_move_k; + } + WaitFlag(event_id); + CopyGmToUbuf(ub_buff_st, gm_src + move_idx * max_move_m * k_align + k_move_idx * max_move_k, + actual_move_m, actual_k_move_num_in_peer_mem * sizeof(T) / 32, + (k_align - actual_k_move_num_in_peer_mem) * sizeof(T) / 32, 0); + SetFlag(event_id); + WaitFlag(event_id); + if (ALIGN) { + CopyUbufToGm(gm_dst + move_idx * max_move_m * k + k_move_idx * max_move_k, ub_buff_st, + actual_move_m, actual_k_move_num_in_out * sizeof(T) / 32, + (actual_k_move_num_in_peer_mem - actual_k_move_num_in_out) * sizeof(T) / 32, + (k - actual_k_move_num_in_out) * sizeof(T) / 32); + } else { + CopyUbufToGmAlignB16(gm_dst + move_idx * max_move_m * k + k_move_idx * max_move_k, ub_buff_st, + actual_move_m, actual_k_move_num_in_out * sizeof(T), + (actual_k_move_num_in_peer_mem - actual_k_move_num_in_out) * sizeof(T) / 32, + (k - actual_k_move_num_in_out) * sizeof(T)); + } + SetFlag(event_id); + } + } + WaitFlag(EVENT_ID0); // MTE2等MTE3 + WaitFlag(EVENT_ID1); // MTE2等MTE3 + } + + FORCE_INLINE_AICORE void Run() + { + // Padding + preprocessor.Run(); + + ResetIpcFlags(2); + PipeBarrier(); + + for (int32_t cal_idx = 0; cal_idx < cal_count + MAX_BLOCK_COUNT; ++cal_idx) { + uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT; + int32_t actual_m = p_value * m0; + if (cal_idx == cal_count - 1) { + actual_m = m - cal_idx * p_value * m0; + } + // wait aic + int32_t cal_done_idx = cal_idx - MAX_BLOCK_COUNT; + if (cal_done_idx >= 0) { + WaitEvent(flag_idx); + } + + // aiv之间同步 + SetAndWaitAivSync(flag_idx); + if (cal_idx < cal_count && aiv_idx == 0 && core_idx < rank_size) { + int64_t src_offset = cal_idx * p_value * m0 * k_align; + int32_t rank_offset = flag_idx * gm_a_pingpong_size + rank * p_value * m0 * k_align; + CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); + + if (other_rank != rank) { + MoveResultFromSrcToDst(gm_out + src_offset, buff[other_rank] + rank_offset, actual_m * k_align); + } + CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1); + } else if (cal_idx > 0 && cal_idx < cal_count + 1 && aiv_idx == 1 && core_idx >= rank_size && + core_idx < rank_size * 2) { // peermem to out + uint64_t s2_flag_idx = (cal_idx - 1) % MAX_BLOCK_COUNT; + int32_t s2_other_rank = core_idx - rank_size; + int64_t src_offset = (cal_idx - 1) * p_value * m0 * k_align; + int32_t other_rank_offset = s2_flag_idx * gm_a_pingpong_size + s2_other_rank * p_value * m0 * k_align; + int64_t dst_offset = s2_other_rank * static_cast(m) * k + (cal_idx - 1) * p_value * m0 * k; 
+ int32_t s2_actual_m = p_value * m0; + if (cal_idx == cal_count) { + s2_actual_m = m - (cal_idx - 1) * p_value * m0; + } + if (s2_other_rank != rank) { + MoveResultFromPeerMemToOut(buff[rank] + other_rank_offset, gm_allgather + dst_offset, s2_actual_m); + } else { + MoveResultFromPeerMemToOut(gm_out + src_offset, gm_allgather + dst_offset, s2_actual_m); + } + } + + // aiv之间同步 + SetAndWaitAivSync(flag_idx); + + // 发送aic同步 + SetAicSync(flag_idx); + } + + EndFlagsAndBias(); + } + + + FORCE_INLINE_AICORE void RunWithSplit() + { + // Padding + preprocessor.Run(); + + ResetIpcFlags(2); + PipeBarrier(); + + int64_t data_len = static_cast(m) * k_align; // 数据量 + int32_t num_per_rank_move = m0 * k0 * p_value * k_loop; // 每轮搬运到其他卡的数据量 + int32_t core_count = comm_npu_split * comm_data_split; // 每张卡上使用的核数 + int64_t src_offset = 0; // 当前份数据的起始位置 + int64_t rank_offset = rank * num_per_rank_move; + for (int32_t cal_idx = 0; cal_idx < cal_count + MAX_BLOCK_COUNT; ++cal_idx) { + uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT; + if (cal_idx == cal_count - 1) { + num_per_rank_move = data_len - src_offset; + } + + // wait aic + if (cal_idx >= MAX_BLOCK_COUNT) { + WaitEvent(flag_idx); + } + // aiv之间同步 + SetAndWaitAivSync(flag_idx); + if (cal_idx < cal_count) { + CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); + } + SetAndWaitAivSync(flag_idx); + if (cal_idx < cal_count && aiv_idx == 0 && core_idx < core_count) { + int64_t gm_rank_offset = flag_idx * gm_a_pingpong_size + rank_offset; + MoveWithSplit(gm_out + src_offset, gm_rank_offset, num_per_rank_move); + src_offset += num_per_rank_move; + } else if (cal_idx > 0 && cal_idx < cal_count + 1 && aiv_idx == 1 && + core_idx >= core_count && core_idx < rank_size + core_count) { // peermem to out + // 如果剩余的core数不够,则循环搬运 + int32_t other_core_num = get_block_num() - core_count; // 剩余的core数 + int32_t cycle_num = (other_core_num + rank_size - 1) / other_core_num; // 循环次数 + uint64_t s2_flag_idx = (cal_idx - 1) % MAX_BLOCK_COUNT; + int64_t src_offset = (cal_idx - 1) * p_value * m0 * k_align; + int32_t s2_actual_m = cal_idx == cal_count ? 
m - (cal_idx - 1) * p_value * m0 : p_value * m0; + for (int32_t cycle_idx = 0; cycle_idx < cycle_num; ++cycle_idx) { + int32_t s2_other_rank = core_idx - core_count + cycle_idx * other_core_num; + int32_t other_rank_offset = s2_flag_idx * gm_a_pingpong_size + s2_other_rank * p_value * m0 * k_align; + int64_t dst_offset = s2_other_rank * static_cast(m) * k + (cal_idx - 1) * p_value * m0 * k; + if (s2_other_rank >= rank_size) { + break; + } + if (s2_other_rank != rank) { + MoveResultFromPeerMemToOut(buff[rank] + other_rank_offset, gm_allgather + dst_offset, s2_actual_m); + } else { + MoveResultFromPeerMemToOut(gm_out + src_offset, gm_allgather + dst_offset, s2_actual_m); + } + } + } + SetAndWaitAivSync(flag_idx); + if (cal_idx < cal_count) { + CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1); + } + + // aiv之间同步 + SetAndWaitAivSync(flag_idx); + + // 发送aic同步 + SetAicSync(flag_idx); + } + + EndFlagsAndBias(); + } + +public: + using AllGather::SetAicSync; + using AllGather::SetAndWaitAivSync; + using AllGather::SetBuffFlag; + using AllGather::SetBuffFlagByAdd; + using AllGather::CheckBuffFlag; + using AllGather::ResetIpcFlags; + using AllGather::EndFlagsAndBias; + using AllGather::CrossRankSyncV1; + using AllGather::CrossRankSyncV2; + using AllGather::buff; + using AllGather::gm_out; + using AllGather::ctrl_flags_UB; + using AllGather::output_UB_T; + using AllGather::batch_size; + using AllGather::m; + using AllGather::k; + using AllGather::n; + using AllGather::m0; + using AllGather::k0; + using AllGather::n0; + using AllGather::m_loop; + using AllGather::n_loop; + using AllGather::k_loop; + using AllGather::core_loop; + using AllGather::core_idx; + using AllGather::rank; + using AllGather::rank_size; + using AllGather::tiling_key; + using AllGather::swizzl_count; + using AllGather::p_value; + using AllGather::aiv_idx; + using AllGather::other_rank; + using AllGather::max_ub_single_dma_size; + using AllGather::max_ub_ping_pong_size; + using AllGather::m_align; + using AllGather::k_align; + using AllGather::n_align; + using AllGather::aligned_a; + using AllGather::aligned_b; + using AllGather::cal_count; + using AllGather::gm_a_pingpong_size; + using AllGather::preprocessor; + using AllGather::add_bias_runner; + using AllGather::MoveResultFromSrcToDst; + using AllGather::comm_npu_split; + using AllGather::comm_data_split; + using AllGather::comm_direct; + using AllGather::len_per_loop; + using AllGather::MoveWithSplit; + using AllGather::local_expert_nums; + using AllGather::is_moe; + using AllGather::is_moe_averaged; + using AllGather::is_alltoallvc; + using AllGather::EP; + using AllGather::TP; + int32_t max_move_m; + int32_t max_move_k = 20480; + int32_t copy_core_num; + int32_t m_k_num; + int32_t num_per_rank_move; + int32_t core_count; + int32_t first_step_core_num; + int32_t num_per_move; + __gm__ T *gm_allgather; +}; + +constexpr int32_t NO_BIAS_MASK5 = 0b000000 | 0b100000 | 0b010000 | 0b110000 | 0b001000 | 0b101000 | 0b011000 | 0b111000; +constexpr int32_t BIAS_MASK5 = 0b000010 | 0b100010 | 0b010010 | 0b110010 | 0b001010 | 0b101010 | 0b011010 | 0b111010; + +template +FORCE_INLINE_AICORE void RunAllGatherV2Align16(int32_t tiling_key, COC_ARGS_FUN(T)) +{ + // 16 align + AllGatherV2 allgather_write_align_16_without_bias; + AllGatherV2 allgather_write_align_16_with_bias; + switch (tiling_key) { + case 0b000000: + case 0b100000: + case 0b010000: + case 0b110000: + case 0b001000: + case 0b101000: + case 0b011000: + case 0b111000: + allgather_write_align_16_without_bias.SetArgs(COC_ARGS_CALL()); + 
allgather_write_align_16_without_bias.RunWithSplit(); + break; + case 0b000010: + case 0b100010: + case 0b010010: + case 0b110010: + case 0b001010: + case 0b101010: + case 0b011010: + case 0b111010: + allgather_write_align_16_with_bias.SetArgs(COC_ARGS_CALL()); + allgather_write_align_16_with_bias.RunWithSplit(); + break; + default: + break; + } +} + +template +FORCE_INLINE_AICORE void RunAllGatherV2UnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) +{ + // 16 unalign + AllGatherV2 allgather_write_unalign_16_without_bias; + AllGatherV2 allgather_write_unalign_16_with_bias; + switch (tiling_key) { + case 0b000000: + case 0b100000: + case 0b010000: + case 0b110000: + case 0b001000: + case 0b101000: + case 0b011000: + case 0b111000: + allgather_write_unalign_16_without_bias.SetArgs(COC_ARGS_CALL()); + allgather_write_unalign_16_without_bias.RunWithSplit(); + break; + case 0b000010: + case 0b100010: + case 0b010010: + case 0b110010: + case 0b001010: + case 0b101010: + case 0b011010: + case 0b111010: + allgather_write_unalign_16_with_bias.SetArgs(COC_ARGS_CALL()); + allgather_write_unalign_16_with_bias.RunWithSplit(); + break; + default: + break; + } +} + +template +inline __aicore__ void CocAllGatherMatmulV2Aiv(COC_ARGS_FUN(T)) +{ + // write + AllGatherV2 allgather_write_align_16_without_bias; + AllGatherV2 allgather_write_align_16_with_bias; + AllGatherV2 allgather_write_unalign_16_without_bias; + AllGatherV2 allgather_write_unalign_16_with_bias; + + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = &para->cocTilingData; + int32_t k = cocTilingData->k; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t write_to_other_rank = cocTilingData->write2OtherRank; + // swizzl = 0 transa = 0 transb = 0 splitk = 0 bias = 0 int8 = 0 + if (k % BLOCK_SIZE_16 == 0) { + RunAllGatherV2Align16(tiling_key, COC_ARGS_CALL()); + } else { + RunAllGatherV2UnAlign16(tiling_key, COC_ARGS_CALL()); + } + PipeBarrier(); +} + +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_comm_base.cce index 789ea110..5b3b1209 100644 --- a/comm/lcal/src/kernels/coc_comm_base.cce +++ b/comm/lcal/src/kernels/coc_comm_base.cce @@ -26,7 +26,7 @@ public: FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) { CoCBuffAddrAndArgs coc_buff_and_args(COC_ARGS_CALL()); - for (int i=0; icocTilingData; auto quantInfo = &para->quantInfo; auto twoDimTPInfo = &para->twoDimTPInfo; + auto moeInfo = &para->moeInfo; batch_size = cocTilingData->batchSize; m = cocTilingData->m; k = cocTilingData->k; n = cocTilingData->n; - + m0 = cocTilingData->m0; k0 = cocTilingData->k0; n0 = cocTilingData->n0; @@ -77,7 +90,7 @@ public: rank = cocTilingData->rank; rank_size = cocTilingData->rankSize; buffer_size = cocTilingData->bufferSize; flag_offset = buffer_size * 1024 * 1024 / sizeof(int32_t); p_value = cocTilingData->pValue; max_ub_single_dma_size = cocTilingData->ubMoveNum; withSerialMode = cocTilingData->withSerialMode; @@ -106,6 +119,12 @@ public: rs_dim = twoDimTPInfo->rsDim; inner_dim_is_Ag = twoDimTPInfo->innerDimIsAg; weight_nz = para->weightNz; + + local_expert_nums = moeInfo->local_expert_nums; + TP = moeInfo->TP; + EP = moeInfo->EP; + maxOutputSize = moeInfo->maxOutputSize; + is_moe = moeInfo->isMoe; } FORCE_INLINE_AICORE void SetWorkspace(__gm__ uint8_t *gm_workspace) @@ 
-135,8 +154,8 @@ public: } workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param, - hasFormatDequantScale, is_deterministic); - + hasFormatDequantScale, is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, maxOutputSize); + } FORCE_INLINE_AICORE void SetAicSync(uint64_t flag_idx) @@ -150,8 +169,10 @@ public: WaitEvent(flag_idx + pipe_depth); } + + FORCE_INLINE_AICORE void SetBuffFlag(__ubuf__ int32_t *ctrl_flags_UB, \ - __gm__ int32_t *buff, int32_t flag) + __gm__ int32_t *buff, int32_t flag) { *ctrl_flags_UB = flag; SetFlag(EVENT_ID2); @@ -160,7 +181,7 @@ } FORCE_INLINE_AICORE void SetBuffFlagByAdd(__ubuf__ int32_t *ctrl_flags_UB, \ - __gm__ int32_t *buff, int32_t flag) + __gm__ int32_t *buff, int32_t flag) { PipeBarrier(); *ctrl_flags_UB = flag; @@ -181,14 +202,14 @@ } FORCE_INLINE_AICORE void CheckBuffFlag(__ubuf__ int32_t *ctrl_flags_UB, \ - __gm__ int32_t *buff, int32_t flag) + __gm__ int32_t *buff, int32_t flag) { SetFlag(EVENT_ID1); WaitFlag(EVENT_ID1); while (true) { CopyGmToUbufAlignB16(ctrl_flags_UB, buff, 1, sizeof(int32_t), 0, 0); SetFlag(EVENT_ID3); - WaitFlag(EVENT_ID3); + WaitFlag(EVENT_ID3); // Scalar waits for MTE2 if (*ctrl_flags_UB == flag) { break; } @@ -198,21 +219,21 @@ FORCE_INLINE_AICORE void CrossRankSyncV1(int32_t flag_idx, int32_t flag_data) { if (aiv_idx == 0 && core_idx == rank) { - SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, FLAG_VALUE); + SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, FLAG_VALUE); } else if (aiv_idx == 0 && core_idx < rank_size) { CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, - FLAG_VALUE * flag_data); + FLAG_VALUE * flag_data); } } FORCE_INLINE_AICORE void CrossRankSyncV2(int32_t flag_idx, int32_t flag_data) - { + { if (aiv_idx == 0 && core_idx < rank_size) { SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[core_idx] + flag_offset + flag_idx, FLAG_VALUE); } if (aiv_idx == 0 && core_idx == rank) { CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, - FLAG_VALUE * rank_size * flag_data); + FLAG_VALUE * rank_size * flag_data); } } @@ -228,7 +249,7 @@ FORCE_INLINE_AICORE void CrossRankSyncV4(int32_t flag_idx, int32_t flag_data) { - if (aiv_idx == 0 && core_idx < rank_size) { + if (aiv_idx == 0 && core_idx < rank_size){ if (core_idx != rank) { SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + flag_idx, flag_data); } @@ -236,24 +257,25 @@ } } + FORCE_INLINE_AICORE void ResetIpcFlags(int32_t num_flags) { for (int32_t idx = 0; idx < num_flags; ++idx) { - if (core_idx == 0 && aiv_idx == 0) { - SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + idx, 0); + if (core_idx == 0 && aiv_idx == 0){ + SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + idx, 0); } } }
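+ // A worked example of the per-AIV zero-fill split done below (a sketch; the numbers are illustrative only): with data_size_remain = 100000 elements and total_aiv = 40, each AIV owns DivCeil(100000, 40) = 2500 elements, rounded up to a multiple of 16 -> 2512; AIV i starts at offset i * 2512 and clears its slice in chunks of num_per_call = 128 * 128 = 16384 elements. FORCE_INLINE_AICORE void FillZero(int32_t data_size_remain, __gm__ T *output, \ - int32_t total_aiv, int32_t aiv_idx_in_clean) - { + int32_t total_aiv, int32_t aiv_idx_in_clean){ int32_t repeat_time = 128; int32_t num_per_call = repeat_time * 128; - + // check whether T is float16_t if constexpr (std::is_same::value) { VectorDup(output_UB_T[0], static_cast(0), repeat_time, 1, 8); } + // check whether T is bfloat16_t else if constexpr (std::is_same::value) { VectorDup(output_UB_T[0], static_cast(0), repeat_time, 1, 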
8); } @@ -263,7 +285,7 @@ public: data_size_remain = DivCeil(data_size_remain, total_aiv); data_size_remain = (data_size_remain + 15) / 16 * 16; int32_t offset = aiv_idx_in_clean * data_size_remain; - while (data_size_remain > 0) { + while (data_size_remain > 0){ int32_t data_size = data_size_remain < num_per_call ? data_size_remain : num_per_call; CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size * sizeof(T) / 32, 0, 0); data_size_remain -= data_size; @@ -274,7 +296,7 @@ public: FORCE_INLINE_AICORE void CopyUbToGmTransLayout(__ubuf__ T* ub_buff_st, int32_t actual_move_size, int64_t move_num_offset) { auto ub_buff = ub_buff_st; int32_t left_m = actual_move_size / n0; - while (left_m > 0) { + while (left_m > 0){ int32_t loop_idx = move_num_offset / (m0 * n0); int64_t m_idx, n_idx; GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); @@ -294,11 +316,12 @@ public: } } + FORCE_INLINE_AICORE void CopyGMToGM(__gm__ T* gm_src, __gm__ T* gm_dst, int32_t copy_size) { auto ub0 = output_UB_T[0]; auto ub1 = output_UB_T[1]; int32_t interm_offset = 0; - for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { + for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx){ uint32_t data_size = interm_offset + max_ub_ping_pong_size < copy_size ? max_ub_ping_pong_size : copy_size - interm_offset; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub = (move_idx & 1) ? ub0 : ub1; @@ -312,6 +335,7 @@ public: } } + // linear core-split strategy: supports arbitrary partitioning schemes FORCE_INLINE_AICORE void FirstStepInPeerMemSeq(int32_t data_size_remain, int32_t core_buff_offset) { if (data_size_remain <= 0) { return; @@ -319,7 +343,7 @@ public: auto ub0 = output_UB_T[0]; auto ub1 = output_UB_T[1]; int32_t rank_per_core = (rank_size) / comm_npu_split; - int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; + int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; // each core moves data for a different card for (int32_t rank_idx = 0; rank_idx < rank_per_core; ++rank_idx){ int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core; @@ -327,55 +351,62 @@ public: if (m_rank_idx == rank) { continue; } - if (is_91093 && (m_rank_idx % 2) != (rank % 2)) { + if (is_91093 && (m_rank_idx % 2) != (rank % 2)) { // on 91093, only move cards of the same parity continue; } CopyGMToGM(buff[m_rank_idx] + core_buff_offset, buff[rank] + core_buff_offset, data_size_remain); } }
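+ // Worked schedule for the tree strategy below, with rank_size == 8 and comm_npu_split == 1 (a sketch derived from the loop that follows): turn_atomic_step = 3, so visits 0..2 plain-copy three peer buffers into workspace slots interm[0..2]; visits 3..5 atomic-add the next three peers into the same slots; visit 6 (== rank_size - 2) atomic-adds the last peer directly into this rank's peer memory; the tail then folds the partial sums: interm[1] -> peer mem, interm[2] -> interm[0], interm[0] -> peer mem. + // tree core-split strategy: only supports comm_npu_split = 1 and currently only 4 or 8 cards FORCE_INLINE_AICORE void FirstStepInPeerMemTree(int32_t data_size_remain, int32_t core_buff_offset) { if (data_size_remain <= 0) { return; } int32_t rank_per_core = (rank_size) / comm_npu_split; - int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; - - __gm__ T* gm_reducebuf = reinterpret_cast<__gm__ T *>(workspace_info.gm_reducebuf) + core_idx * len_per_loop * rank_size / 2; + int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core; // each core moves data for a different card + // extra buffer: core_num * len_per_loop[20480] * (rank_size/2) * sizeof(fp16) = 3932160 (~4 MB) + __gm__ T* gm_reducebuf = reinterpret_cast<__gm__ T *>(workspace_info.gm_reducebuf) + core_idx * len_per_loop * rank_size / 2; // each core uses a region of rank_size / 2 * len_per_loop elements + // 7 moves in total; the first 3 are plain copies without atomic add SetAtomicNone(); - int32_t rank_idx = 0; - int32_t turn_atomic_step = rank_size / 2 - 1; - for (int32_t visited = 0; visited < rank_size - 1; visited++) { - if (visited == turn_atomic_step) { - SetAtomicAdd(); + int32_t rank_idx = 0; // initial NPU ID + int32_t turn_atomic_step = rank_size / 2 - 1; // rank_size==8, step=3; rank_size==4, step=1; + for 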
(int32_t visited = 0; visited < rank_size - 1; visited++){ // rank 8 + if (visited == turn_atomic_step) { // after the first (rank/2 - 1) moves, switch to atomic add + SetAtomicAdd(); } - int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core; + int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core; // actual NPU ID if (rank_idx_rot == rank) { rank_idx++; rank_idx_rot = (rank_idx + core_idx) % rank_per_core; } - if (is_91093 && (rank_idx_rot % 2) != (rank % 2)) { + if (is_91093 && (rank_idx_rot % 2) != (rank % 2)) { // on 91093, only move cards of the same parity continue; } - + // destinations: visits 0 1 2 and 3 4 5 target the same slots; the last 3 use atomic add auto gm_interm = gm_reducebuf + (visited % turn_atomic_step) * len_per_loop; - if (visited == rank_size - 2) { + if (visited == rank_size - 2) { // last, atomic add to self peermem gm_interm = buff[rank] + core_buff_offset; } auto gm_peer = buff[rank_idx_rot] + core_buff_offset; CopyGMToGM(gm_peer, gm_interm, data_size_remain); rank_idx++; } - if (rank_size == 8) { + if (rank_size == 8) { // tree accumulation for rank 8 + // interm[1] -> self peermem CopyGMToGM(gm_reducebuf + 1 * len_per_loop, buff[rank] + core_buff_offset, data_size_remain); + // interm[2] -> interm[0] CopyGMToGM(gm_reducebuf + 2 * len_per_loop, gm_reducebuf, data_size_remain); } if (rank_size >= 4) { + // interm[0] -> self peermem CopyGMToGM(gm_reducebuf, buff[rank] + core_buff_offset, data_size_remain); } + } + // original strategy: each core handles the moves for one NPU FORCE_INLINE_AICORE void FirstStepInPeerMem(int32_t data_size_remain, __gm__ T *input, __gm__ T *output, bool atomic_add = false) { if (data_size_remain <= 0) { return; @@ -385,16 +416,52 @@ public: PipeBarrier(); } int32_t offset = 0; - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); // MTE2 waits for MTE3 + SetFlag(EVENT_ID1); // MTE2 waits for MTE3 CopyGMToGM(input, output, data_size_remain); - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // MTE2 waits for MTE3 if (atomic_add) { - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); // Scalar waits for MTE3 + WaitFlag(EVENT_ID0); SetAtomicNone(); - PipeBarrier(); + PipeBarrier(); + } + } + +
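+ // Double-buffering pattern used by CopyGMToGM above and the loop below (a sketch): odd move_idx stages data through output_UB_T[0] guarded by EVENT_ID0, even move_idx through output_UB_T[1] guarded by EVENT_ID1, so the MTE3 write-out of one chunk overlaps the MTE2 read-in of the next; each chunk is at most max_ub_ping_pong_size elements. + // FirstStepInPeerMem plus layout-transformed output + FORCE_INLINE_AICORE void FirstStepInPeerMemTransLayout(int32_t data_size_remain, __gm__ T *input, __gm__ T *output, int32_t out_offset = -1, bool atomic_add = false) { + if (data_size_remain <= 0) { + return; + } + if (atomic_add) { + SetAtomicAdd(); + PipeBarrier(); + } + int32_t offset = 0; + SetFlag(EVENT_ID0); // MTE2 waits for MTE3 + SetFlag(EVENT_ID1); // MTE2 waits for MTE3 + for (int32_t move_idx = 0; data_size_remain > 0; ++move_idx){ + uint32_t data_size = data_size_remain > max_ub_ping_pong_size ? max_ub_ping_pong_size : data_size_remain; + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub = (move_idx & 1) ? 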
output_UB_T[0] : output_UB_T[1]; + WaitFlag(event_id); + CopyGmToUbuf(ub, input + offset, 1, data_size * sizeof(T) / 32, 0, 0); + SetFlag(event_id); // MTE3 waits for MTE2 + WaitFlag(event_id); + CopyUbufToGm(output + offset, ub, 1, data_size * sizeof(T) / 32, 0, 0); + CopyUbToGmTransLayout(ub, data_size, out_offset + offset); + SetFlag(event_id); // MTE2 waits for MTE3 + data_size_remain -= data_size; + offset += data_size; + } + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // MTE2 waits for MTE3 + if (atomic_add) { + SetFlag(EVENT_ID0); // Scalar waits for MTE3 + WaitFlag(EVENT_ID0); + SetAtomicNone(); + PipeBarrier(); } } @@ -404,6 +471,12 @@ __ubuf__ int32_t *ctrl_flags_UB = (__ubuf__ int32_t *)(0); __ubuf__ T *output_UB_T[2] = {(__ubuf__ T *)(32), (__ubuf__ T *)(97440)}; + __gm__ int32_t *num_local_tokens_per_expert; + __gm__ int32_t *num_global_tokens_per_local_expert; + __gm__ int32_t *global_tokens_per_expert_matrix; + int32_t expert_nums, local_expert_nums, TP, EP, maxOutputSize; + int32_t is_moe, is_moe_averaged, is_alltoallvc; + int32_t batch_size; int32_t m; int32_t k; @@ -449,10 +522,10 @@ public: int32_t core_count; int32_t extra_ub_move_num; - int32_t extra_comm_npu_split; - int32_t extra_comm_data_split; - int32_t extra_comm_direct; - int32_t extra_len_per_loop; + int32_t extra_comm_npu_split; // used by 2D-TP allreduce + int32_t extra_comm_data_split; // used by 2D-TP allreduce + int32_t extra_comm_direct; // used by 2D-TP allreduce + int32_t extra_len_per_loop; // used by 2D-TP allreduce bool is_deterministic; QuantGranularity dequant_granularity; diff --git a/comm/lcal/src/kernels/coc_const_args.cce index 2473f80c..9832d294 100644 --- a/comm/lcal/src/kernels/coc_const_args.cce +++ b/comm/lcal/src/kernels/coc_const_args.cce @@ -33,7 +33,7 @@ constexpr int32_t AIV_FINISH_DEQUANT_FLAG_ID = 11; constexpr int32_t AIC_WAIT_AIV_FINISH_ALIGN_FLAG_ID = 12; constexpr int32_t AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID = 13; -constexpr int32_t A3_DIE_NUM = 2; +constexpr int32_t A3_DIE_NUM = 2; // each card has two dies constexpr int32_t BLOCK_SIZE_16 = 16; constexpr int32_t BLOCK_SIZE_32 = 32; constexpr int32_t SWIZZL_MASK = 0b100000; @@ -46,11 +46,11 @@ constexpr int32_t QUANT_SHIFT = 16; constexpr int32_t MAX_BLOCK_COUNT = 2; constexpr int32_t BLOCK_COUNT_3 = 3; constexpr int32_t BLOCK_COUNT_4 = 4; -constexpr int32_t L0AB_PINGPONG_BUFFER_LEN = 16384; -constexpr int32_t CUBE_MATRIX_SIZE = 256; -constexpr int64_t L1_PINGPONG_BUFFER_LEN = 131072; +constexpr int32_t L0AB_PINGPONG_BUFFER_LEN = 16384; // 32 KB +constexpr int32_t CUBE_MATRIX_SIZE = 256; // 16 * 16 +constexpr int64_t L1_PINGPONG_BUFFER_LEN = 131072; // 256 KB constexpr int32_t MAX_CORE_NUM = 25; -constexpr int64_t MAX_UB_BUFF = 196608; +constexpr int64_t MAX_UB_BUFF = 196608; // 192 * 1024 bytes constexpr int32_t ADD_REPEAT_TIME = 4; constexpr int32_t FLAG_ZERO_IDX = 0; constexpr int32_t FLAG_ONE_IDX = 1; @@ -63,17 +63,25 @@ constexpr int32_t VEC_BLOCK_PER_REPEAT = 8; constexpr uint8_t REPEAT_PER_LOOP = 255; constexpr uint32_t PPMATMUL_RUN_PURE_MATMUL = 1; constexpr uint32_t PPMATMUL_RUN_MATMUL_ALLREDUCE = 2; +constexpr uint32_t PPMATMUL_RUN_MATMUL_REDUCE_SCATTER = 3; +constexpr uint32_t PPMATMUL_RUN_ALL_GATHER_MATMUL = 4; +constexpr uint32_t PPMATMUL_RUN_ALL_GATHER_MATMUL_V2 = 5; constexpr int32_t LCAL_2DTP_C_OFFSET = 100 * 1024 * 1024 / sizeof(half); constexpr uint32_t PPMATMUL_RUN_ALL_GATHER_MATMUL_REDUCE_SCATTER = 6; +constexpr uint32_t PPMATMUL_RUN_ALL_GATHER_MATMUL_SIO = 7; constexpr int32_t HCCS_TOTAL_CORE_NUM = 8;
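+// The byte-size comments above can be machine-checked; a minimal sketch, assuming the CCE toolchain accepts C++11 static_assert at file scope (half is already used above for LCAL_2DTP_C_OFFSET): +static_assert(L0AB_PINGPONG_BUFFER_LEN * sizeof(half) == 32 * 1024, "L0AB ping-pong buffer holds 32 KB of half data"); +static_assert(MAX_UB_BUFF == 192 * 1024, "UB budget is 192 KB"); constexpr int32_t 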
SIO_TOTAL_CORE_NUM = 8; constexpr uint64_t WORKSPACE_REDUCE_SIZE = 4000000; constexpr int32_t TWOD_DATA_SPLIT_DEFAULT = 2; constexpr int32_t TWOD_LEN_PER_LOOP_DEFAULT = 5120; + +constexpr uint32_t PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL = 13; +constexpr uint32_t PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN = 15; +constexpr uint32_t PPMATMUL_RUN_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN = 16; constexpr int LCAL_BUFF_BYTES = 204 * 1024 * 1024; -constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; -constexpr int32_t FLAG_OFFSET = (LCAL_BUFF_BYTES - FLAG_BUFF_BYTES) / sizeof(int32_t); +constexpr int32_t FLAG_BUFF_BYTES = 5 * 512 * 1024; // 2.5MB +constexpr int32_t FLAG_OFFSET = (LCAL_BUFF_BYTES - FLAG_BUFF_BYTES) / sizeof(int32_t); // 201.5 * 1024 * 1024 enum QuantGranularity : int { QUANT_GRANULARITY_UNDEFINED = -1, @@ -85,6 +93,7 @@ enum QuantGranularity : int { QUANT_GRANULARITY_MAX = 5, }; + template struct BaseBlock { static_assert((SIZE & (SIZE - 1)) == 0, "Invalid block size"); @@ -122,13 +131,17 @@ using Block512B = BaseBlock; template struct CoCCommArgs { - int rank; + int rank; // attr rank_id, global rank int localRank; - int rankSize; + int rankSize; // global rank size int localRankSize; uint32_t extraFlag; __gm__ T *peerMems[LCAL_MAX_RANK_SIZE]; int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE]; }; -#endif // LCAL_COC_CONST_ARGS_H +#endif // LCAL_COC_CONST_ARGS_H \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce index 053fc092..b2f4a400 100644 --- a/comm/lcal/src/kernels/coc_dequant_runner.cce +++ b/comm/lcal/src/kernels/coc_dequant_runner.cce @@ -24,7 +24,9 @@ template <> class LoopDequanter { public: static constexpr int32_t max_len = 9792; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() { SetFlag(EVENT_ID0); @@ -51,18 +53,25 @@ public: int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); + WaitFlag(event_id); CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); SetFlag(event_id); + WaitFlag(event_id); Vadds(ub_adds, ub_in, offset, repeat, 1, 1, 8, 8); SetFlag(event_id); PipeBarrier(); + Vconv(ub_adds_f32, ub_adds, repeat, 1, 1, 8, 8); + PipeBarrier(); + Vmuls(ub_muls, ub_adds_f32, scale, repeat, 1, 1, 8, 8); + PipeBarrier(); + WaitFlag(event_id); Vconv(ub_out, ub_muls, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(event_id); @@ -89,7 +98,9 @@ template <> class LoopDequanter { public: static constexpr int32_t max_len = 8192; + inline __aicore__ LoopDequanter() = default; + inline __aicore__ void SetForLoop() { SetFlag(EVENT_ID0); @@ -97,6 +108,7 @@ public: SetFlag(EVENT_ID2); SetFlag(EVENT_ID0); } + inline __aicore__ void WaitForLoop() { WaitFlag(EVENT_ID0); @@ -104,18 +116,21 @@ public: WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID0); } +
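+ // The bf16 Loop below implements per-channel dequantization (a sketch of the math; the rounding mode comes from the Vconv call further down): out[r][c] = bf16(float(src[r][c]) * scale[c]) with CAST_RINT rounding, e.g. src = -7 and scale[c] = 0.125f give -0.875, which bf16 represents exactly. inline __aicore__ void Loop(__gm__ bfloat16_t *dst, __gm__ int32_t *src, __gm__ float32_t *scale, int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t src_stride, int32_t dst_stride) { is_ping = !is_ping; auto ub_in = is_ping ? ub_in0 : ub_in1; auto event_id = is_ping ? 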
EVENT_ID0 : EVENT_ID1; + int32_t n_blocks = Block32B::Count(n_cols_this_loop) * (sizeof(int32_t) / sizeof(bfloat16_t)); int32_t ubuf_gap = n_blocks - Block32B::Count(n_cols_this_loop); WaitFlag(event_id); CopyGmToUbufAlign(ub_in, src, n_rows_this_loop, n_cols_this_loop, src_stride - n_cols_this_loop, ubuf_gap); SetFlag(event_id); + WaitFlag(event_id); Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 8); SetFlag(event_id); @@ -124,20 +139,25 @@ public: if (scale_rows == 0 || scale_source != scale) { scale_rows = 1; scale_source = scale; + CopyGmToUbufAlign(ub_scale, scale, 1, n_cols_this_loop, 0); } SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); for (; scale_rows < n_rows_this_loop; ++scale_rows) { CopyUB2UB(ub_scale + scale_rows * n_blocks * Block32B::size, ub_scale, 0, 1, n_blocks, 0, 0); } PipeBarrier(); + Vmul(ub_mul, ub_in_f32, ub_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID2); + WaitFlag(EVENT_ID0); Vconv(ub_out, ub_mul, repeat, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); CopyUbufToGmAlign(dst, ub_out, n_rows_this_loop, n_cols_this_loop, dst_stride - n_cols_this_loop); SetFlag(EVENT_ID0); @@ -164,7 +184,7 @@ public: inline __aicore__ LoopPerTokenDequanter(int32_t n0) { - n_round = (n0 + 127) / 128 * 128; // n_this_loop + 127 / 128是需要的repeat数, 每个repeat占用8个blocks + n_round = (n0 + 127) / 128 * 128; // n_this_loop + 127 / 128是需要的repeat数,每个repeat占用8个blocks ub_in0 = reinterpret_cast<__ubuf__ T *>((uintptr_t)0); ub_in1 = reinterpret_cast<__ubuf__ T *>(ub_in0 + max_len); ub_out = reinterpret_cast<__ubuf__ T *>(ub_in1 + max_len); @@ -178,6 +198,8 @@ public: SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); + + SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); } @@ -187,6 +209,8 @@ public: WaitFlag(EVENT_ID0); WaitFlag(EVENT_ID1); WaitFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID2); } @@ -194,6 +218,7 @@ public: inline __aicore__ void Loop(__gm__ T *buff, __gm__ float32_t *scale, int32_t n_rows_this_loop, int32_t n_cols_this_loop, int32_t stride) { + is_ping = !is_ping; auto ub_in = is_ping ? ub_in0 : ub_in1; auto event_id = is_ping ? 
EVENT_ID0 : EVENT_ID1; @@ -201,10 +226,12 @@ public: WaitFlag(event_id); CopyGmToUbufAlign(ub_in, buff, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); SetFlag(event_id); + WaitFlag(event_id); Vconv(ub_in_f32, ub_in, repeat, 1, 1, 8, 4); SetFlag(event_id); + WaitFlag(EVENT_ID2); WaitFlag(EVENT_ID2); if (scale_source != scale) { @@ -213,9 +240,11 @@ public: } SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); - WaitFlag(EVENT_ID2); // 注意必须是MTE2_S,不能是MTE2_V,否则会读到0, 造成乱码 - WaitFlag(EVENT_ID2); + WaitFlag(EVENT_ID2); // 注意必须是MTE2_S,不能是MTE2_V,否则会读到0,造成乱码 + WaitFlag(EVENT_ID2); PipeBarrier(); for (int32_t row = 0; row < n_rows_this_loop; ++row) { float32_t scale = ub_scales[row]; @@ -227,10 +256,13 @@ public: SetFlag(EVENT_ID2); SetFlag(EVENT_ID2); + + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(buff, ub_out, n_rows_this_loop, n_cols_this_loop, stride - n_cols_this_loop, ubufGap); SetFlag(EVENT_ID2); } + private: static constexpr uint8_t repeat = 128; __ubuf__ T *ub_in0 = nullptr; @@ -247,7 +279,9 @@ private: class LoopScaleFormater { public: static constexpr int32_t max_len = 8160; + inline __aicore__ LoopScaleFormater() = default; + inline __aicore__ void SetForLoop() { set_ctrl(sbitset1(get_ctrl(), 59)); @@ -277,11 +311,13 @@ public: WaitFlag(event_id); CopyGmToUbufAlign(ub_in, src, 1, len, 0); SetFlag(event_id); + WaitFlag(event_id); WaitFlag(event_id); Vconv(ub_vconv, ub_in, repeat, 1, 1, 4, 8); SetFlag(event_id); SetFlag(event_id); + WaitFlag(event_id); CopyUbufToGmAlign(dst, ub_out, 1, len, 0); SetFlag(event_id); @@ -295,11 +331,13 @@ private: __ubuf__ int32_t *ub_vconv1 = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)98304); __ubuf__ float32_t *ub_out0 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)65536); __ubuf__ float32_t *ub_out1 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)98304); + bool is_ping = false; }; class BaseDequantRunner { public: + class TileLoopIter { public: inline __aicore__ TileLoopIter(int32_t m_this_tile, int32_t n_this_tile) @@ -321,14 +359,13 @@ public: m_this_loop = max_m_per_loop > m_this_subcore ? m_this_subcore : max_m_per_loop; n_this_loop = n_this_subcore; } - inline __aicore__ void Init(int32_t max_len, int32_t n0) // max_len = 8192或者9792 { - // Block32B::AlignUp: 扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 - // m_this_subcore最大值: max_len / n_this_subcore, 16384/256=64 - int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); + // Block32B::AlignUp:扩展到32/sizeof(half)的倍数,也就是扩展到16的倍数 + // m_this_subcore最大值:max_len / n_this_subcore, 16384/256=64 + int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); m_complete = 0; - m_this_loop = max_m_per_loop > m_this_subcore ? m_this_subcore : max_m_per_loop; // 本次loop所处理的m, 最大为max_m_per_loop + m_this_loop = max_m_per_loop > m_this_subcore ? 
m_this_subcore : max_m_per_loop; // 本次loop所处理的m,最大为max_m_per_loop n_this_loop = n_this_subcore; // 本次loop所处理的n } @@ -352,10 +389,13 @@ public: { return m_offset_this_subcore + m_complete; } + int32_t m_this_subcore; int32_t n_this_subcore; + int32_t m_this_loop; int32_t n_this_loop; + int32_t m_offset_this_subcore; int32_t m_complete; }; @@ -368,15 +408,18 @@ public: this->gm_accum = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_accum); this->gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_formate_dequant_scale); this->gm_out = gm_out; + this->gm_dequant_scale = gm_dequant_scale; this->gm_dequant_offset = gm_dequant_offset; this->dequant_granularity = dequant_granularity; + this->batch_size = batch_size; this->m = m; this->n = n; + if (dequant_granularity == QuantGranularity::PER_TENSOR) { gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale); - } else if (dequant_granularity == QuantGranularity::PER_CHANNEL) { + } else if (dequant_granularity == QuantGranularity::PER_CHANNEL){ FormatScale(); } else { gm_format_dequant_scale = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale); @@ -391,6 +434,7 @@ public: int32_t align_core_idx = get_block_idx() * get_subblockdim() + get_subblockid(); int32_t align_core_num = get_block_num() * get_subblockdim(); + int32_t len = LoopScaleFormater::max_len; int32_t loop_num = DivCeil(n, len); LoopScaleFormater loop_scale_formater; @@ -400,12 +444,12 @@ public: if (offset + len > n) { len = n - offset; } + loop_scale_formater.Loop(gm_format_dequant_scale + offset, gm_dequant_scale + offset, len); + } + loop_scale_formater.WaitForLoop(); - loop_scale_formater.Loop(gm_format_dequant_scale + offset, gm_dequant_scale + offset, len); + Barrier(); } - loop_scale_formater.WaitForLoop(); - Barrier(); -} protected: inline __aicore__ void Barrier() @@ -416,11 +460,13 @@ protected: __gm__ int32_t *gm_accum; __gm__ bfloat16_t *gm_out; + __gm__ int64_t *gm_dequant_scale; __gm__ int32_t *gm_dequant_offset; QuantGranularity dequant_granularity; __gm__ float32_t *gm_format_dequant_scale; + int32_t batch_size; int32_t m; int32_t k; @@ -448,6 +494,7 @@ public: } n_cols_this_core = n_cols; col_offset_this_core = 0; + core_offset = row_offset_this_core * n_cols; } @@ -471,72 +518,75 @@ public: batch_offset = static_cast(batch_idx) * n_rows * n_cols; } - - inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop) - { - n_rows_complete = 0; - n_rows_this_loop = (n_rows_this_core < max_rows_per_loop) ? n_rows_this_core : max_rows_per_loop; - row_offset = 0; - } - - inline __aicore__ bool EndRowLoop() const - { - return n_rows_complete == n_rows_this_core; - } - - inline __aicore__ void NextRowLoop() - { - n_rows_complete += n_rows_this_loop; - if (EndRowLoop()) { - return; + inline __aicore__ void InitRowLoop(int32_t max_rows_per_loop) + { + n_rows_complete = 0; + n_rows_this_loop = (n_rows_this_core < max_rows_per_loop) ? n_rows_this_core : max_rows_per_loop; + row_offset = 0; } - if (n_rows_complete + n_rows_this_loop > n_rows_this_core) { - n_rows_this_loop = n_rows_this_core - n_rows_complete; + + inline __aicore__ bool EndRowLoop() const + { + return n_rows_complete == n_rows_this_core; } - row_offset = n_rows_complete; - } - inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) - { - n_cols_complete = 0; - n_cols_this_loop = (n_cols < max_cols_per_loop) ? 
n_cols : max_cols_per_loop; - col_offset = 0; - } + inline __aicore__ void NextRowLoop() + { + n_rows_complete += n_rows_this_loop; + if (EndRowLoop()) { + return; + } + if (n_rows_complete + n_rows_this_loop > n_rows_this_core) { + n_rows_this_loop = n_rows_this_core - n_rows_complete; + } + row_offset = n_rows_complete; + } - inline __aicore__ bool EndColLoop() const - { - return n_cols_complete == n_cols_this_core; - } + inline __aicore__ void InitColLoop(int32_t max_cols_per_loop) + { + n_cols_complete = 0; + n_cols_this_loop = (n_cols < max_cols_per_loop) ? n_cols : max_cols_per_loop; + col_offset = 0; + } - inline __aicore__ void NextColLoop() - { - n_cols_complete += n_cols_this_loop; - if (EndColLoop()) { - return; + inline __aicore__ bool EndColLoop() const + { + return n_cols_complete == n_cols_this_core; } - if (n_cols_complete + n_cols_this_loop > n_cols_this_core) { - n_cols_this_loop = n_cols_this_core - n_cols_complete; + + inline __aicore__ void NextColLoop() + { + n_cols_complete += n_cols_this_loop; + if (EndColLoop()) { + return; + } + if (n_cols_complete + n_cols_this_loop > n_cols_this_core) { + n_cols_this_loop = n_cols_this_core - n_cols_complete; + } + col_offset = n_cols_complete; } - col_offset = n_cols_complete; - } - inline __aicore__ int64_t offset() const - { - return core_offset + row_offset * n_cols + col_offset; - } + inline __aicore__ int64_t offset() const + { + return core_offset + row_offset * n_cols + col_offset; + } int32_t batch_size; int32_t n_rows; int32_t n_cols; + int32_t n_rows_this_core; int32_t n_cols_this_core; int64_t row_offset_this_core; int64_t col_offset_this_core; + int32_t batch_idx; int32_t n_rows_complete; int32_t n_cols_complete; + int32_t n_rows_this_loop; int32_t n_cols_this_loop; + int64_t core_offset; int64_t batch_offset; int64_t row_offset; @@ -563,6 +613,7 @@ public: default: break; } + Barrier(); } @@ -570,6 +621,7 @@ private: inline __aicore__ void DequantPerTensor() { float32_t scale = gm_format_dequant_scale[0]; + const auto max_len = LoopDequanter::max_len; int32_t n_round = Block32B::AlignUp(n); int32_t max_m_per_loop = (n_round <= max_len) ? 
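/*
  LoopIter's row and column loops follow one fixed idiom: Init picks the chunk
  size, End compares completed against total, and Next first commits the finished
  chunk and only then clips the next one. A minimal standalone mirror of the row
  loop (names simplified, per-core offsets omitted):

      #include <cstdint>
      #include <cstdio>

      struct RowLoop {
          int32_t total = 0, now = 0, done = 0, off = 0;
          void Init(int32_t total_rows, int32_t max_per_loop) {
              total = total_rows; done = 0; off = 0;
              now = total < max_per_loop ? total : max_per_loop;
          }
          bool End() const { return done == total; }
          void Next() {
              done += now;
              if (End()) return;
              if (done + now > total) now = total - done;   // clip the last chunk
              off = done;
          }
      };

      int main() {
          RowLoop it;
          for (it.Init(10, 4); !it.End(); it.Next())
              std::printf("%d rows at offset %d\n", it.now, it.off);
          // 4 at 0, 4 at 4, 2 at 8; offset() then adds core_offset + off * n_cols
          return 0;
      }
*/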
(max_len / n_round) : 1; @@ -613,37 +665,42 @@ private: } loop_dequanter.WaitForLoop(); } + }; + + template -class SerialPerTokenDequantRunner : public SerialDequantRunner { +class SerialPerTokenDequantRunner : public SerialDequantRunner{ public: __aicore__ explicit SerialPerTokenDequantRunner() = default; inline __aicore__ void SetArgs(__gm__ T *gm_out, - __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0) - { - this->gm_out = reinterpret_cast<__gm__ T *>(gm_out); - this->gm_dequant_scale_pertoken = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale_pertoken); - this->m = m; - this->n = n; - this->m0 = m0; - this->n0 = n0; - } - inline __aicore__ void Run() { - const auto max_len = LoopPerTokenDequanter::max_len; - int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); - LoopIter it(1, m, n); - LoopPerTokenDequanter loop_dequanter(n0); - loop_dequanter.SetForLoop(); - for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { - for (it.InitColLoop(n0); !it.EndColLoop(); it.NextColLoop()) { - __gm__ T * dst_add = gm_out + it.offset(); - __gm__ float32_t * scale = gm_dequant_scale_pertoken + it.row_offset + it.row_offset_this_core; - loop_dequanter.Loop(dst_add, scale, it.n_rows_this_loop, it.n_cols_this_loop, n); - } + __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0) + { + this->gm_out = reinterpret_cast<__gm__ T *>(gm_out); + this->gm_dequant_scale_pertoken = reinterpret_cast<__gm__ float32_t *>(gm_dequant_scale_pertoken); + this->m = m; + this->n = n; + this->m0 = m0; + this->n0 = n0; + } + + inline __aicore__ void Run() { + const auto max_len = LoopPerTokenDequanter::max_len; + int32_t max_m_per_loop = max_len / ((n0 + 127) / 128 * 128); + LoopIter it(1, m, n); + LoopPerTokenDequanter loop_dequanter(n0); + loop_dequanter.SetForLoop(); + for (it.InitRowLoop(max_m_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { + for (it.InitColLoop(n0); !it.EndColLoop(); it.NextColLoop()) { + __gm__ T * dst_add = gm_out + it.offset(); + __gm__ float32_t * scale = gm_dequant_scale_pertoken + it.row_offset + it.row_offset_this_core; + loop_dequanter.Loop(dst_add, scale, it.n_rows_this_loop, it.n_cols_this_loop, n); } - loop_dequanter.WaitForLoop(); } + loop_dequanter.WaitForLoop(); + } + private: __gm__ T *gm_out; @@ -654,44 +711,49 @@ private: int32_t n0; }; + + class FusedDequantRunner : public BaseDequantRunner { public: __aicore__ explicit FusedDequantRunner() = default; - inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo & workspace_info, - __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, - int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, int32_t p_value, int32_t rank_size) - { + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, int32_t p_value, int32_t rank_size) + { BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, - batch_size, m, n); + batch_size, m, n); + //cit.SetArgs(m, n, m0, n0, m_loop, n_loop, core_loop, 
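/*
  The serial dequant flavors differ only in how the scale is indexed: PER_TENSOR
  uses one scalar, PER_TOKEN one scale per row, PER_CHANNEL one per column. A
  scalar reference of the math the vector pipeline implements (a sketch; the
  kernels do the same with Vconv/Vmuls over UB tiles):

      #include <cstdint>
      #include <vector>

      enum class Gran { PerTensor, PerToken, PerChannel };

      std::vector<float> Dequant(const std::vector<int32_t>& accum, int m, int n,
                                 const std::vector<float>& scale, Gran g) {
          std::vector<float> out(static_cast<size_t>(m) * n);
          for (int i = 0; i < m; ++i)
              for (int j = 0; j < n; ++j) {
                  float s = (g == Gran::PerTensor) ? scale[0]
                          : (g == Gran::PerToken)  ? scale[i]    // row (token) scale
                                                   : scale[j];   // column (channel) scale
                  out[static_cast<size_t>(i) * n + j] =
                      static_cast<float>(accum[static_cast<size_t>(i) * n + j]) * s;
              }
          return out;
      }

  For PER_CHANNEL, FormatScale above first rewrites the int64 scale words into a
  float32 array so this inner loop stays a plain multiply.
*/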
swizzle_direct, swizzle_count, p_value); core_num = get_block_num(); core_idx = get_block_idx(); - this-> m0 = m0; - this-> n0 = n0; - this-> m_loop = m_loop; - this-> n_loop = n_loop; - this-> core_loop = core_loop; - this-> swizzl_direct = swizzl_direct; - this-> swizzl_count = swizzl_count; - this-> loop_num_per_comm = p_value * core_num; - this-> p_value = p_value; - this-> rank_size = rank_size; + this -> m0 = m0; + this -> n0 = n0; + this -> m_loop = m_loop; + this -> n_loop = n_loop; + this -> core_loop = core_loop; + this->swizzl_direct = swizzl_direct; + this->swizzl_count = swizzl_count; + + this->loop_num_per_comm = p_value * core_num; + this -> p_value = p_value; + this -> rank_size = rank_size; + } inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) { switch (dequant_granularity) { - case QuantGranularity:: PER_TENSOR : + case QuantGranularity::PER_TENSOR: DequantAllReducePerTensor(cal_idx); return; - case QuantGranularity:: PER_CHANNEL : + case QuantGranularity::PER_CHANNEL: DequantAllReducePerChannel(cal_idx); return; - case QuantGranularity:: PER_TOKEN : + case QuantGranularity::PER_TOKEN: DequantAllReducePerChannel(cal_idx); return; - case QuantGranularity:: FLOAT32_SCALE_PER_CHANNEL : + case QuantGranularity::FLOAT32_SCALE_PER_CHANNEL: DequantAllReducePerChannel(cal_idx); return; default: @@ -699,6 +761,9 @@ public: } } + + + inline __aicore__ void DequantAllReducePerChannel(int32_t cal_idx) { LoopDequanter loop_dequanter; @@ -716,8 +781,8 @@ public: int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; TileLoopIter tit(m_actual, n_actual); - int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + - (loop_idx % loop_num_per_comm) * m0 * n0; + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; @@ -748,8 +813,8 @@ public: int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; TileLoopIter tit(m_actual, n_actual); - int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + - (loop_idx % loop_num_per_comm) * m0 * n0; + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; @@ -761,26 +826,191 @@ public: loop_dequanter.WaitForLoop(); } + inline __aicore__ void RunDequantReduceScatter(int32_t cal_idx) + { + switch (dequant_granularity) { + case QuantGranularity::PER_TENSOR: + DequantReduceScatterPerTensor(cal_idx); + return; + case QuantGranularity::PER_CHANNEL: + DequantReduceScatterPerChannel(cal_idx); + return; + case QuantGranularity::PER_TOKEN: + DequantReduceScatterPerChannel(cal_idx); + return; + case QuantGranularity::FLOAT32_SCALE_PER_CHANNEL: + DequantReduceScatterPerChannel(cal_idx); + return; + default: + return; + } + } + inline __aicore__ void DequantReduceScatterPerChannel(int32_t cal_idx) + { + LoopDequanter loop_dequanter; + loop_dequanter.SetForLoop(); + int32_t m_loop_per_rank = m_loop / rank_size; + //int32_t pipe_depth = is_91093 ? 
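/*
  The fused runners walk tiles in communication rounds: round cal_idx owns
  loop_num_per_comm = p_value * core_num consecutive tiles, each core takes
  p_value of them, and results are staged in a ring of pipe_depth slots so the
  dequant of round k can overlap the communication of round k+1. A sketch of the
  index and offset arithmetic (toy sizes):

      #include <cstdint>
      #include <cstdio>

      int main() {
          const int core_num = 4, p_value = 2, pipe_depth = 2;
          const int64_t m0 = 128, n0 = 256;
          const int loop_num_per_comm = p_value * core_num;
          for (int cal_idx = 0; cal_idx < 2; ++cal_idx) {
              int flag_idx = cal_idx % pipe_depth;            // staging slot in the ring
              for (int p = 0; p < p_value; ++p)
                  for (int core_idx = 0; core_idx < core_num; ++core_idx) {
                      int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx;
                      int64_t offset_this_tile =
                          (int64_t)flag_idx * loop_num_per_comm * m0 * n0 +
                          (loop_idx % loop_num_per_comm) * (m0 * n0);
                      std::printf("round %d core %d p %d -> tile %d, slot offset %lld\n",
                                  cal_idx, core_idx, p, loop_idx, (long long)offset_this_tile);
                  }
          }
          return 0;
      }

  GetBlockIdx then maps each loop_idx to a swizzled (m_idx, n_idx) tile position.
*/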
BLOCK_COUNT_4 : MAX_BLOCK_COUNT; + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t comm_num = DivCeil(core_loop, loop_num_per_comm); + int32_t actual_loop_num = loop_num_per_comm; + if (cal_idx == comm_num - 1) { + actual_loop_num = core_loop - cal_idx * loop_num_per_comm; + } + + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + + int32_t in_batch_idx = loop_idx % (m_loop * n_loop); + int64_t rank_idx = in_batch_idx % rank_size; + int32_t in_rank_idx = in_batch_idx / rank_size; + + int64_t m_idx, n_idx; + GetBlockIdx(in_rank_idx, m_loop_per_rank, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop_per_rank - 1)) ? (m / rank_size - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + + TileLoopIter tit(m_actual, n_actual); + int64_t rank_offset_c = (loop_idx % rank_size) * (actual_loop_num / rank_size) * m0 * n0; + int64_t offset_this_tile = flag_idx * m0 * loop_num_per_comm * n0 + rank_offset_c + + + ((loop_idx % loop_num_per_comm) / rank_size) * m0 * n0; + + for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { + int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + auto accum = gm_accum + src_offset; + auto out = gm_out + dst_offset; + auto scale = gm_format_dequant_scale + n_idx * n0; + loop_dequanter.Loop(out, accum, scale, tit.m_this_loop, tit.n_this_loop, n0, n0); + } + } + loop_dequanter.WaitForLoop(); + } + + inline __aicore__ void DequantReduceScatterPerTensor(int32_t cal_idx) + { + LoopDequanter loop_dequanter; + loop_dequanter.SetForLoop(); + float32_t scale = gm_format_dequant_scale[0]; + int32_t m_loop_per_rank = m_loop / rank_size; + //int32_t pipe_depth = is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT; + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t comm_num = DivCeil(core_loop, loop_num_per_comm); + int32_t actual_loop_num = loop_num_per_comm; + if (cal_idx == comm_num - 1) { + actual_loop_num = core_loop - cal_idx * loop_num_per_comm; + } + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + + int32_t in_batch_idx = loop_idx % (m_loop * n_loop); + int64_t rank_idx = in_batch_idx % rank_size; + int32_t in_rank_idx = in_batch_idx / rank_size; + + int64_t m_idx, n_idx; + GetBlockIdx(in_rank_idx, m_loop_per_rank, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop_per_rank - 1)) ? (m / rank_size - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? 
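/*
  In the reduce-scatter variants consecutive tiles are dealt round-robin across
  ranks: in_batch_idx % rank_size names the rank whose output slice the tile
  belongs to, and in_batch_idx / rank_size is its position inside that slice. A
  tiny sketch of the split:

      #include <cstdio>

      int main() {
          const int rank_size = 4;
          for (int loop_idx = 0; loop_idx < 8; ++loop_idx) {
              int rank_idx    = loop_idx % rank_size;   // owning rank's slice
              int in_rank_idx = loop_idx / rank_size;   // tile index inside the slice
              std::printf("tile %d -> rank %d, slot %d\n", loop_idx, rank_idx, in_rank_idx);
          }
          return 0;
      }

  rank_offset_c applies the same idea to the staging buffer: within one round,
  each rank's slice occupies (actual_loop_num / rank_size) consecutive m0 x n0
  tiles.
*/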
(n - n_idx * n0) : n0; + + TileLoopIter tit(m_actual, n_actual); + int64_t rank_offset_c = (loop_idx % rank_size) * (actual_loop_num / rank_size) * m0 * n0; + int64_t offset_this_tile = flag_idx * m0 * loop_num_per_comm * n0 + rank_offset_c + + + ((loop_idx % loop_num_per_comm) / rank_size) * m0 * n0; + + for (tit.Init(LoopDequanter::max_len); !tit.End(); tit.Next()) { + int64_t src_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + int64_t dst_offset = offset_this_tile + tit.m_offset_in_tile() * n0; + auto accum = gm_accum + src_offset; + auto out = gm_out + dst_offset; + loop_dequanter.Loop(out, accum, scale, 0, tit.m_this_loop, tit.n_this_loop, n0, n0); + } + } + loop_dequanter.WaitForLoop(); + } + + + + inline __aicore__ void SetArgs(__gm__ bfloat16_t *gm_out, const LcalWorkspaceInfo &workspace_info, + __gm__ int64_t *gm_dequant_scale, __gm__ int32_t *gm_dequant_offset, QuantGranularity dequant_granularity, + int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop,int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) + { + BaseDequantRunner::SetArgs(gm_out, workspace_info, gm_dequant_scale, gm_dequant_offset, dequant_granularity, + batch_size, m, n); + + core_num = get_block_num(); + core_idx = get_block_idx(); + + loop_per_EP = p_value * core_num / (EP * TP); + + out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_expert); + out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_EP); + sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_local_tokens_per_expert); + sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_global_tokens_per_local_expert); + + in_expert_comm_count_accum = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_in_expert_comm_count_accum); + + this->n_loop = n_loop; + this->m_loop = m_loop; + this->m0 = m0; + this->n0 = n0; + this->swizzl_direct = swizzle_direct; + this->swizzl_count = swizzle_count; + this->p_value = p_value; + this->rank_size = EP * TP; + this->rank = rank; + + + this->EP = EP; + this->TP = TP; + this->local_expert_nums = local_expert_nums; + + this->is_moe_averaged = is_moe_averaged; + this->is_alltoallvc = is_alltoallvc; + this->num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *>(num_local_tokens_per_expert); + this->num_global_tokens_per_local_expert = + reinterpret_cast<__gm__ int32_t *>(num_global_tokens_per_local_expert); + } private: int32_t core_num; int32_t core_idx; + int32_t m0; int32_t n0; int32_t m_loop; int32_t n_loop; int32_t core_loop; int32_t loop_num_per_comm; - int32_t swizzl_direct; int32_t swizzl_count; + int32_t p_value; int32_t rank_size; + int32_t loop_per_EP; int32_t rank; - - + int32_t EP; + int32_t TP; + int32_t local_expert_nums; + int32_t is_moe_averaged; + int32_t is_alltoallvc; + __gm__ int32_t *out_loop_per_expert; + __gm__ int32_t *out_loop_per_ep; + __gm__ int32_t *sum_num_local_tokens_per_expert; + __gm__ int32_t *sum_num_global_tokens_per_local_expert; + __gm__ int32_t *in_expert_comm_count_accum; + __gm__ int32_t* num_local_tokens_per_expert; + __gm__ int32_t* num_global_tokens_per_local_expert; int32_t sum_loop; }; @@ -789,6 +1019,7 @@ template 
class FusedPerTokenDequantRunner : public BaseDequantRunner { public: __aicore__ explicit FusedPerTokenDequantRunner() = default; + inline __aicore__ void SetArgs(__gm__ T *gm_buff, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, int32_t core_loop, int32_t swizzl_direct, int32_t swizzl_count, @@ -798,69 +1029,150 @@ public: this->gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; core_num = get_block_num(); core_idx = get_block_idx(); - this -> m = m; + this-> m = m; this -> n = n; this -> m0 = m0; this -> n0 = n0; this -> m_loop = m_loop; this -> n_loop = n_loop; this -> core_loop = core_loop; - this -> swizzl_direct = swizzl_direct; - this -> swizzl_count = swizzl_count; - this -> loop_num_per_comm = p_value * core_num; + this->swizzl_direct = swizzl_direct; + this->swizzl_count = swizzl_count; + + this->loop_num_per_comm = p_value * core_num; this -> p_value = p_value; this -> rank_size = rank_size; } + inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo &workspace_info, __gm__ float32_t *gm_dequant_scale_pertoken, int32_t batch_size, int32_t m, int32_t n, int32_t m0, int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) + int32_t core_loop,int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t* num_global_tokens_per_local_expert) { - this -> gm_buff = gm_buff; - this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; - this -> m = m; - this -> n = n; + this->gm_buff = gm_buff; + this->gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; + this->m = m; + this->n = n; + + core_num = get_block_num(); core_idx = get_block_idx(); - this -> n_loop = n_loop; - this -> m_loop = m_loop; - this -> m0 = m0; - this -> n0 = n0; - this -> swizzl_direct = swizzle_direct; - this -> swizzl_count = swizzle_count; - this -> p_value = p_value; - // this -> rank_size = EP * TP; - this -> rank = rank; - } + loop_per_EP = p_value * core_num / (EP * TP); + + out_loop_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_expert); + out_loop_per_ep = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_out_loop_per_EP); + sum_num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_local_tokens_per_expert); + sum_num_global_tokens_per_local_expert = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_sum_num_global_tokens_per_local_expert); + in_expert_comm_count_accum = reinterpret_cast<__gm__ int32_t *> (workspace_info.gm_in_expert_comm_count_accum); + + this->n_loop = n_loop; + this->m_loop = m_loop; + this->m0 = m0; + this->n0 = n0; + this->swizzl_direct = swizzle_direct; + this->swizzl_count = swizzle_count; + this->p_value = p_value; + this->rank_size = EP * TP; + this->rank = rank; -inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & workspace_info, - __gm__ float32_t *gm_dequant_scale_pertoken, - int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, int32_t k0,int32_t n0, int32_t m_loop, int32_t n_loop, - int32_t core_loop, int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value) + + this->EP = EP; + this->TP = TP; + this->local_expert_nums = local_expert_nums; + + this->is_moe_averaged = is_moe_averaged; + 
this->is_alltoallvc = is_alltoallvc; + + this->num_local_tokens_per_expert = reinterpret_cast<__gm__ int32_t *>(num_local_tokens_per_expert); + this->num_global_tokens_per_local_expert = + reinterpret_cast<__gm__ int32_t *>(num_global_tokens_per_local_expert); + } +inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo &workspace_info, + __gm__ float32_t *gm_dequant_scale_pertoken, + int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, int32_t k0, int32_t n0, int32_t m_loop, int32_t n_loop, + int32_t core_loop,int32_t rank, int32_t swizzle_direct, int32_t swizzle_count, int32_t p_value, int32_t EP, int32_t TP, + int32_t local_expert_nums, int32_t is_moe_averaged, int32_t is_alltoallvc, int32_t max_output_size, int32_t buffer_size, + __gm__ int32_t* global_tokens_per_expert_matrix) { - this -> gm_buff = gm_buff; - this -> gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; - this -> m = m; - this -> k = k; - this -> n = n; + this->gm_buff = gm_buff; + this->gm_dequant_scale_pertoken = gm_dequant_scale_pertoken; + this->m = m; + this->k = k; + this->n = n; + + core_num = get_block_num(); core_idx = get_block_idx(); - this -> n_loop = n_loop; - this -> m_loop = m_loop; - this -> m0 = m0; - this -> k0 = k0; - this -> n0 = n0; - this -> swizzl_direct = swizzle_direct; - this -> swizzl_count = swizzle_count; - this -> p_value = p_value; - // this -> rank_size = EP * TP; - this -> rank = rank; - this -> buffer_size = buffer_size; + this->n_loop = n_loop; + this->m_loop = m_loop; + this->m0 = m0; + this->k0 = k0; + this->n0 = n0; + this->swizzl_direct = swizzle_direct; + this->swizzl_count = swizzle_count; + this->p_value = p_value; + this->rank_size = EP * TP; + this->rank = rank; + this->buffer_size = buffer_size; + + + this->EP = EP; + this->TP = TP; + this->local_expert_nums = local_expert_nums; + + this->is_moe_averaged = is_moe_averaged; + this->is_alltoallvc = is_alltoallvc; + + //hidden + this->comm_n = p_value * n0; + this->global_tokens_per_expert_matrix = reinterpret_cast<__gm__ int32_t *>(global_tokens_per_expert_matrix); + this->expert_nums = EP * local_expert_nums; + this->maxOutputSize = max_output_size; + if(is_moe_averaged) { + sum_m_loop = DivCeil((m / expert_nums) * EP, m0) * local_expert_nums; + max_m = m; + } else { + if (maxOutputSize == -1) { + max_m = 0; + for(int32_t ep_idx = 0; ep_idx < EP; ep_idx ++) { + int32_t sum_m_ep = 0; + for(int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id ++) { + int32_t expert_id = local_expert_id + ep_idx * local_expert_nums; + for(int32_t i = 0; i < EP; i++) { + sum_m_ep += global_tokens_per_expert_matrix[i * expert_nums + expert_id]; + } + } + max_m = max(max_m, sum_m_ep); + } + } else { + max_m = maxOutputSize; + } + + + for(int32_t i = 0; i < local_expert_nums; i++){ + int32_t last_sum_m = (i == 0 ? 0 : sum_m[i - 1]); + for(int j = 0; j < EP; j++) { + sum_m[i] += global_tokens_per_expert_matrix[j * expert_nums + rank * local_expert_nums + i]; + //global_tokens_per_expert_matrix[j][rank * local_expert_nums + i] + } + if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize) { + sum_m[i] = maxOutputSize - last_sum_m; + } + sum_m_loop += DivCeil(sum_m[i], m0); + sum_m[i] += (i == 0 ? 0 : sum_m[i - 1]); + } + + } + sum_loop = 0; + //hidden end. 
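/*
  Host-side mirror of the sum_m bookkeeping in this SetArgs overload: for each of
  this rank's local experts, add the tokens every EP peer sends to it, clamp the
  cumulative row count at maxOutputSize, count m0-sized tile rows, then turn
  sum_m into prefix sums. Toy numbers; the matrix is laid out
  [src_rank][expert_id] flattened, as in the patch:

      #include <cstdio>

      static int DivCeil(int a, int b) { return (a + b - 1) / b; }

      int main() {
          const int EP = 2, local_expert_nums = 2, rank = 0, m0 = 16;
          const int expert_nums = EP * local_expert_nums;
          const int maxOutputSize = 40;
          int M[EP * expert_nums] = { 10, 20, 0, 0,    // tokens from src rank 0
                                      15,  3, 0, 0 };  // tokens from src rank 1
          int sum_m[2] = {0, 0}, sum_m_loop = 0;
          for (int i = 0; i < local_expert_nums; ++i) {
              int last_sum_m = (i == 0) ? 0 : sum_m[i - 1];
              for (int j = 0; j < EP; ++j)
                  sum_m[i] += M[j * expert_nums + rank * local_expert_nums + i];
              if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize)
                  sum_m[i] = maxOutputSize - last_sum_m;   // truncate the overflow
              sum_m_loop += DivCeil(sum_m[i], m0);         // m0-tiles for expert i
              sum_m[i] += last_sum_m;                      // keep prefix sums
          }
          std::printf("sum_m = {%d, %d}, sum_m_loop = %d\n", sum_m[0], sum_m[1], sum_m_loop);
          // prints sum_m = {25, 40}, sum_m_loop = 3: expert 1's 23 rows are clipped
          // to 15 so this rank never emits more than maxOutputSize rows in total
          return 0;
      }
*/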
} + inline __aicore__ void RunDequantAllReduce(int32_t cal_idx) { LoopPerTokenDequanter loop_dequanter(n0); @@ -877,8 +1189,8 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; TileLoopIter tit(m_actual, n_actual); - int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + - (loop_idx % loop_num_per_comm) * m0 * n0; + int64_t offset_this_tile = flag_idx * loop_num_per_comm * m0 * n0 + + (loop_idx % loop_num_per_comm) * m0 * n0; for (tit.Init(LoopPerTokenDequanter::max_len, n0); !tit.End(); tit.Next()) { int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; // byte offset this subcore processes in the current pass auto buff = gm_buff + offset; // address inside the communication buffer @@ -889,6 +1201,128 @@ inline __aicore__ void SetArgs(__gm__ T *gm_buff, const LcalWorkspaceInfo & work loop_dequanter.WaitForLoop(); } + inline __aicore__ void RunDequantReduceScatter(int32_t cal_idx) + { + LoopPerTokenDequanter loop_dequanter(n0); + loop_dequanter.SetForLoop(); + int32_t m_loop_per_rank = m_loop / rank_size; + //int32_t pipe_depth = is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT; + int32_t pipe_depth = MAX_BLOCK_COUNT; + int32_t flag_idx = cal_idx % pipe_depth; + int32_t comm_num = DivCeil(core_loop, loop_num_per_comm); + int32_t actual_loop_num = loop_num_per_comm; + if (cal_idx == comm_num - 1) { + actual_loop_num = core_loop - cal_idx * loop_num_per_comm; + } + + for (int32_t p = 0; p < p_value; p++) { + int loop_idx = cal_idx * p_value * core_num + p * core_num + core_idx; + if (loop_idx >= core_loop) + break; + + int32_t in_batch_idx = loop_idx % (m_loop * n_loop); + int64_t rank_idx = in_batch_idx % rank_size; + int32_t in_rank_idx = in_batch_idx / rank_size; + + int64_t m_idx, n_idx; + GetBlockIdx(in_rank_idx, m_loop_per_rank, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == (m_loop_per_rank - 1)) ? (m / rank_size - m_idx * m0) : m0; + int32_t n_actual = (n_idx == (n_loop - 1)) ? 
(n - n_idx * n0) : n0; + + TileLoopIter tit(m_actual, n_actual); + int64_t rank_offset_c = (loop_idx % rank_size) * (actual_loop_num / rank_size) * m0 * n0; + int64_t offset_this_tile = flag_idx * m0 * loop_num_per_comm * n0 + rank_offset_c + + + ((loop_idx % loop_num_per_comm) / rank_size) * m0 * n0; + for (tit.Init(LoopPerTokenDequanter::max_len, n0); !tit.End(); tit.Next()) { + int64_t offset = offset_this_tile + tit.m_offset_in_tile() * n0; // byte offset this subcore processes in the current pass + auto buff = gm_buff + offset; // address inside the communication buffer + auto scale = gm_dequant_scale_pertoken + m_idx * m0 + tit.m_offset_in_tile(); // note: m_offset_in_tile must be added + loop_dequanter.Loop(buff, scale, tit.m_this_loop, tit.n_this_loop, n0); + } + } + loop_dequanter.WaitForLoop(); + } + + inline __aicore__ void DequantPerTokenMatmulAllToAllHidden(int32_t cal_idx) { + cal_count = DivCeil(n, comm_n); + gm_a_pingpong_size = comm_n * max_m; + gm_a_pingpong_num = buffer_size * 1024 * 1024 / 2 / gm_a_pingpong_size; + if (gm_a_pingpong_num > 8) { + gm_a_pingpong_num = 8; + } + LoopPerTokenDequanter loop_dequanter(n0); + loop_dequanter.SetForLoop(); + int32_t n_len; + if(cal_idx == cal_count - 1) { + n_len = n - cal_idx * comm_n; + } else { + n_len = comm_n; + } + n_loop = DivCeil(n_len,n0); + int32_t sum_loop_num = sum_m_loop * n_loop; + //int32_t flag_id = cal_idx % MAX_BLOCK_COUNT; + int32_t flag_id = cal_idx % gm_a_pingpong_num; + + for(int32_t loop_idx = 0; loop_idx < sum_loop_num; loop_idx ++) { + if((loop_idx + sum_loop) % core_num != core_idx) { + continue; + } + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, sum_m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t sum_loop_before = 0; + int32_t local_expert_idx = -1; + int32_t m_in_expert; + for(int32_t i = 0; i < local_expert_nums; i++) { + if(is_moe_averaged) { + m_in_expert = m / local_expert_nums; + } else { + m_in_expert = sum_m[i] - (i == 0 ? 0 : sum_m[i - 1]); + } + sum_loop_before += DivCeil(m_in_expert, m0); + if(sum_loop_before > m_idx) { + local_expert_idx = i; + break; + } + } + int32_t m_loop_in_expert = DivCeil(m_in_expert, m0); + sum_loop_before -= m_loop_in_expert; + int32_t m_idx_in_expert = m_idx - sum_loop_before; + int32_t m_actual = ((m_idx_in_expert == m_loop_in_expert - 1) ? (m_in_expert - m_idx_in_expert * m0) : m0); + int32_t n_actual = ((n_idx == n_loop - 1) ? 
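/*
  DequantPerTokenMatmulAllToAllHidden sizes its GM staging ring from the
  communication granularity: one slot holds comm_n = p_value * n0 columns for up
  to max_m rows, and as many slots as fit in half the buffer are used, capped at
  8. A sketch of the sizing (the division by 2 matching 2-byte fp16/bf16 elements
  is an assumption about the unit):

      #include <cstdint>
      #include <cstdio>

      int main() {
          const int64_t buffer_size_mb = 64;        // workspace budget (toy value)
          const int64_t comm_n = 8 * 256;           // p_value * n0
          const int64_t max_m  = 4096;              // worst-case rows per round
          int64_t slot_elems = comm_n * max_m;      // gm_a_pingpong_size
          int64_t slots = buffer_size_mb * 1024 * 1024 / 2 / slot_elems;
          if (slots > 8) slots = 8;                 // gm_a_pingpong_num cap
          std::printf("%lld slots; round k writes slot k %% %lld\n",
                      (long long)slots, (long long)slots);
          return 0;
      }

  With these numbers four slots fit, so flag_id = cal_idx % 4 cycles the ring.
*/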
(n_len - n_idx * n0) : n0); + + int32_t sum_m_before = 0; + if(is_moe_averaged) { + sum_m_before = local_expert_idx * (m / local_expert_nums); + } else { + sum_m_before = sum_m[local_expert_idx] - m_in_expert; + } + + int64_t m_offset_this_tile = sum_m_before + m_idx_in_expert * m0; + + int64_t offset_this_tile = flag_id * gm_a_pingpong_size + + 1LL * (sum_m_before + m_idx_in_expert * m0) * n_len + 1LL * n_idx * n0; + // int64_t offset_this_tile = + // 1LL * (sum_m_before + m_idx_in_expert * m0) * n_len + 1LL * n_idx * n0; + + + TileLoopIter tit(m_actual, n_actual); + + for (tit.Init(LoopPerTokenDequanter::max_len, n0); !tit.End(); tit.Next()){ + int64_t buff_offset = offset_this_tile + tit.m_offset_in_tile() * n_len; // 子核当前需处理的字节偏移 + //int64_t buff_offset = offset_this_tile; + auto buff = gm_buff + buff_offset; + //auto buff = gm_buff; + auto scale = gm_dequant_scale_pertoken + m_offset_this_tile + tit.m_offset_in_tile(); + //scale = gm_dequant_scale; + loop_dequanter.Loop(buff, scale, tit.m_this_loop, tit.n_this_loop, n_len); + } + + } + sum_loop += sum_loop_num; + loop_dequanter.WaitForLoop(); + } + + private: int32_t core_num; int32_t core_idx; @@ -906,10 +1340,30 @@ private: int32_t rank_size; __gm__ T *gm_buff; __gm__ float32_t *gm_dequant_scale_pertoken; + + int32_t loop_per_EP; int32_t rank; + int32_t EP; + int32_t TP; + int32_t local_expert_nums; + int32_t is_moe_averaged; + int32_t is_alltoallvc; int32_t buffer_size; + + __gm__ int32_t *out_loop_per_expert; + __gm__ int32_t *out_loop_per_ep; + __gm__ int32_t *sum_num_local_tokens_per_expert; + __gm__ int32_t *sum_num_global_tokens_per_local_expert; + __gm__ int32_t *in_expert_comm_count_accum; + + + __gm__ int32_t* num_local_tokens_per_expert; + __gm__ int32_t* num_global_tokens_per_local_expert; + int32_t sum_loop; + + __gm__ int32_t* global_tokens_per_expert_matrix; int32_t max_m; int32_t sum_m[32] = {0}; int32_t sum_m_loop = 0; @@ -917,7 +1371,11 @@ private: int32_t comm_k; int64_t gm_a_pingpong_size; int64_t gm_a_pingpong_num; + int32_t expert_nums; int32_t cal_count; + int32_t maxOutputSize; + }; #endif + #endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_internal.cce b/comm/lcal/src/kernels/coc_internal.cce index 18761af7..548575ad 100644 --- a/comm/lcal/src/kernels/coc_internal.cce +++ b/comm/lcal/src/kernels/coc_internal.cce @@ -13,11 +13,11 @@ #include #include "kernel_operator.h" #include "coc_const_args.cce" - using namespace AscendC; -template + +template FORCE_INLINE_AICORE LocalTensor CreateLocalTensor(__ubuf__ T *addr) { LocalTensor tensor; @@ -95,8 +95,8 @@ inline __aicore__ void Vadd(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, template inline __aicore__ void Vadds(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, - uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, - uint8_t srcRepeatStride) + uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, + uint8_t srcRepeatStride) { LocalTensor srcTensor = CreateLocalTensor(src); LocalTensor dstTensor = CreateLocalTensor(dst); @@ -119,8 +119,8 @@ inline __aicore__ void Vmul(__ubuf__ T *dst, __ubuf__ T *src0, __ubuf__ T *src1, template inline __aicore__ void Vmuls(__ubuf__ T *dst, __ubuf__ T *src, const T &scalarValue, uint8_t repeat, - uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, - uint8_t srcRepeatStride) + uint16_t dstBlockStride, uint16_t srcBlockStride, uint8_t dstRepeatStride, + uint8_t srcRepeatStride) { LocalTensor srcTensor = 
CreateLocalTensor(src); LocalTensor dstTensor = CreateLocalTensor(dst); @@ -134,57 +134,68 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) (granularity < QuantGranularity::QUANT_GRANULARITY_MAX); } -#define COC_ARGS_FUN_IIO(T_INPUT1, T_INPUT2, T_OUTPUT) \ - __gm__ T_INPUT1 *gm_a, __gm__ T_INPUT2 *gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_gamma, \ - __gm__ T_OUTPUT *gm_out, __gm__ T_OUTPUT *gm_allgather_out, GM_ADDR gm_workspace, \ +#define COC_ARGS_FUN_IIO(T_INPUT1, T_INPUT2, T_OUTPUT) \ + __gm__ T_INPUT1 *gm_a, __gm__ T_INPUT2 *gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_gamma, \ + __gm__ T_OUTPUT *gm_out, __gm__ T_OUTPUT *gm_allgather_out, GM_ADDR gm_workspace, \ GM_ADDR gm_dequant_scale, GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, \ - GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, GM_ADDR para_gm + GM_ADDR gm_quant_offset, GM_ADDR coc_comm_args, GM_ADDR ffts_addr, \ + __gm__ int32_t* num_local_tokens_per_expert, __gm__ int32_t *num_global_tokens_per_local_expert, \ + __gm__ int32_t *global_tokens_per_expert_matrix, GM_ADDR para_gm #define COC_ARGS_FUN_IO(T_INPUT, T_OUTPUT) COC_ARGS_FUN_IIO(T_INPUT, T_INPUT, T_OUTPUT) #define COC_ARGS_FUN(T) COC_ARGS_FUN_IO(T, T) -#define COC_ARGS_CALL() \ +#define COC_ARGS_CALL() \ gm_a, gm_b, gm_bias, gm_gamma, gm_out, gm_allgather_out, gm_workspace, gm_dequant_scale, gm_dequant_offset, \ - gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, para_gm + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr,\ + num_local_tokens_per_expert, num_global_tokens_per_local_expert, \ + global_tokens_per_expert_matrix, para_gm #define COC_ARGS_CALL_INT8() \ reinterpret_cast(gm_a), reinterpret_cast(gm_b), reinterpret_cast(gm_bias), \ reinterpret_cast(gm_gamma), reinterpret_cast(gm_out), \ reinterpret_cast(gm_allgather_out), gm_workspace, gm_dequant_scale, gm_dequant_offset, \ - gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, para_gm + gm_quant_scale, gm_quant_offset, coc_comm_args, ffts_addr, \ + num_local_tokens_per_expert, num_global_tokens_per_local_expert,\ + global_tokens_per_expert_matrix, para_gm -#define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ - GM_ADDR gm_a, GM_ADDR gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_c, \ - __gm__ T_OUTPUT *gm_peer_mem, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ +#define PP_MATMUL_AIC_ARGS_FUN(T_INPUT, T_OUTPUT) \ + GM_ADDR gm_a, GM_ADDR gm_b, __gm__ T_OUTPUT *gm_bias, __gm__ T_OUTPUT *gm_c, \ + __gm__ T_OUTPUT *gm_peer_mem, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ GM_ADDR gm_dequant_offset, int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m0, \ - int32_t k0, int32_t n0, int32_t m_loop, int32_t k_loop, int32_t n_loop, int32_t core_loop, \ - int32_t swizzl_count, int32_t swizzl_direct, int32_t rank, int32_t rank_size, int32_t p_value, \ - int32_t withSerialMode, QuantGranularity quant_granularity, QuantGranularity dequant_granularity, \ - int32_t ag_dim, int32_t rs_dim, bool inner_dim_is_Ag, bool weight_nz, bool is_91093, \ - bool is_deterministic, int32_t buffer_size \ - -#define PP_MATMUL_AIC_ARGS_CALL() \ - reinterpret_cast(gm_a), reinterpret_cast(gm_b), gm_bias, gm_c, gm_peer_mem, \ - reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ - reinterpret_cast(gm_dequant_offset), batch_size, m, k, n, m0, k0, n0, m_loop, k_loop, \ + int32_t k0, int32_t n0, int32_t m_loop, int32_t k_loop, int32_t n_loop, int32_t core_loop, \ + int32_t swizzl_count, int32_t swizzl_direct, int32_t rank, 
int32_t rank_size, int32_t p_value, \ + int32_t withSerialMode, QuantGranularity quant_granularity, QuantGranularity dequant_granularity, \ + int32_t ag_dim, int32_t rs_dim, bool inner_dim_is_Ag, bool weight_nz, bool is_91093,\ + __gm__ int32_t *num_local_tokens_per_expert, __gm__ int32_t * num_global_tokens_per_local_expert, \ + __gm__ int32_t *global_tokens_per_expert_matrix, int32_t local_expert_nums, int32_t EP, int32_t TP, \ + int32_t maxOutputSize, int32_t is_moe, bool is_deterministic, int32_t buffer_size\ + +#define PP_MATMUL_AIC_ARGS_CALL() \ + reinterpret_cast(gm_a), reinterpret_cast(gm_b), gm_bias, gm_c, gm_peer_mem, \ + reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ + reinterpret_cast(gm_dequant_offset), batch_size, m, k, n, m0, k0, n0, m_loop, k_loop, \ n_loop, core_loop, swizzl_count, swizzl_direct, rank, rank_size, p_value, withSerialMode, quant_granularity, \ dequant_granularity, ag_dim, rs_dim, inner_dim_is_Ag, weight_nz, is_91093, \ - is_deterministic, buffer_size \ + num_local_tokens_per_expert, num_global_tokens_per_local_expert,\ + global_tokens_per_expert_matrix, local_expert_nums, EP, TP, maxOutputSize ,is_moe, is_deterministic, buffer_size\ -#define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ +#define PP_MATMUL_AIV_PADDING_ARGS_FUN() \ GM_ADDR gm_a, GM_ADDR gm_b, GM_ADDR gm_workspace, GM_ADDR gm_dequant_scale, \ - GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ - int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ - QuantGranularity dequant_granularity, int32_t dequant_group_size, QuantGranularity quant_granularity, \ - int32_t quant_group_size, int32_t weight_nz, bool is_deterministic - -#define PP_MATMUL_AIV_PADDING_ARGS_CALL() \ - reinterpret_cast(gm_a), reinterpret_cast(gm_b), \ - reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ + GM_ADDR gm_dequant_offset, GM_ADDR gm_quant_scale, GM_ADDR gm_quant_offset, \ + int32_t batch_size, int32_t m, int32_t k, int32_t n, bool trans_a, bool trans_b, bool is_int8, \ + QuantGranularity dequant_granularity, int32_t dequant_group_size, QuantGranularity quant_granularity, \ + int32_t quant_group_size, int32_t weight_nz, int32_t is_moe, int32_t is_moe_averaged, int32_t is_alltoallvc, \ + int32_t EP,int32_t TP, int32_t local_expert_nums, bool is_deterministic + +#define PP_MATMUL_AIV_PADDING_ARGS_CALL() \ + reinterpret_cast(gm_a), reinterpret_cast(gm_b), \ + reinterpret_cast(gm_workspace), reinterpret_cast(gm_dequant_scale), \ reinterpret_cast(gm_dequant_offset), reinterpret_cast(gm_quant_scale), \ - reinterpret_cast(gm_quant_offset), batch_size, m, k, n, trans_a, trans_b, is_int8, \ - dequant_granularity, dequant_group_size, quant_granularity, quant_group_size, weight_nz, is_deterministic + reinterpret_cast(gm_quant_offset), batch_size, m, k, n, trans_a, trans_b, is_int8, \ + dequant_granularity, dequant_group_size, quant_granularity, quant_group_size, weight_nz, is_moe, \ + is_moe_averaged, is_alltoallvc, EP,TP,local_expert_nums, is_deterministic #define PP_MATMUL_AIV_ADD_BIAS_ARGS_FUN() \ GM_ADDR gm_bias, GM_ADDR gm_out, int32_t batch_size, int32_t m, int32_t n, int32_t rank_size @@ -192,7 +203,7 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL() \ reinterpret_cast(gm_bias), reinterpret_cast(gm_out), batch_size, m, n, rank_size -#define PP_MATMUL_AIV_POST_ARGS_CALL() \ +#define PP_MATMUL_AIV_POST_ARGS_CALL() \ reinterpret_cast(gm_out), 
reinterpret_cast(gm_bias), \ reinterpret_cast(gm_gamma), reinterpret_cast(para_gm) @@ -204,13 +215,14 @@ inline __aicore__ bool IsQuant(const QuantGranularity &granularity) #define TEMPLATE_ARGS_CALL() ALIGN, IS_INT8, HAVE_BIAS, T inline __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t k, int32_t n, int32_t m_align, - int32_t k_align, int32_t n_align, int32_t &aligned_a, int32_t &aligned_b) + int32_t k_align, int32_t n_align, int32_t &aligned_a, int32_t &aligned_b) { if (!trans_a) { aligned_a = k != k_align; } else { aligned_a = (m != m_align && m != 1); } + if (!trans_b) { aligned_b = (n != n_align); } else { @@ -219,13 +231,14 @@ inline __aicore__ void AlignJudge(bool trans_a, bool trans_b, int32_t m, int32_t } inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t n_loop, int32_t swizzl_direction, - int32_t swizzl_count, int64_t &m_idx, int64_t &n_idx) + int32_t swizzl_count, int64_t &m_idx, int64_t &n_idx) { uint32_t in_batch_idx = loop_idx % (m_loop * n_loop); - if (swizzl_direction == 0) { // Zn + if (swizzl_direction == 0) { // Zn uint32_t tile_block_loop = (m_loop + swizzl_count - 1) / swizzl_count; uint32_t tile_block_idx = in_batch_idx / (swizzl_count * n_loop); uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * n_loop); + uint32_t n_row = swizzl_count; if (tile_block_idx == tile_block_loop - 1) { n_row = m_loop - swizzl_count * tile_block_idx; @@ -239,6 +252,7 @@ inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t n_l uint32_t tile_block_loop = (n_loop + swizzl_count - 1) / swizzl_count; uint32_t tile_block_idx = in_batch_idx / (swizzl_count * m_loop); uint32_t in_tile_block_idx = in_batch_idx % (swizzl_count * m_loop); + uint32_t n_col = swizzl_count; if (tile_block_idx == tile_block_loop - 1) { n_col = n_loop - swizzl_count * tile_block_idx; @@ -253,17 +267,17 @@ inline __aicore__ void GetBlockIdx(int32_t loop_idx, int32_t m_loop, int32_t n_l template FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, - uint32_t gmGap, uint32_t ubufGap = 0) + uint32_t gmGap, uint32_t ubufGap = 0) { if constexpr (sizeof(T) == 8) { CopyGmToUbufAlign(reinterpret_cast<__ubuf__ int32_t *>(dst), reinterpret_cast<__gm__ int32_t *>(src), nBurst * 2, lenBurst * 2, gmGap, ubufGap); return; } - DataCopyParams dataCopyParams(nBurst, // blockCount - (Block32B::Count(lenBurst)), // blockLen - (Block32B::Count(gmGap)), // srcStride - (ubufGap) // dstStride + DataCopyParams dataCopyParams(nBurst, // blockCount + (Block32B::Count(lenBurst)), // blockLen + (Block32B::Count(gmGap)), // srcStride + (ubufGap) // dstStride ); DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), gmGap * sizeof(T), ubufGap, 0); LocalTensor ubTensor; @@ -273,6 +287,7 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 ubTensor.SetAddr(ubAddr); GlobalTensor gmTensor; gmTensor.SetGlobalBuffer(src); + if (Block32B::IsAligned(lenBurst) && Block32B::IsAligned(gmGap)) { DataCopy(ubTensor, gmTensor, dataCopyParams); } else { @@ -283,12 +298,12 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlign(__ubuf__ T *dst, __gm__ T *src, uint1 template FORCE_INLINE_AICORE void CopyUbufToGmAlign(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, - uint32_t gmGap, uint32_t ubufGap = 0) + uint32_t gmGap, uint32_t ubufGap = 0) { - DataCopyParams dataCopyParams(nBurst, // blockCount - static_cast(Block32B::Count(lenBurst)), // blockLen - static_cast(ubufGap), // 
srcStride - static_cast(Block32B::Count(gmGap)) // dstStride + DataCopyParams dataCopyParams(nBurst, // blockCount + static_cast(Block32B::Count(lenBurst)), // blockLen + static_cast(ubufGap), // srcStride + static_cast(Block32B::Count(gmGap)) // dstStride ); DataCopyExtParams dataCopyAlignParams(nBurst, lenBurst * sizeof(T), ubufGap, gmGap * sizeof(T), 0); LocalTensor ubTensor; @@ -308,13 +323,13 @@ FORCE_INLINE_AICORE void CopyUbufToGmAlign(__gm__ T *dst, __ubuf__ T *src, uint1 template FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, - uint16_t srcStride, uint16_t dstStride) + uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyParams(nBurst, // blockCount - lenBurst, // blockLen - srcStride, // srcStride - dstStride, // dstStride - 0); + DataCopyExtParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride, // dstStride + 0); LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); @@ -328,13 +343,13 @@ FORCE_INLINE_AICORE void CopyGmToUbufAlignB16(__ubuf__ T *dst, __gm__ T *src, ui template FORCE_INLINE_AICORE void CopyUbufToGmAlignB16(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint32_t lenBurst, - uint16_t srcStride, uint16_t dstStride) + uint16_t srcStride, uint16_t dstStride) { - DataCopyExtParams dataCopyParams(nBurst, // blockCount - lenBurst, // blockLen - srcStride, // srcStride - dstStride, // dstStride - 0); + DataCopyExtParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride, // dstStride + 0); LocalTensor ubTensor; TBuffAddr ubAddr; ubAddr.logicPos = static_cast(TPosition::VECIN); @@ -347,12 +362,12 @@ FORCE_INLINE_AICORE void CopyUbufToGmAlignB16(__gm__ T *dst, __ubuf__ T *src, ui template FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t nBurst, uint32_t lenBurst, - uint16_t srcStride, uint16_t dstStride) + uint16_t srcStride, uint16_t dstStride) { - DataCopyParams dataCopyParams(nBurst, // blockCount - lenBurst, // blockLen - srcStride, // srcStride - dstStride // dstStride + DataCopyParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride // dstStride ); LocalTensor ubTensor; TBuffAddr ubAddr; @@ -366,12 +381,12 @@ FORCE_INLINE_AICORE void CopyGmToUbuf(__ubuf__ T *dst, __gm__ T *src, uint16_t n template FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, uint16_t lenBurst, - uint16_t srcStride, uint16_t dstStride) + uint16_t srcStride, uint16_t dstStride) { - DataCopyParams dataCopyParams(nBurst, // blockCount - lenBurst, // blockLen - srcStride, // srcStride - dstStride // dstStride + DataCopyParams dataCopyParams(nBurst, // blockCount + lenBurst, // blockLen + srcStride, // srcStride + dstStride // dstStride ); LocalTensor ubTensor; TBuffAddr ubAddr; @@ -384,8 +399,8 @@ FORCE_INLINE_AICORE void CopyUbufToGm(__gm__ T *dst, __ubuf__ T *src, uint16_t n } template -FORCE_INLINE_AICORE void CopyUbufToGmUnknown(bool ALIGN, __gm__ T *dst, __ubuf__ T*src, uint16_t nBurst, - uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) +FORCE_INLINE_AICORE void CopyUbufToGmUnknown(bool ALIGN, __gm__ T *dst, __ubuf__ T *src, uint16_t nBurst, + uint32_t lenBurst, uint16_t srcStride, uint16_t dstStride) { if (ALIGN) { CopyUbufToGm(dst, src, nBurst, lenBurst / 32, srcStride, dstStride / 32); @@ -396,7 +411,7 @@ FORCE_INLINE_AICORE void 
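/*
  The copy wrappers express every stride in 32-byte hardware blocks through
  Block32B. Only the Count/IsAligned/AlignUp usages are visible in this patch, so
  the following is a plausible shape for the helper, not the actual header:

      #include <cstdint>

      template <typename T>
      struct Block32B {
          static constexpr int32_t size = 32 / sizeof(T);   // elements per 32B block
          static constexpr int32_t Count(int32_t elems) { return elems / size; }
          static constexpr bool IsAligned(int32_t elems) { return elems % size == 0; }
          static constexpr int32_t AlignUp(int32_t elems) {
              return (elems + size - 1) / size * size;
          }
      };

      static_assert(Block32B<uint16_t>::size == 16, "half: 16 elements per block");
      static_assert(Block32B<float>::AlignUp(10) == 16, "float: blocks of 8");

  This is why CopyGmToUbufAlign takes the fast DataCopy path only when both
  lenBurst and gmGap pass IsAligned, and otherwise falls back to the
  byte-granular DataCopyPad path built from DataCopyExtParams.
*/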
CopyUbufToGmUnknown(bool ALIGN, __gm__ T *dst, __ubuf__ template FORCE_INLINE_AICORE void VectorDup(__ubuf__ T *dst, const T &src, uint8_t repeat, uint16_t dstBlockStride, - uint8_t dstRepeatStride) + uint8_t dstRepeatStride) { LocalTensor ubTensor = CreateLocalTensor(dst); Duplicate(ubTensor, src, -1, repeat, dstBlockStride, dstRepeatStride); @@ -428,9 +443,9 @@ public: } } - int rank; // attr rank_id, global rank + int rank; // attr rank_id, global rank int localRank; - int rankSize; // global rank size + int rankSize; // global rank size int localRankSize; uint32_t extraFlag; bool RDMA; @@ -439,9 +454,29 @@ public: bool DETERMINISTIC; bool QUANT_FP16; bool QUANT_FP32; - __gm__ T *buff[LCAL_MAX_RANK_SIZE]; // 共享内存地址列表 + __gm__ T *buff[LCAL_MAX_RANK_SIZE]; // 共享内存地址列表 //int64_t sendCountMatrix[LCAL_MAX_RANK_SIZE * LCAL_MAX_RANK_SIZE]; }; +FORCE_INLINE_AICORE void CommMatrixTrunc(__gm__ int32_t* global_tokens_per_expert_matrix, __gm__ int32_t* workspace, int32_t EP, int32_t local_expert_nums, int32_t maxOutputSize) +{ + int32_t expert_nums = local_expert_nums * EP; + for(int32_t i = 0; i < EP; i++) { + int32_t sum_tokens = 0; + for(int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { + int32_t expert_id = i * local_expert_nums + local_expert_id; + for(int32_t j = 0; j < EP; j++) { + if (sum_tokens + global_tokens_per_expert_matrix[j * expert_nums + expert_id] + >= maxOutputSize) { + workspace[j * expert_nums + expert_id] = maxOutputSize - sum_tokens; + sum_tokens = maxOutputSize; + } else { + workspace[j * expert_nums + expert_id] = global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + sum_tokens += global_tokens_per_expert_matrix[j * expert_nums + expert_id]; + } + } + } + } +} -#endif // LCAL_COC_INTERNAL_H \ No newline at end of file +#endif // LCAL_COC_INTERNAL_H diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce index 798ef54b..5366e03c 100644 --- a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce +++ b/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce @@ -28,12 +28,13 @@ extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aic(C // ReduceScatter in LcalMatmulReduceScatter #define COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF(type) \ extern "C" __global__ __aicore__ void LcalMatmulReduceScatter_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ - CocMatmulReduceScatterAiv(COC_ARGS_CALL()); \ + CocMatmulReduceScatterAiv(COC_ARGS_CALL()); \ } #endif + #if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B support bf16 #define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) COC_TYPE_FUNC(COC_MATMUL_REDUCE_SCATTER_FUNC_AUTO_DEF); -#endif +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/comm/lcal/src/kernels/coc_postprocessor.cce index faccedf4..9609cc18 100644 --- a/comm/lcal/src/kernels/coc_postprocessor.cce +++ b/comm/lcal/src/kernels/coc_postprocessor.cce @@ -18,8 +18,8 @@ #include "tiling_args.h" using namespace AscendC; -constexpr int32_t BUFFER_NUM = 1; -constexpr int32_t NUM_PER_REP_FP32 = 64; +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue +constexpr int32_t NUM_PER_REP_FP32 = 64; // ONE_REPEAT_BYTE_SIZE / sizeof(float); constexpr int32_t NUM_PER_BLK_FP32 = 8; constexpr float MINUS_HALF = -0.5; constexpr float ZERO = 0; @@ -30,7 +30,7 @@ class RMSNormprocessor { public: __aicore__ explicit RMSNormprocessor() = default; FORCE_INLINE_AICORE void SetArgs(__gm__ uint8_t *gm_in, __gm__ 
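/*
  CommMatrixTrunc in host form: for each receiving EP rank, walk its local
  experts and every source rank, copying token counts into workspace while
  clamping the receiver's running total at maxOutputSize; once the cap is hit,
  later entries are truncated to the remainder and then to zero. A toy run:

      #include <cstdio>

      int main() {
          const int EP = 2, local_expert_nums = 1;
          const int expert_nums = EP * local_expert_nums;
          const int maxOutputSize = 12;
          int M[EP * expert_nums] = { 8, 3,     // src rank 0 -> experts 0, 1
                                      7, 5 };   // src rank 1 -> experts 0, 1
          int W[EP * expert_nums] = {0, 0};
          for (int i = 0; i < EP; ++i) {                     // receiving rank
              int sum_tokens = 0;
              for (int e = 0; e < local_expert_nums; ++e) {
                  int expert_id = i * local_expert_nums + e;
                  for (int j = 0; j < EP; ++j) {             // source rank
                      int v = M[j * expert_nums + expert_id];
                      if (sum_tokens + v >= maxOutputSize) {
                          W[j * expert_nums + expert_id] = maxOutputSize - sum_tokens;
                          sum_tokens = maxOutputSize;
                      } else {
                          W[j * expert_nums + expert_id] = v;
                          sum_tokens += v;
                      }
                  }
              }
          }
          std::printf("W = {%d, %d, %d, %d}\n", W[0], W[1], W[2], W[3]);
          // rank 0 would receive 8 + 7 = 15 > 12 tokens, so the entry from src
          // rank 1 is clipped to 4: W = {8, 3, 4, 5}
          return 0;
      }
*/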
uint8_t *gm_out, __gm__ uint8_t *gm_gamma, - uint32_t m, uint32_t n) + uint32_t m, uint32_t n) { this->rmsnorm_in = reinterpret_cast<__gm__ T_in *>(gm_out); this->rmsnorm_gamma = reinterpret_cast<__gm__ T_out *>(gm_gamma); @@ -40,6 +40,7 @@ public: this->core_used = core_used; } + // 暂时只支持float16 struct UBufConfig { int64_t global_subblock_idx; int64_t total_subblock; @@ -49,8 +50,8 @@ public: __ubuf__ float *sqx0; __ubuf__ float *sum_tmp0; __ubuf__ float *sum0; - __ubuf__ float *fp32_1; __ubuf__ half *fp16_1; + __ubuf__ float *fp32_1; __ubuf__ float *sqx1; __ubuf__ float *sum_tmp1; __ubuf__ float *sum1; @@ -107,29 +108,35 @@ public: SetFlag(event_id); WaitFlag(event_id); + // fp16 -> fp32 SetVectorMask(0x0, n); Vconv(((__ubuf__ float *)fp32), ((__ubuf__ half *)fp16), 1, 1, 1, 8, 4); PipeBarrier(); + // x^2 Vmul(((__ubuf__ float *)sqx), ((__ubuf__ float *)fp32), ((__ubuf__ float *)fp32), 1, 1, 1, 1, 8, 8, 8); PipeBarrier(); - float average_val = 1.f /n; + + // x^2 / n + float average_val = 1.f / n; Vmuls(((__ubuf__ float *)sqx), ((__ubuf__ float *)sqx), average_val, 1, 1, 1, 8, 8); PipeBarrier(); + // sum(x^2 / n) SetVectorMask(0x0, 64); VectorDup(((__ubuf__ float *)sum_tmp), 0.f, 1, 1, 8); PipeBarrier(); SetVectorMask(0x0, n); Vadd(((__ubuf__ float *)sum_tmp), ((__ubuf__ float *)sqx), ((__ubuf__ float *)sum_tmp), 1, 1, 1, 1, 0, 8, - 0); + 0); PipeBarrier(); SetVectorMask(0x0, 64); vcadd(((__ubuf__ float *)sum), ((__ubuf__ float *)sum_tmp), 1, 0, 1, 0, 0); PipeBarrier(); + // x * 1 / sqrt(sum(x^2 / n) + eps) SetVectorMask(0x0, n); SetFlag(event_id); WaitFlag(event_id); @@ -140,11 +147,13 @@ public: Vmuls(((__ubuf__ float *)fp32), ((__ubuf__ float *)fp32), mul_val, 1, 1, 1, 8, 8); PipeBarrier(); + // fp32 -> fp16 Vconv(((__ubuf__ half *)fp16), ((__ubuf__ float *)fp32), 1, 1, 1, 4, 8); PipeBarrier(); + // x * 1 / sqrt(sum(x^2 / n) + eps) * g Vmul(((__ubuf__ half *)fp16), ((__ubuf__ half *)fp16), ((__ubuf__ half *)ubufConfig.gamma), 1, 1, 1, 1, 8, - 8, 8); + 8, 8); PipeBarrier(); SetFlag(event_id); WaitFlag(event_id); @@ -185,6 +194,7 @@ public: FORCE_INLINE_AICORE void Run() { + // mode, flag_id FFTSCrossCoreSync(0, 0); WaitEvent(0); if (this->with_rms_norm) { diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/comm/lcal/src/kernels/coc_ppmatmul.cce index a3963de5..ac9521d2 100644 --- a/comm/lcal/src/kernels/coc_ppmatmul.cce +++ b/comm/lcal/src/kernels/coc_ppmatmul.cce @@ -23,11 +23,11 @@ struct GetAccumType { #ifdef __DAV_C220_CUBE__ -constexpr int32_t L0AB_PINGPONG_BUFFER_SIZE = 32768; -constexpr int32_t CUBE_MATRIX_SIZE_B16 = 256; -constexpr int32_t CUBE_MATRIX_SIZE_B8 = 16 * 32; +constexpr int32_t L0AB_PINGPONG_BUFFER_SIZE = 32768; // 32 KB +constexpr int32_t CUBE_MATRIX_SIZE_B16 = 256; // 16 * 16 +constexpr int32_t CUBE_MATRIX_SIZE_B8 = 16 * 32; // 16 * 32 constexpr int64_t ND2NZ_STRIDE_LIMIT = 65536; -constexpr int32_t SCALE_L1_SIZE = 256 * 8; +constexpr int32_t SCALE_L1_SIZE = 256 * 8; // 2 KB template inline __aicore__ void CopyCubfToBt(uint64_t dst, __cbuf__ T *src, uint16_t convControl, uint16_t nBurst, uint16_t lenBurst, uint16_t sourceGap, uint16_t dstGap) @@ -35,8 +35,8 @@ inline __aicore__ void CopyCubfToBt(uint64_t dst, __cbuf__ T *src, uint16_t conv DataCopyParams intriParams(nBurst, lenBurst, sourceGap, dstGap); uint32_t src_buffer_offset = reinterpret_cast(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t src_logicpos = static_cast(TPosition::C1); - uint8_t dst_logicpos = static_cast(TPosition::C2); + uint8_t src_logicpos = static_cast(TPosition::C1); // 
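/*
  The vector pipeline above is RMSNorm: convert the fp16 row to fp32, form
  sum(x^2 / n), take 1/sqrt(. + eps), scale the row, convert back and apply
  gamma. The scalar reference (fp32 throughout for clarity; eps here is a toy
  value, the kernel takes it from its arguments):

      #include <cmath>
      #include <cstdio>

      void RmsNorm(const float* x, const float* g, float* y, int n, float eps) {
          float mean_sq = 0.f;
          for (int i = 0; i < n; ++i) mean_sq += x[i] * x[i] / n;  // sum(x^2 / n)
          float inv = 1.f / std::sqrt(mean_sq + eps);
          for (int i = 0; i < n; ++i) y[i] = x[i] * inv * g[i];    // x * inv * gamma
      }

      int main() {
          float x[4] = {1.f, -2.f, 3.f, -4.f}, g[4] = {1.f, 1.f, 1.f, 1.f}, y[4];
          RmsNorm(x, g, y, 4, 1e-6f);
          std::printf("%.4f %.4f %.4f %.4f\n", y[0], y[1], y[2], y[3]);
          return 0;
      }

  Here mean(x^2) = 7.5, so each output is x[i] / sqrt(7.5); the Vmul / Vmuls /
  vcadd calls above compute exactly these stages, 64 fp32 lanes at a time.
*/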
L1 + uint8_t dst_logicpos = static_cast(TPosition::C2); // Bias LocalTensor srcTensor; LocalTensor dstTensor; srcTensor = CreateLocalTensor(src_buffer_offset, src_logicpos); @@ -51,12 +51,13 @@ inline __aicore__ void CopyGmToCbuf(__cbuf__ T *dst, __gm__ T *src, uint8_t sid, GlobalTensor srcTensor; srcTensor.SetGlobalBuffer(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t logicpos = static_cast(TPosition::C1); + uint8_t logicpos = static_cast(TPosition::C1); // L1 LocalTensor dstTensor; dstTensor = CreateLocalTensor(dst_buffer_offset, logicpos); DataCopy(dstTensor, srcTensor, intriParams); } + template inline __aicore__ void SetFpc(__fbuf__ T *src) { @@ -66,6 +67,7 @@ inline __aicore__ void SetFpc(__fbuf__ T *src) SetFixPipeConfig(tensor); } + template inline __aicore__ void LoadCbufToCaTranspose(__ca__ T *dst, __cbuf__ T *src, uint16_t indexID, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, bool addrmode, uint16_t dstFracStride) { @@ -79,8 +81,8 @@ inline __aicore__ void LoadCbufToCaTranspose(__ca__ T *dst, __cbuf__ T *src, uin ); uint32_t src_buffer_offset = reinterpret_cast(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t src_logicpos = static_cast(TPosition::C1); - uint8_t dst_logicpos = static_cast(TPosition::A2); + uint8_t src_logicpos = static_cast(TPosition::C1); // L1 + uint8_t dst_logicpos = static_cast(TPosition::A2); // L0A LocalTensor srcTensor; LocalTensor dstTensor; srcTensor = CreateLocalTensor(src_buffer_offset, src_logicpos); @@ -101,8 +103,8 @@ inline __aicore__ void LoadCbufToCbTranspose(__cb__ T *dst, __cbuf__ T *src, uin ); uint32_t src_buffer_offset = reinterpret_cast(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t src_logicpos = static_cast(TPosition::C1); - uint8_t dst_logicpos = static_cast(TPosition::B2); + uint8_t src_logicpos = static_cast(TPosition::C1); // L1 + uint8_t dst_logicpos = static_cast(TPosition::B2); // L0A LocalTensor srcTensor; LocalTensor dstTensor; srcTensor = CreateLocalTensor(src_buffer_offset, src_logicpos); @@ -124,8 +126,8 @@ inline __aicore__ void LoadCbufToCa(__ca__ T *dst, __cbuf__ T *src, uint16_t bas ); uint32_t src_buffer_offset = reinterpret_cast(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t src_logicpos = static_cast(TPosition::C1); - uint8_t dst_logicpos = static_cast(TPosition::A2); + uint8_t src_logicpos = static_cast(TPosition::C1); // L1 + uint8_t dst_logicpos = static_cast(TPosition::A2); // L0A LocalTensor srcTensor; LocalTensor dstTensor; srcTensor = CreateLocalTensor(src_buffer_offset, src_logicpos); @@ -133,6 +135,7 @@ inline __aicore__ void LoadCbufToCa(__ca__ T *dst, __cbuf__ T *src, uint16_t bas LoadData(dstTensor, srcTensor, params); } + template inline __aicore__ void LoadCbufToCb(__cb__ T *dst, __cbuf__ T *src, uint16_t baseIdx, uint8_t repeat, uint16_t srcStride, uint16_t dstStride, uint8_t sid, bool transpose, uint8_t addr_cal_mode) { @@ -147,8 +150,8 @@ inline __aicore__ void LoadCbufToCb(__cb__ T *dst, __cbuf__ T *src, uint16_t bas ); uint32_t src_buffer_offset = reinterpret_cast(src); uint32_t dst_buffer_offset = reinterpret_cast(dst); - uint8_t src_logicpos = static_cast(TPosition::C1); - uint8_t dst_logicpos = static_cast(TPosition::B2); + uint8_t src_logicpos = static_cast(TPosition::C1); // L1 + uint8_t dst_logicpos = static_cast(TPosition::B2); // L0B LocalTensor srcTensor; LocalTensor dstTensor; srcTensor = CreateLocalTensor(src_buffer_offset, src_logicpos); @@ -162,7 +165,7 @@ struct IntrinsicCopyGmToL1Nd2Nz { 
__cbuf__ T *dst, __gm__ T *src, uint8_t sid, uint16_t ndNum, uint16_t nValue, uint16_t dValue, uint16_t srcNdMatrixStride, uint16_t srcDValue, uint16_t dstNzC0Stride, - uint16_t dstNzNStride, uint16_t dstNzMatrixStride){ + uint16_t dstNzNStride, uint16_t dstNzMatrixStride) { Nd2NzParams nd2nzParams( ndNum, nValue, dValue, srcNdMatrixStride, srcDValue, dstNzC0Stride, @@ -178,40 +181,42 @@ struct IntrinsicCopyGmToL1Nd2Nz { } }; + + template struct CopyGmToL1Nd2zN { static inline __aicore__ void move( - __cbuf__ T *dst, __gm__ T *src, - uint16_t nValue, uint16_t dValue, uint32_t srcDValue, uint16_t dstNzC0Stride) { + __cbuf__ T *dst, __gm__ T *src, + uint16_t nValue, uint16_t dValue, uint32_t srcDValue, uint16_t dstNzC0Stride) { constexpr int BLOCK_LEN = 32 / sizeof(T); if (srcDValue < ND2NZ_STRIDE_LIMIT) { IntrinsicCopyGmToL1Nd2Nz::move( dst, src, - 0, - 1, - nValue, - dValue, - 0, - srcDValue, - dstNzC0Stride, - 1, - 0 + 0, // sid + 1, // ndNum + nValue, // nValue + dValue, // dValue + 0, // srcNdMatrixStride, unused + srcDValue, // srcDValue + dstNzC0Stride, // dstNzC0Stride + 1, // dstNzNStride, + 0 // dstNzMatrixStride, unused ); } else { for (int i = 0; i < nValue; i++) { IntrinsicCopyGmToL1Nd2Nz::move( dst + i * BLOCK_LEN, src + i * srcDValue, - 0, - 1, - 1, - dValue, - 0, - 0, - dstNzC0Stride, - 0, - 0 + 0, // sid + 1, // ndNum + 1, // nValue + dValue, // dValue + 0, // srcNdMatrixStride, unused + 0, // srcDValue, unused + dstNzC0Stride, // dstNzC0Stride + 0, // dstNzNStride, unused + 0 // dstNzMatrixStride, unused ); } } @@ -232,6 +237,7 @@ public: this->gm_dequant_scale = reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale); has_offset = gm_dequant_offset != nullptr; + this->batch_size = batch_size; this->m = m; this->k = k; @@ -254,22 +260,37 @@ public: bool has_a_align = IsQuant(quant_granularity) || aligned_a; bool has_b_align = IsQuant(dequant_granularity) && !IS_INT8 || aligned_b; if (weight_nz) { + //k_align16 = Block32B::AlignUp(k); k_align16 = (k + 16 - 1) / 16 * 16; n_align16 = Block32B::AlignUp(n); - aligned_b = 0; - has_b_align = false; + aligned_b = 0; // dont' do padding for nz weight + has_b_align = false; } bool has_accum = IsQuant(dequant_granularity) && IS_INT8 && std::is_same::value; bool has_format_dequant_offset = (dequant_granularity == QuantGranularity::PER_TENSOR) && IS_INT8 && has_offset; + // if allgather, workspace *= rank size int32_t accum_rank_size = 1; - + if (RUN_TYPE == PPMATMUL_RUN_ALL_GATHER_MATMUL) { + accum_rank_size = rank_size; + } + int32_t is_moe_averaged = 0; + int32_t is_alltoallvc = 0; + + if (num_local_tokens_per_expert == nullptr && num_global_tokens_per_local_expert == nullptr && + global_tokens_per_expert_matrix == nullptr){ + is_moe_averaged = 1; + } else if(global_tokens_per_expert_matrix != nullptr) { + is_alltoallvc = 1; + } else { + is_alltoallvc = 0; + } bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR); bool hasFormatDequantScale = (dequant_granularity == QuantGranularity::PER_CHANNEL); workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, - TA, TB, sizeof(MmadDtype), has_a_align, has_b_align, accum_rank_size, has_accum, 0, has_dequant_param, - hasFormatDequantScale, is_deterministic); - + TA, TB, sizeof(MmadDtype), has_a_align, has_b_align, accum_rank_size, has_accum, 0, has_dequant_param, + hasFormatDequantScale,is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, maxOutputSize); + gm_a_src 
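/*
  What the Nd2zN move produces, in host terms: a row-major nValue x dValue block
  is cut into column panels of BLOCK_LEN = 32/sizeof(T) lanes; each panel is
  stored row-major with a pitch of dstNzC0Stride rows, panels laid back-to-back.
  A host reindexing sketch under that assumption about the pitch semantics:

      #include <cstdint>
      #include <vector>

      template <typename T>
      std::vector<T> Nd2Nz(const std::vector<T>& nd, int nValue, int dValue,
                           int dstNzC0Stride) {
          const int BLOCK_LEN = 32 / static_cast<int>(sizeof(T));
          const int panels = (dValue + BLOCK_LEN - 1) / BLOCK_LEN;
          std::vector<T> nz(static_cast<size_t>(panels) * dstNzC0Stride * BLOCK_LEN, T{});
          for (int r = 0; r < nValue; ++r)
              for (int c = 0; c < dValue; ++c) {
                  int panel = c / BLOCK_LEN, lane = c % BLOCK_LEN;
                  nz[(static_cast<size_t>(panel) * dstNzC0Stride + r) * BLOCK_LEN + lane] =
                      nd[static_cast<size_t>(r) * dValue + c];
              }
          return nz;
      }

  The srcDValue < ND2NZ_STRIDE_LIMIT test exists because the descriptor carries
  the source row pitch as a uint16_t (65536 = 2^16); wider sources take the
  fallback branch above, issuing one nd2nz move per row.
*/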
        gm_b_src = reinterpret_cast<__gm__ MmadDtype *>(has_b_align ? workspace_info.gm_b_align : gm_b);
        gm_accum = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_accum);
@@ -287,7 +308,7 @@ public:
        int32_t b_l1_size = n0 * k0 * sizeof(MmadDtype);
        int32_t b_l1_size_round = DivCeil(b_l1_size, 512) * 512;
        l1_base_a = reinterpret_cast<__cbuf__ MmadDtype *>((uintptr_t)(IS_INT8 ? SCALE_L1_SIZE : 0));
-        l1_base_b = reinterpret_cast<__cbuf__ MmadDtype *>(a_l1_size_round * (IS_INT8 ? 2 : 1) + (uintptr_t)l1_base_a);
+        l1_base_b = reinterpret_cast<__cbuf__ MmadDtype *>(a_l1_size_round * (IS_INT8 ? 2 : 1) + (uintptr_t) l1_base_a);
        core_num = get_block_num();
        core_idx = get_block_idx();
@@ -307,23 +328,25 @@ public:
        loop_num_per_comm = p_value * core_num;
        this->buffer_size = buffer_size;
+        // 2D-TP: determine this rank's AG index and RS index
        this->ag_dim = ag_dim;
        this->rs_dim = rs_dim;
        this->inner_dim_is_Ag = inner_dim_is_Ag;
        if (inner_dim_is_Ag) {
            this->ag_rank_idx = rank % ag_dim;
            this->rs_rank_idx = rank / ag_dim;
-        } else {
+        } else {
            this->ag_rank_idx = rank / rs_dim;
            this->rs_rank_idx = rank % rs_dim;
-        }
+        }
    }

    inline __aicore__ void CalLoop(int64_t batch_idx, int64_t m_idx, int64_t n_idx, int32_t m_actual, int32_t n_actual,
-        __gm__ MmadDtype *gm_a_src_tmp) {
+        __gm__ MmadDtype *gm_a_src_tmp) {
        int64_t offset_a, offset_b, offset_a_next, offset_b_next;
        int32_t m_round, n_round;
        if (IS_INT8) {
+            // instruction restrictions
            if (TA) {
                m_round = DivCeil(m_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32;
            } else {
@@ -362,7 +385,7 @@ public:
                    offset_b = n_idx * n0 * block_size;
                } else {
                    offset_b = n_idx * n0 * k;
-                }
+                }
            }
        } else {
            if (aligned_b == 1) {
@@ -372,13 +395,13 @@ public:
                    offset_b = n_idx * n0 * k_align16;
                } else {
                    offset_b = n_idx * n0;
-                }
+                }
            }
        }
        int64_t dequant_param_offset = n_idx * n0;

        int32_t k_actual = (k_loop == 1) ? k : k0;
-        int32_t k_round = DivCeil(k_actual, block_size) * block_size;
+        int32_t k_round = DivCeil(k_actual, block_size) * block_size; // block_size: 32 for int8, 16 for fp16
        auto l1_buf_a = ping_flag ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN;
        auto l1_buf_b = ping_flag ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN;
@@ -389,42 +412,43 @@ public:
        if (IS_INT8 && has_offset) {
            PipeBarrier();
            IntrinsicCopyGmToL1Nd2Nz::move(
-                ((__cbuf__ int32_t *) bias_l1),
+                ((__cbuf__ int32_t *)bias_l1),
                ((__gm__ int32_t *)gm_format_dequant_offset) + dequant_param_offset,
-                0,
-                1,
-                1,
-                n_actual,
-                0,
-                n,
-                1,
-                1,
-                0
+                0,        // sid
+                1,        // ndNum
+                1,        // nValue
+                n_actual, // dValue
+                0,        // srcNdMatrixStride, unused
+                n,        // srcDValue
+                1,        // dstNzC0Stride
+                1,        // dstNzNStride
+                0         // dstNzMatrixStride, unused
            );
            SetFlag(EVENT_ID0);
            WaitFlag(EVENT_ID0);
-            WaitFlag(EVENT_ID1);
-            CopyCubfToBt(((uint64_t)bias_bt), ((__cbuf__ int32_t *)bias_l1),
-                (uint16_t)0ULL, 1, (n_actual * 4 + 63) / 64, 0, 0);
-            SetFlag(EVENT_ID1);
-            SetFlag(EVENT_ID1);
-            WaitFlag(EVENT_ID1);
+            WaitFlag(EVENT_ID1); // int8 path: MTE1 must wait for FIX
+            CopyCubfToBt(((uint64_t)bias_bt), ((__cbuf__ int32_t *)bias_l1),
+                (uint16_t)0ULL, 1, (n_actual * 4 + 63) / 64, 0, 0);
+            SetFlag(EVENT_ID1);  // bias ready, mte2 can begin move A/B or scalar
+            SetFlag(EVENT_ID1);  // bias ready, mmad can begin
+            WaitFlag(EVENT_ID1); // A/B or scalar waits for the bias move from L1 to BT
        }

        auto gm_src_a = gm_a_src_tmp + offset_a;
        auto gm_src_b = gm_b_src + offset_b;
        WaitFlag(event_id);
+        // *** load matrix A to L1
        if (m == 1 || m_actual == 1 && !TA) {
            CopyGmToCbuf(
                l1_buf_a,
                gm_src_a,
-                0,
-                1,
-                k_round / block_size,
-                0,
-                0,
-                PAD_NONE
+                0,                    // sid
+                1,                    // nBurst
+                k_round / block_size, // lenBurst
+                0,                    // srcGap
+                0,                    // dstGap
+                PAD_NONE              // padMode
            );
        } else {
            if (TA) {
@@ -442,6 +466,8 @@ public:
            }
        }
        SetFlag(event_id);
+
+        // *** load matrix B to L1
        WaitFlag(event_id + 2);
        if (TB) {
            auto src_len = k;
@@ -450,7 +476,7 @@ public:
            }
            if (weight_nz) {
                int32_t num_col = DivCeil(k_actual, block_size);
-                CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE);
+                CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE);
            } else {
                CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, n_actual, k_actual, src_len, n_round);
            }
@@ -461,7 +487,7 @@ public:
            }
            if (weight_nz) {
                int32_t num_col = DivCeil(n_actual, block_size);
-                CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, k_actual, k_align16 - k_actual, k_round - k_actual, PAD_NONE);
+                CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, k_actual, k_align16 - k_actual, k_round - k_actual, PAD_NONE);
            } else {
                CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, k_actual, n_actual, src_len, k_round);
            }
@@ -471,6 +497,7 @@ public:
        int mte1_mad_ping_flag = 1;

        for (int64_t k_idx = 0; k_idx < k_loop; k_idx++) {
+
            int32_t k_actual = (k_idx == (k_loop - 1)) ?
(k - k_idx * k0) : k0; int32_t k_round = DivCeil(k_actual, block_size) * block_size; int32_t k_part_loop = DivCeil(k_actual, k_part_len); @@ -481,16 +508,16 @@ public: if (k_idx < k_loop - 1) { if (TA) { - if (aligned_a == 1){ + if (aligned_a == 1) { offset_a_next = batch_idx * k * m_align + (k_idx + 1) * k0 * m_align + m_idx * m0; } else { offset_a_next = batch_idx * k * m + (k_idx + 1) * k0 * m + m_idx * m0; } } else { - if (aligned_a == 1){ + if (aligned_a == 1) { offset_a_next = batch_idx * m * k_align + m_idx * m0 * k_align + (k_idx + 1) * k0; } else { - offset_a_next = batch_idx * m * k + m_idx * m0 * k + (k_idx + 1) * k0; + offset_a_next = batch_idx * m * k + m_idx * m0 * k + (k_idx + 1) * k0; } } if (TB) { @@ -500,7 +527,7 @@ public: if (weight_nz) { offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 * n_align16 + n_idx * n0 * block_size; } else { - offset_b_next = batch_idx * n * k + n_idx * n0 * k + (k_idx + 1) * k0; + offset_b_next = batch_idx * n * k + n_idx * n0 * k + (k_idx + 1) * k0; } } } else { @@ -511,11 +538,11 @@ public: offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * block_size + n_idx * n0 * k_align16; } else { offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * n + n_idx * n0; - } + } } } - int32_t k_actual_next = ((k_idx + 1) == (k_loop -1)) ? (k - (k_idx + 1) * k0) : k0; + int32_t k_actual_next = ((k_idx + 1) == (k_loop - 1)) ? (k - (k_idx + 1) * k0) : k0; int32_t k_round_next = DivCeil(k_actual_next, block_size) * block_size; __cbuf__ MmadDtype *l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; @@ -526,16 +553,17 @@ public: auto gm_src_b = gm_b_src + offset_b_next; WaitFlag(event_id_next); + // *** load matrix A to L1 if (m == 1 || m_actual == 1 && !TA) { CopyGmToCbuf( l1_buf_a_next, gm_src_a, - 0, - 1, - k_round_next / block_size, - 0, - 0, - PAD_NONE + 0, // sid + 1, // nBurst + k_round_next / block_size, // lenBurst + 0, // srcGap + 0, // dstGap + PAD_NONE // padMode ); } else { if (TA) { @@ -544,18 +572,19 @@ public: src_len = m_align; } CopyGmToL1Nd2zN::move( - l1_buf_a_next, gm_src_a, k_actual_next, m_actual, src_len, k_round_next); + l1_buf_a_next, gm_src_a, k_actual_next, m_actual, src_len, k_round_next); } else { auto src_len = k; if (aligned_a == 1) { src_len = k_align; } CopyGmToL1Nd2zN::move( - l1_buf_a_next, gm_src_a, m_actual, k_actual_next, src_len, m_round); + l1_buf_a_next, gm_src_a, m_actual, k_actual_next, src_len, m_round); } } SetFlag(event_id_next); + // *** load matrix B to L1 WaitFlag(event_id_next + 2); if (TB) { auto src_len = k; @@ -564,7 +593,7 @@ public: } if (weight_nz) { int32_t num_col = DivCeil(k_actual_next, block_size); - CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); } else { CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, n_actual, k_actual_next, src_len, n_round); } @@ -575,7 +604,7 @@ public: } if (weight_nz) { int32_t num_col = DivCeil(n_actual, block_size); - CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, k_actual_next, k_align16 - k_actual_next, k_round_next - k_actual_next, PAD_NONE); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, k_actual_next, k_align16 - k_actual_next, k_round_next - k_actual_next, PAD_NONE); } else { CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, k_actual_next, n_actual, src_len, k_round_next); } @@ -588,26 +617,27 @@ public: k_part_len : k_round - k_part_idx * k_part_len; 
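                // Illustration with assumed values: k_part_loop = DivCeil(k_actual, k_part_len) and the last
                // part takes the remainder. E.g. k_actual = 100, block_size = 16 (so k_round = 112) and
                // k_part_len = 64 give k_part_loop = 2; part 0 gets k0_round = 64, part 1 gets
                // k0_round = 112 - 64 = 48, while k0_actual below splits the unpadded length the same
                // way (64, then 36).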
int32_t k0_actual = (k_part_idx < k_part_loop - 1) ? k_part_len : k_actual - k_part_idx * k_part_len; - + auto mte1_mad_event_id = mte1_mad_ping_flag ? EVENT_ID0 : EVENT_ID1; auto l0a_buf = l0a_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; auto l0b_buf = l0b_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; + // *** load matrix A from L1 to L0A if (k_part_idx == 0) { WaitFlag(event_id); } WaitFlag(mte1_mad_event_id); if (m == 1 || m_actual == 1 && !TA) { LoadCbufToCa( - l0a_buf, - l1_buf_a + k_part_idx * k_part_len, - 0, - DivCeil(k0_round, cube_matrix_size), - 1, - 0, - 0, - false, - inc + l0a_buf, + l1_buf_a + k_part_idx * k_part_len, + 0, // baseIdx + DivCeil(k0_round, cube_matrix_size), // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t ); } else { if (TA) { @@ -617,65 +647,66 @@ public: l0a_buf + i * k0_round * BLOCK_SIZE_32, l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_32 + i * k_round * BLOCK_SIZE_32, - 0, - k0_round / BLOCK_SIZE_32, - 1, - 0, - 0, - k0_round / BLOCK_SIZE_32 - 1 + 0, // baseIdx + k0_round / BLOCK_SIZE_32, // repeat + 1, // srcStride + 0, // dstStride + 0, // addrmode + k0_round / BLOCK_SIZE_32 - 1 // dstFracStride ); } } else { for (int i = 0; i < m_round / BLOCK_SIZE_16; i++) { LoadCbufToCa( - l0a_buf + i * k0_round * BLOCK_SIZE_16, - l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_16 + - i * k_round * BLOCK_SIZE_16, - 0, - k0_round /BLOCK_SIZE_16, - 1, - 0, - 0, - true, - inc + l0a_buf + i * k0_round * BLOCK_SIZE_16, + l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_16 + + i * k_round * BLOCK_SIZE_16, + 0, // baseIdx + k0_round / BLOCK_SIZE_16, // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + true, // transpose + inc // addr_cal_mode_t ); } - } + } } else { - for (int32_t i = 0; i < k0_round / block_size; i++) { + for (int32_t i = 0; i < k0_round / block_size; i++) { LoadCbufToCa( l0a_buf + i * cube_matrix_size, l1_buf_a + k_part_idx * k_part_len * m_round + i * m_round * block_size, - 0, - m_round / BLOCK_SIZE_16, - 1, - k0_round / block_size - 1, - 0, - false, - inc + 0, // baseIdx + m_round / BLOCK_SIZE_16, // repeat + 1, // srcStride + k0_round / block_size - 1, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t ); } } } - if (k_part_idx == k_part_loop -1) { + if (k_part_idx == k_part_loop - 1) { SetFlag(event_id); } + // *** load matrix B from L1 to L0B if (k_part_idx == 0) { WaitFlag(event_id + 2); } if (TB) { LoadCbufToCb( - l0b_buf, - l1_buf_b + k_part_idx * k_part_len * n_round, - 0, - k0_round * n_round / cube_matrix_size, - 1, - 0, - 0, - false, - inc + l0b_buf, + l1_buf_b + k_part_idx * k_part_len * n_round, + 0, // baseIdx + k0_round * n_round / cube_matrix_size, // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t ); } else { if (IS_INT8) { @@ -683,26 +714,26 @@ public: LoadCbufToCbTranspose( l0b_buf + i * ((n_actual + 15) / 16 * 16) * BLOCK_SIZE_32, l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_32) * BLOCK_SIZE_32, - 0, - n_round / BLOCK_SIZE_32, - k_round / BLOCK_SIZE_32, - 1, - 0, - 0 + 0, // baseIdx + n_round / BLOCK_SIZE_32, // repeat + k_round / BLOCK_SIZE_32, // srcStride + 1, // dstStride + 0, // addrmode + 0 // dstFracStride ); } } else { for (int32_t i = 0; i < k0_round / BLOCK_SIZE_16; i++) { LoadCbufToCb( - l0b_buf + i * n_round * BLOCK_SIZE_16, - l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_16) * BLOCK_SIZE_16, - 0, - n_round / BLOCK_SIZE_16, - k_round / 
BLOCK_SIZE_16,
-                        0,
-                        0,
-                        true,
-                        inc
+                        l0b_buf + i * n_round * BLOCK_SIZE_16,
+                        l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_16) * BLOCK_SIZE_16,
+                        0,                       // baseIdx
+                        n_round / BLOCK_SIZE_16, // repeat
+                        k_round / BLOCK_SIZE_16, // srcStride
+                        0,                       // dstStride
+                        0,                       // sid
+                        true,                    // transpose
+                        inc                      // addr_cal_mode_t
                    );
                }
            }
@@ -721,7 +752,7 @@ public:

            if (IS_INT8 && has_offset) {
                if (init_c) {
-                    WaitFlag(EVENT_ID1);
+                    WaitFlag(EVENT_ID1); // wait for the bias move from L1 to BT
                }
                PipeBarrier();
                if (m != 1 && m_actual == 1 && TA) {
@@ -729,26 +760,26 @@ public:
                        (__ca__ int8_t *)l0a_buf,
                        (__cb__ int8_t *)l0b_buf,
                        ((uint64_t)bias_bt),
-                        16,
-                        k0_actual,
-                        n_actual,
-                        0,
-                        0,
-                        init_c,
-                        0
+                        16,        // m
+                        k0_actual, // k
+                        n_actual,  // n
+                        0,         // unitFlag
+                        0,         // kDirectionAlign
+                        init_c,    // cmatrixSource: add C from BT
+                        0          // cmatrixInitVal
                    );
                } else {
                    mad((__cc__ int32_t *)l0c_buf,
                        (__ca__ int8_t *)l0a_buf,
                        (__cb__ int8_t *)l0b_buf,
                        ((uint64_t)bias_bt),
-                        m_actual,
-                        k0_actual,
-                        n_actual,
-                        0,
-                        0,
-                        init_c,
-                        0
+                        m_actual,  // m
+                        k0_actual, // k
+                        n_actual,  // n
+                        0,         // unitFlag
+                        0,         // kDirectionAlign
+                        init_c,    // cmatrixSource: add C from BT
+                        0          // cmatrixInitVal
                    );
                }
            } else {
@@ -757,25 +788,25 @@ public:
                    mad(l0c_buf,
                        l0a_buf,
                        l0b_buf,
-                        16,
-                        k0_actual,
-                        n_actual,
-                        0,
-                        0,
-                        0,
-                        init_c
+                        16,        // m
+                        k0_actual, // k
+                        n_actual,  // n
+                        0,         // unitFlag
+                        0,         // kDirectionAlign
+                        0,         // cmatrixSource
+                        init_c     // cmatrixInitVal
                    );
                } else {
                    mad(l0c_buf,
                        l0a_buf,
                        l0b_buf,
-                        m_actual,
-                        k0_actual,
-                        n_actual,
-                        0,
-                        0,
-                        0,
-                        init_c
+                        m_actual,  // m
+                        k0_actual, // k
+                        n_actual,  // n
+                        0,         // unitFlag
+                        0,         // kDirectionAlign
+                        0,         // cmatrixSource
+                        init_c     // cmatrixInitVal
                    );
                }
            }
@@ -787,6 +818,7 @@ public:
            ping_flag = 1 - ping_flag;
        }

+
        if (IS_INT8 && std::is_same::value &&
            (dequant_granularity == QuantGranularity::PER_CHANNEL || dequant_granularity == QuantGranularity::PER_TOKEN)) {
            WaitFlag(EVENT_ID0);
@@ -814,29 +846,29 @@ public:
                0
            );
            PipeBarrier();
-        }
+        }
    }

    inline __aicore__ void MoveL0CToGM(__gm__ OutDtype *gm_dst, int64_t offset_c, int32_t m_actual, int32_t n_actual,
        int32_t src_stride, int32_t dst_stride) {
#if (__CCE_AICORE__ == 220)
        FixpipeParamsV220 FixpipeParams(
-            n_actual,
-            m_actual,
-            src_stride,
-            dst_stride,
-            false
+            n_actual,   // nSize = nSizeIn;
+            m_actual,   // mSize = mSizeIn;
+            src_stride, // srcStride = srcStrideIn;
+            dst_stride, // dstStride = dstStrideIn;
+            false       // reluEn = reluEnIn;
        );
#elif (defined(__DAV_C310__))
        FixpipeParamsC310 FixpipeParams(
-            n_actual,
-            m_actual,
-            src_stride,
-            dst_stride
+            n_actual,   // nSize = nSizeIn;
+            m_actual,   // mSize = mSizeIn;
+            src_stride, // srcStride = srcStrideIn;
+            dst_stride  // dstStride = dstStrideIn;
        );
#endif
        uint64_t src_addr = reinterpret_cast(l0c_buf);
        LocalTensor srcTensor = CreateLocalTensor
-            (reinterpret_cast(l0c_buf), static_cast(TPosition::CO1));
+            (reinterpret_cast(l0c_buf), static_cast(TPosition::CO1));
        GlobalTensor dstTensor = CreateGlobalTensor(gm_dst + offset_c);

        if (IS_INT8) {
@@ -880,7 +912,7 @@ public:
        SetFlag(EVENT_ID0);
        SetFlag(EVENT_ID0);
        SetFlag(EVENT_ID1);
-        SetFlag(EVENT_ID1);
+        SetFlag(EVENT_ID1);
    }

    inline __aicore__ void Endflags() {
@@ -906,14 +938,15 @@ public:
            int64_t batch_idx = loop_idx / (m_loop * n_loop);
            int64_t m_idx, n_idx;
            GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
-            int32_t m_actual = (m_idx == (m_loop -1)) ? (m - m_idx * m0) : m0;
-            int32_t n_actual = (n_idx == (n_loop -1)) ? (n - n_idx * n0) : n0;
+            int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
+            int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
            CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_src);
            SetFlag(EVENT_ID0);
            WaitFlag(EVENT_ID0);

            int64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
+            // copy from L0C to gm
            MoveL0CToGM(gm_c, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n);
        }
        Endflags();
@@ -945,10 +978,10 @@ public:
                if (loop_idx >= core_loop)
                    break;
                int64_t batch_idx = loop_idx / (m_loop * n_loop);
-                int64_t m_idx , n_idx;
+                int64_t m_idx, n_idx;
                GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
-                int32_t m_actual = (m_idx == (m_loop -1)) ? (m - m_idx * m0) : m0;
-                int32_t n_actual = (n_idx == (n_loop -1)) ? (n - n_idx * n0) : n0;
+                int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
+                int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
                CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_src);

                SetFlag(EVENT_ID0);
@@ -956,9 +989,14 @@ public:

                int64_t offset_c;
                int32_t n_stride;
+                // if constexpr (IS_INT8 && std::is_same::value) {
+                //     offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0;
+                //     n_stride = n;
+                // } else {
                offset_c = flag_idx * m0 * loop_num_per_comm * n0 +
                    (loop_idx % loop_num_per_comm) * m0 * n0;
                n_stride = n0;
+                // }
                MoveL0CToGM(gm_peer_mem, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n_stride);
            }
            FFTSCrossCoreSync(2, flag_idx);
@@ -967,10 +1005,165 @@ public:
        PipeBarrier();
    }

-    inline __aicore__ void RunAllGatherMatmulReduceScatter() {
+    inline __aicore__ void RunMatmulReduceScatter() {
+        int32_t tail_m = (m / rank_size) % m0;
+        m_loop = m / rank_size / m0;
+        if (tail_m) {
+            m_loop += 1;
+        }
+        m_loop *= rank_size;
+        core_loop = batch_size * m_loop * n_loop;
+
+        InitFlags();
+
+        int32_t comm_num = DivCeil(core_loop, loop_num_per_comm);
+        // core_loop = batch_size * m_loop * n_loop = p_value * core_num * comm_num
+        int32_t m_loop_per_rank = m_loop / rank_size;
+        for (int32_t comm_idx = 0; comm_idx < comm_num; comm_idx++) {
+            int cur_p_value = p_value;
+            int32_t actual_loop_num = loop_num_per_comm;
+            int32_t flag_idx = is_91093 ? comm_idx % BLOCK_COUNT_3 : comm_idx % MAX_BLOCK_COUNT;
+            if (comm_idx == comm_num - 1) {
+                actual_loop_num = core_loop - comm_idx * loop_num_per_comm;
+            }
+            WaitEvent(flag_idx);
+            // each round covers core_num * p_value tiles
+            for (int32_t p = 0; p < p_value; p++) { // each core computes p_value tiles per communication round
+                int loop_idx = comm_idx * p_value * core_num + p * core_num + core_idx;
+                if (loop_idx >= core_loop)
+                    break;
+                int64_t batch_idx = loop_idx / (m_loop * n_loop);
+                int32_t in_batch_idx = loop_idx % (m_loop * n_loop);
+                int64_t rank_idx = in_batch_idx % rank_size;
+                int32_t in_rank_idx = in_batch_idx / rank_size;
+
+                int64_t m_idx, n_idx;
+                GetBlockIdx(in_rank_idx, m_loop_per_rank, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
+
+                int32_t m_actual = (m_idx == (m_loop_per_rank - 1)) ? (m / rank_size - m_idx * m0) : m0;
+                int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+                __gm__ MmadDtype *gm_a_rank_st;
+                if (TA) {
+                    gm_a_rank_st = gm_a_src + rank_idx * m / rank_size;
+                } else {
+                    gm_a_rank_st = gm_a_src + rank_idx * m / rank_size * k_align;
+                }
+                CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_rank_st);
+
+                SetFlag(EVENT_ID0);
+                WaitFlag(EVENT_ID0);
+
+                int64_t offset_c;
+                int32_t dst_stride;
+                __gm__ OutDtype *gm_dst = nullptr;
+                if (rank_idx == rank && !(IS_INT8 && (dequant_granularity == QuantGranularity::PER_TOKEN || std::is_same::value))) {
+                    offset_c = batch_idx * m * n / rank_size + m_idx * m0 * n + n_idx * n0;
+                    gm_dst = gm_c;
+                    dst_stride = n;
+                } else {
+                    int64_t rank_offset_c = (loop_idx % rank_size) * (actual_loop_num / rank_size) * m0 * n0;
+                    offset_c = flag_idx * m0 * loop_num_per_comm * n0 +
+                        rank_offset_c +
+                        ((loop_idx % loop_num_per_comm) / rank_size) * m0 * n0;
+                    gm_dst = gm_peer_mem;
+                    dst_stride = n0;
+                }
+                // copy from L0C to gm
+                MoveL0CToGM(gm_dst, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, dst_stride);
+            }
+            FFTSCrossCoreSync(2, flag_idx);
+        }
+
+        Endflags();
+        PipeBarrier();
+    }
+
+    inline __aicore__ void DoLocalMatmul() {
+        for (int32_t loop_idx = 0; loop_idx < core_loop; loop_idx++) {
+            if (loop_idx % core_num != core_idx) {
+                continue;
+            }
+            int64_t batch_idx = loop_idx / (m_loop * n_loop);
+
+            int64_t m_idx, n_idx;
+            GetBlockIdx(loop_idx, m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
+
+            int32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0;
+            int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+
+            CalLoop(batch_idx, m_idx, n_idx, m_actual, n_actual, gm_a_src);
+            SetFlag(EVENT_ID0);
+            WaitFlag(EVENT_ID0);
+
+            int64_t offset_c = batch_idx * m * n * rank_size + (rank * m + m_idx * m0) * n + n_idx * n0;
+            // copy from L0C to gm
+            MoveL0CToGM(gm_c, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n);
+        }
+    }
+
+    inline __aicore__ void RunAllGatherMatmul() {
+        InitFlags();
+        // local rank first: compute its own m_loop * n_loop tiles
+        DoLocalMatmul();
+
+        int64_t gm_a_pingpong_size = m0 * k_align * p_value * rank_size;
+        int32_t comm_count = DivCeil(batch_size * m_loop, p_value);
+        for (int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++) {
+            uint64_t flag_id = comm_idx % MAX_BLOCK_COUNT;
+            if (is_91093) {
+                flag_id = comm_idx % 3;
+            }
+            int32_t actual_p_value = p_value;
+            if (comm_idx == comm_count - 1) {
+                actual_p_value = m_loop - comm_idx * p_value;
+            }
+            WaitEvent(flag_id);
+
+            // other ranks: p_value * n_loop * (rank_size - 1) tiles per round
+            int32_t actual_loop_num_in_other_rank = actual_p_value * (rank_size - 1) * n_loop;
+            for (int32_t loop_offset = 0; loop_offset < actual_loop_num_in_other_rank; loop_offset++) {
+                int32_t loop_idx = core_loop + comm_idx * p_value * n_loop * (rank_size - 1) + loop_offset;
+                if (loop_idx % core_num != core_idx) {
+                    continue;
+                }
+                int64_t batch_idx = loop_idx / (m_loop * n_loop * rank_size);
+
+                int64_t m_idx, n_idx;
+                GetBlockIdx(loop_offset, actual_p_value * (rank_size - 1), n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
+
+                int32_t m_idx_in_rank = m_idx % actual_p_value;
+                int64_t m_idx_in_c = comm_idx * p_value + m_idx_in_rank;
+                int32_t m_actual = (m_idx_in_c == (m_loop - 1)) ? (m - m_idx_in_c * m0) : m0;
+                int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+                int64_t rank_idx = m_idx / actual_p_value;
+                if (rank_idx >= rank) {
+                    rank_idx += 1;
+                }
+                __gm__ MmadDtype *gm_peer_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) +
+                    flag_id * gm_a_pingpong_size +
+                    rank_idx * p_value * m0 * k_align;
+                CalLoop(batch_idx, m_idx_in_rank, n_idx, m_actual, n_actual, gm_peer_mem_st);
+                SetFlag(EVENT_ID0);
+                WaitFlag(EVENT_ID0);
+                int64_t offset_c = batch_idx * m * n * rank_size + (rank_idx * m + m_idx_in_c * m0) * n + n_idx * n0;
+                // copy from L0C to gm
+                MoveL0CToGM(gm_c, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n);
+            }
+            FFTSCrossCoreSync(2, flag_id);
+        }
+        Endflags();
+        PipeBarrier();
+    }
+
+
+    // p_value means different things in RS and AG: in RS, each core runs p_value compute steps and then
+    // communicates once; in AG, a compute step runs after gathering p_value rows from every other rank.
+    // In 2D-TP, p_value has the same meaning as in AG.
    inline __aicore__ void RunAllGatherMatmulReduceScatter() {
+        InitFlags();

-        int32_t twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim;
+        int32_t twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim;
        int64_t gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim;
        int64_t gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0;
        int32_t m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim);
@@ -979,10 +1172,10 @@ public:
        int32_t loop_num_per_cal = p_value * n_loop * twod_big_dim;
        int32_t ag_part_dim = twod_big_dim / ag_dim;
        int32_t rs_part_dim = twod_big_dim / rs_dim;
-        for (int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++) {
+        for (int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++) {
            uint64_t flag_id = comm_idx % MAX_BLOCK_COUNT;
            int32_t actual_p_value = p_value;
-            if (comm_idx == comm_count - 1) {
+            if (comm_idx == comm_count - 1) {
                actual_p_value = m_loop_per_bigdim - comm_idx * p_value;
            }
            WaitEvent(flag_id);
@@ -1005,6 +1198,11 @@ public:
            int32_t m_actual = (m_idx_in_c == (m_loop_per_bigdim - 1)) ? (m_per_bigdim - m_idx_in_c * m0) : m0;
            int32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
            int64_t bigdim_idx = m_idx / actual_p_value;
+            // if bigdim = rs: ag_src_idx = bigdim_idx / (bigdim / agdim), ag_part_idx = bigdim_idx % (bigdim / agdim)
+            // when rs_dim > ag_dim, AG pulls ag_part_dim blocks (each block is p_value rows) from every card;
+            // the current core pulls block ag_part_idx from card ag_src_idx
+            // when rs_dim ...
                gm_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) + (comm_idx % MAX_BLOCK_COUNT) * gm_a_pingpong_size + bigdim_idx * p_value * m0 * k_align;
-            } else {
+            } else {
                gm_mem_st = gm_a_src + (comm_idx * p_value) * m0 * k_align + ag_part_idx * m_per_bigdim * k_align;
+                // comm_idx * p_value selects the position inside a block; ag_part_idx * m_per_bigdim selects which block
            }
            CalLoop(batch_idx, m_idx_in_rank, n_idx, m_actual, n_actual, gm_mem_st);
@@ -1028,19 +1227,21 @@ public:
            int64_t dst_stride;
            __gm__ OutDtype *gm_dst = nullptr;
-            if (rs_dst_idx != rs_rank_idx) {
+            // each card's final output size is m * ag_dim / rs_dim
+            if (rs_dst_idx != rs_rank_idx) { // RS needed: write to this card's shared mem
                offset_c = gm_c_pingpong_size * (comm_idx % MAX_BLOCK_COUNT) +
                    (m_idx * n_loop + n_idx) * m0 * n0 + LCAL_2DTP_C_OFFSET;
                gm_dst = gm_peer_mem;
                dst_stride = n0;
-            } else {
+            } else { // no RS needed: write to this card's gm_c; the batch size may be wrong here
                offset_c = rs_part_idx * m_per_bigdim * n + m_idx_in_c * m0 * n + n_idx * n0;
                gm_dst = gm_c;
                dst_stride = n;
            }
+            // copy from L0C to gm
            MoveL0CToGM(gm_dst, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, dst_stride);
        }
        FFTSCrossCoreSync(2, flag_id);
@@ -1051,14 +1252,20 @@ public:
    }

    inline __aicore__ void Run() {
-        if (RUN_TYPE == PPMATMUL_RUN_MATMUL_ALLREDUCE) {
+        if (RUN_TYPE == PPMATMUL_RUN_PURE_MATMUL) {
+            RunPureMatmul();
+        } else if (RUN_TYPE == PPMATMUL_RUN_MATMUL_ALLREDUCE) {
            if (withSerialMode) {
                gm_c = gm_peer_mem;
                RunPureMatmul();
            } else {
                RunMatmulAllReduce();
            }
-        } else if (RUN_TYPE == PPMATMUL_RUN_ALL_GATHER_MATMUL_REDUCE_SCATTER) {
+        } else if (RUN_TYPE == PPMATMUL_RUN_MATMUL_REDUCE_SCATTER) {
+            RunMatmulReduceScatter();
+        } else if (RUN_TYPE == PPMATMUL_RUN_ALL_GATHER_MATMUL) {
+            RunAllGatherMatmul();
+        } else if (RUN_TYPE == PPMATMUL_RUN_ALL_GATHER_MATMUL_REDUCE_SCATTER) {
+            RunAllGatherMatmulReduceScatter();
        }
    }
@@ -1082,9 +1289,9 @@ protected:
    __cc__ T_ACCUM *l0c_buf = reinterpret_cast<__cc__ T_ACCUM *>((uintptr_t) 0);

    __cbuf__ int64_t *scale_l1 = reinterpret_cast<__cbuf__ int64_t *>((uintptr_t) 0);
-    __fbuf__ int64_t *scale_FB = (__fbuf__ int64_t *)(0);
+    __fbuf__ int64_t *scale_FB = (__fbuf__ int64_t *)(0);

-    __cbuf__ int32_t * bias_l1 = reinterpret_cast<__cbuf__ int32_t *>((uintptr_t)0);
+    __cbuf__ int32_t *bias_l1 = reinterpret_cast<__cbuf__ int32_t *>((uintptr_t)0);
    uint16_t bias_bt = 0;
    bool has_offset{false};
    LcalWorkspaceInfo workspace_info;
@@ -1129,15 +1336,17 @@ protected:
    int32_t withSerialMode;
    int32_t buffer_size;

+    // AG+MM+RS
    int32_t ag_dim;
    int32_t rs_dim;
    bool inner_dim_is_Ag{false};
    int32_t ag_rank_idx;
    int32_t rs_rank_idx;
    bool weight_nz{false};
-
+    // sio
    bool is_91093{false};
    QuantGranularity dequant_granularity;
+
};

#elif __DAV_C220_VEC__
@@ -1147,5 +1356,116 @@
#include "coc_dequant_runner.cce"
#include "tiling_args.h"

+template
+inline __aicore__ void CocPureMatmulAiv(COC_ARGS_FUN(T))
+{
+    SetAtomicNone();
+    SetMaskNorm();
+    SetSyncBaseAddr((uint64_t)ffts_addr);
+    SetVectorMask((uint64_t)-1, (uint64_t)-1);
+    // get tiling args
+    auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm);
+    auto cocTilingData = &para->cocTilingData;
+    auto quantInfo = &para->quantInfo;
+    auto moeInfo = &para->moeInfo;
+
+    GlobalTensor commArgsGm;
+    commArgsGm.SetGlobalBuffer(reinterpret_cast<__gm__ int *>(coc_comm_args), 2);
+    uint32_t extraFlag = commArgsGm.GetValue(4);
+    bool is_deterministic =
(extraFlag & ExtraFlag::DETERMINISTIC) != 0; + + int32_t batch_size = cocTilingData->batchSize; + int32_t m = cocTilingData->m; + int32_t k = cocTilingData->k; + int32_t n = cocTilingData->n; + + int32_t m0 = cocTilingData->m0; + int32_t k0 = cocTilingData->k0; + int32_t n0 = cocTilingData->n0; + + int32_t m_loop = cocTilingData->mLoop; + int32_t k_loop = cocTilingData->kLoop; + int32_t n_loop = cocTilingData->nLoop; + + int32_t core_loop = cocTilingData->coreLoop; + int32_t swizzl_count = cocTilingData->swizzlCount; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t rank = cocTilingData->rank; + int32_t rank_size = cocTilingData->rankSize; + int32_t p_value = cocTilingData->pValue; + QuantGranularity dequant_granularity = static_cast(quantInfo->dequantGranularity); + int32_t dequant_group_size = quantInfo->dequantGroupSize; + QuantGranularity quant_granularity = static_cast(quantInfo->quantGranularity); + int32_t quant_group_size = quantInfo->quantGroupSize; + bool weight_nz = para->weightNz; + bool swizzl_direct = (tiling_key & SWIZZL_MASK) ? true : false; + bool trans_a = (tiling_key & TRANS_A_MASK) ? true : false; + bool trans_b = (tiling_key & TRANS_B_MASK) ? true : false; + bool have_bias = (tiling_key & BIAS_MASK) ? true : false; + bool is_int8 = (tiling_key & INT8_MASK) ? true : false; + + int32_t local_expert_nums = moeInfo->local_expert_nums; + int32_t EP = moeInfo->EP; + int32_t TP = moeInfo->TP; + int32_t is_moe_averaged = 0; + int32_t is_alltoallvc = 0; + int32_t is_moe = moeInfo->isMoe; + + + int32_t m_align, k_align, n_align; + if (is_int8) { + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + } else { + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + } + int32_t aligned_a, aligned_b; + AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); + + bool has_a_align = IsQuant(quant_granularity) || aligned_a; + bool has_b_align = IsQuant(dequant_granularity) && !is_int8 || aligned_b; + bool has_accum = IsQuant(dequant_granularity) && is_int8 && std::is_same::value; + bool has_dequant_param = (dequant_granularity == QuantGranularity::PER_TOKEN || dequant_granularity == QuantGranularity::PER_TENSOR); + bool hasFormatDequantScale = (dequant_granularity == QuantGranularity::PER_CHANNEL); + if (weight_nz) { + aligned_b = 0; + has_b_align = false; + } + auto workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align, n_align, + trans_a, trans_b, is_int8 ? 
1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param,
+        hasFormatDequantScale, is_deterministic, 0, is_alltoallvc, 0, 0, 0);
+
+    Preprocessor preprocessor;
+    PureMatmulBiasAdder add_bias_runner;
+    SerialDequantRunner serial_dequant_runner;
+
+    preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL());
+    preprocessor.Run();
+
+    if (has_accum) {
+        serial_dequant_runner.SetArgs(reinterpret_cast<__gm__ bfloat16_t *>(gm_out), workspace_info,
+            reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale),
+            reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset), dequant_granularity, batch_size, m, n);
+        serial_dequant_runner.FormatScale();
+    }
+
+    if (have_bias) {
+        add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL());
+    }
+
+    WaitEvent(AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID);
+
+    if (has_accum) {
+        serial_dequant_runner.Run();
+    }
+
+    if (have_bias) {
+        add_bias_runner.Run();
+    }
+}
+
#endif
#endif
\ No newline at end of file
diff --git a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce
index 1de57637..de0789d2 100644
--- a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce
+++ b/comm/lcal/src/kernels/coc_ppmatmul_switch.cce
@@ -10,36 +10,70 @@
#include "coc_internal.cce"
#include "coc_ppmatmul.cce"
#include "tiling_args.h"
+#include "coc_matmulmoe.cce"
+
#ifdef __DAV_C220_CUBE__

template
FORCE_INLINE_AICORE void RunPpMatmul(int32_t tiling_key, PP_MATMUL_AIC_ARGS_FUN(TData, TData))
{
-    PpMatmul matmul_z;
-    PpMatmul matmul_tb_z;
-    PpMatmul matmul_z_int8;
-    PpMatmul matmul_tb_z_int8;
-    int32_t tiling_key_sel = tiling_key & 0b011101;
-    switch (tiling_key_sel) {
-        case 0b000000 :
-            matmul_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
-            matmul_z.Run();
-            break;
-        case 0b001000 :
-            matmul_tb_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
-            matmul_tb_z.Run();
-            break;
-        case 0b000100 :
-            matmul_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
-            matmul_z_int8.Run();
-            break;
-        case 0b001100 :
-            matmul_tb_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
-            matmul_tb_z_int8.Run();
-            break;
-        default :
-            break;
+    constexpr bool IS_MOE = (RUN_TYPE == PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN) || (RUN_TYPE == PPMATMUL_RUN_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN)
+        || (RUN_TYPE == PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL);
+    if (IS_MOE) {
+        PpMatmulMoe matmul_z;
+        PpMatmulMoe matmul_tb_z;
+        PpMatmulMoe matmul_z_int8;
+        PpMatmulMoe matmul_tb_z_int8;
+        int32_t tiling_key_sel = tiling_key & 0b011101;
+        switch (tiling_key_sel) {
+            case 0b000000 :
+                matmul_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_z.Run();
+                break;
+            case 0b001000 :
+                matmul_tb_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_tb_z.Run();
+                break;
+            case 0b000100 :
+                matmul_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_z_int8.Run();
+                break;
+            case 0b001100 :
+                matmul_tb_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_tb_z_int8.Run();
+                break;
+            default :
+                break;
+        }
+    } else {
+        PpMatmul matmul_z;
+        PpMatmul matmul_tb_z;
+        PpMatmul matmul_z_int8;
+        PpMatmul matmul_tb_z_int8;
+        int32_t tiling_key_sel = tiling_key & 0b011101;
+        switch (tiling_key_sel) {
+            case 0b000000 :
+                matmul_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_z.Run();
+                break;
+            case 0b001000 :
+                matmul_tb_z.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_tb_z.Run();
+                break;
+            case 0b000100 :
+                matmul_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_z_int8.Run();
+                break;
+            case 0b001100 :
+                matmul_tb_z_int8.SetArgs(PP_MATMUL_AIC_ARGS_CALL());
+                matmul_tb_z_int8.Run();
+                break;
+            default :
+                break;
+        }
    }
+    // dispatch to the PpMatmul / PpMatmulMoe instance matching the tiling key
+
}

template
@@ -56,10 +90,12 @@ inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) {
    set_nd_para(config);
    SetSyncBaseAddr((uint64_t)ffts_addr);

+    // fetch tiling parameters
    auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm);
    auto cocTilingData = &para->cocTilingData;
    auto quantInfo = &para->quantInfo;
    auto twoDimTPInfo = &para->twoDimTPInfo;
+    auto moeInfo = &para->moeInfo;
    bool weight_nz = para->weightNz;

    int32_t batch_size = cocTilingData->batchSize;
@@ -96,6 +132,12 @@ inline __aicore__ void CocPpmatmulSwitchAic(COC_ARGS_FUN(TData)) {
    int32_t ag_dim = twoDimTPInfo->agDim;
    int32_t rs_dim = twoDimTPInfo->rsDim;
    bool inner_dim_is_Ag = twoDimTPInfo->innerDimIsAg;
+
+    int32_t local_expert_nums = moeInfo->local_expert_nums;
+    int32_t TP = moeInfo->TP;
+    int32_t EP = moeInfo->EP;
+    int32_t maxOutputSize = moeInfo->maxOutputSize;
+    int32_t is_moe = moeInfo->isMoe;

    RunPpMatmul(tiling_key, PP_MATMUL_AIC_ARGS_CALL());
    PipeBarrier();
diff --git a/comm/lcal/src/kernels/coc_preprocessor.cce b/comm/lcal/src/kernels/coc_preprocessor.cce
index a7cd1b57..ce0de567 100644
--- a/comm/lcal/src/kernels/coc_preprocessor.cce
+++ b/comm/lcal/src/kernels/coc_preprocessor.cce
@@ -291,6 +291,7 @@ public:
class FormatOffset {
public:
    static constexpr int32_t max_len = 49152;
+
    static inline __aicore__ void Loop(__gm__ int32_t *dst, int32_t offset, int32_t len)
    {
        static const auto ub_offset = reinterpret_cast<__ubuf__ int32_t *>((uintptr_t)0);
@@ -316,7 +317,6 @@ private:
    static constexpr uint8_t repeat = 255;
};

-
template <>
class Padder : public BasePadder {
public:
@@ -329,7 +329,7 @@ public:
    {
        this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
            m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
-
+
        if (gm_dequant_offset != nullptr && dequant_granularity == QuantGranularity::PER_TENSOR) {
            offset = *reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset);
            gm_format_dequant_offset = reinterpret_cast<__gm__ int32_t *>(workspace_info.gm_dequant_param);
@@ -389,7 +389,6 @@ private:
    bool need_format_dequant_offset{ false };
};

-
template
class DequantPadder : public BasePadder {
public:
@@ -401,7 +400,6 @@ public:

    inline __aicore__ void Run() {}
};

-
template <>
class DequantPadder : public BasePadder {
public:
@@ -413,7 +411,7 @@ public:
    {
        this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
            m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
-
+
        scale = *reinterpret_cast<__gm__ half *>(gm_dequant_scale);
        if (gm_dequant_offset) {
            offset = *reinterpret_cast<__gm__ half *>(gm_dequant_offset);
@@ -552,7 +550,7 @@ public:
    {
        this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
            m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
-
+
        if (gm_dequant_offset) {
            auto scale_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_scale);
            auto offset_dptr = reinterpret_cast<__gm__ bfloat16_t *>(gm_dequant_offset);
@@ -699,7 +697,6 @@ private:
    bool has_offset{ false };
};

-
template <>
class DequantPadder : public BasePadder {
public:
@@ -711,7 +708,7 @@ public:
    {
        this->BasePadder::SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
            m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
-
+
        gm_scale = reinterpret_cast<__gm__ half *>(gm_dequant_scale);
        if (gm_dequant_offset) {
            gm_offset = reinterpret_cast<__gm__ half *>(gm_dequant_offset);
@@ -760,11 +757,10 @@ private:
        auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416);
        auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480);
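        // Note: the __ubuf__ constants here and below are a hand-laid-out partition of the unified
        // buffer; the 56064-byte spacing (84480 - 28416 = 140544 - 84480) is the per-stage tile
        // budget (inferred from the constants themselves).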
auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); - for (it.InitBatchLoop(); !it.EndBatchLoop(); it.NextBatchLoop()) { for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto scale = gm_scale + it.n_cols_complete; @@ -785,6 +781,7 @@ private: CopyUB2UB(ub_quant_scale + row * n_blocks_per_row * Block32B::size, ub_quant_scale, 0, 1, n_blocks_per_row, 0, 0); /* sid */ } + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); @@ -804,7 +801,7 @@ private: WaitFlag(EVENT_ID2); Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID2); - + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID2); @@ -832,8 +829,8 @@ private: auto ub_quant_offset = reinterpret_cast<__ubuf__ half *>((uintptr_t)54272); auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); - auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); - + auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -921,7 +918,7 @@ private: auto ub_quant_scale = reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -967,7 +964,7 @@ private: WaitFlag(EVENT_ID2); Vmul(ub_output, ub_vconv, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID2); - + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID2); @@ -996,8 +993,7 @@ private: auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)89856); auto ub_add = reinterpret_cast<__ubuf__ half *>((uintptr_t)125440); auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); - - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1058,7 +1054,7 @@ private: WaitFlag(EVENT_ID3); Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); SetFlag(EVENT_ID3); - + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID3); @@ -1078,7 +1074,6 @@ private: bool has_offset{ false }; }; - template <> class DequantPadder : public BasePadder { public: @@ -1141,7 +1136,7 @@ private: auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1149,12 +1144,12 @@ private: for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto scale = gm_scale + it.n_cols_complete; - int32_t n_blocks_per_row_b16 = + int32_t n_blocks_per_row_b16 = Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); - int32_t n_blocks_per_row_b32 = + int32_t n_blocks_per_row_b32 = Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); uint8_t quant_repeat_b32 = static_cast( - DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); + 
DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -1172,6 +1167,7 @@ private: CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); } + for (it.InitRowLoop(max_rows_per_loop); !it.EndRowLoop(); it.NextRowLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); @@ -1201,7 +1197,7 @@ private: WaitFlag(EVENT_ID2); Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID2); - + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap); SetFlag(EVENT_ID2); @@ -1225,16 +1221,16 @@ private: int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); - auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 - + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1244,9 +1240,9 @@ private: auto scale = gm_scale + it.n_cols_complete; auto offset = gm_offset + it.n_cols_complete; - int32_t n_blocks_per_row_b16 = + int32_t n_blocks_per_row_b16 = Block32B::Count(it.n_cols_this_loop) * (sizeof(bfloat16_t) / sizeof(int8_t)); - int32_t n_blocks_per_row_b32 = + int32_t n_blocks_per_row_b32 = Block32B::Count(it.n_cols_this_loop) * (sizeof(float32_t) / sizeof(int8_t)); uint8_t quant_repeat_b32 = static_cast( DivCeil(n_blocks_per_row_b32, VEC_BLOCK_PER_REPEAT)); @@ -1268,7 +1264,7 @@ private: PipeBarrier(); for (int32_t row = 1; row < max_rows_per_loop; ++row) { - CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, + CopyUB2UB(ub_quant_scale + row * n_blocks_per_row_b32 * Block32B::size, ub_quant_scale, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); } @@ -1277,7 +1273,7 @@ private: PipeBarrier(); for (int32_t row = 1; row < max_rows_per_loop; ++row) { - CopyUB2UB(ub_quant_offset + row * n_blocks_per_row_b32 * Block32B::size, + CopyUB2UB(ub_quant_offset + row * n_blocks_per_row_b32 * Block32B::size, ub_quant_offset, /* sid */ 0, 1, n_blocks_per_row_b32, 0, 0); } @@ -1341,11 +1337,11 @@ private: auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)0); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)10496); auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)51712); - auto ub_mul= 
reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)72192); auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)113152); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)133632); auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)174592); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1406,7 +1402,7 @@ private: WaitFlag(EVENT_ID2); Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID2); - + WaitFlag(EVENT_ID2); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID2); @@ -1430,16 +1426,16 @@ private: int32_t max_cols_per_loop = (n_cols_round <= MAX_LEN) ? n_cols : MAX_LEN; auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)0); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)0); // multiplex ub_quant_scale_origin auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)18688); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); - auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin + auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)56064); // multiplex ub_quant_offset_origin auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)74752); auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)112384); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)149760); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 - + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)159232); // multiplex ub_conv_f32 + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1480,7 +1476,7 @@ private: WaitFlag(EVENT_ID1); for (int32_t block_col = 1; block_col < n_blocks_per_row_b16; ++block_col) { - CopyUB2UB(ub_quant_offset_origin + block_col * Block32B::size, + CopyUB2UB(ub_quant_offset_origin + block_col * Block32B::size, ub_quant_offset_origin, /* sid */ 0, it.n_rows_this_loop, 1, n_blocks_per_row_b16 - 1, n_blocks_per_row_b16 - 1); } @@ -1491,7 +1487,7 @@ private: for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); - + int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -1517,9 +1513,9 @@ private: PipeBarrier(); WaitFlag(EVENT_ID3); - Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8,RoundMode::CAST_RINT); + Vconv(ub_output, ub_mul, repeat_b32, 1, 1, 4, 8, RoundMode::CAST_RINT); SetFlag(EVENT_ID3); - + WaitFlag(EVENT_ID3); CopyUbufToGmAlign(dst, ub_output, it.n_rows_this_loop, it.n_cols_this_loop, dst_gap, ubuf_gap_b16); SetFlag(EVENT_ID3); @@ -1548,7 +1544,6 @@ public: int32_t batch_size, int32_t m, int32_t k, int32_t n, int32_t m_align, int32_t k_align, int32_t n_align, bool aligned_a, bool aligned_b, bool trans_a, bool trans_b, __gm__ uint8_t *gm_dequant_scale, __gm__ uint8_t *gm_dequant_offset, int32_t dequant_group_size) {} - inline __aicore__ void Run() {} }; @@ -1614,7 +1609,7 @@ private: auto ub_quant_scale = 
reinterpret_cast<__ubuf__ half *>((uintptr_t)28416); auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)84480); auto ub_output = reinterpret_cast<__ubuf__ half *>((uintptr_t)140544); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1930,7 +1925,7 @@ private: auto ub_vconv = reinterpret_cast<__ubuf__ half *>((uintptr_t)161024); int32_t group_block = Block32B::Count(group_size); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -1949,7 +1944,7 @@ private: for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); - + int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = n_cols_aligned - it.n_cols_this_loop; @@ -2008,7 +2003,7 @@ private: } Vadd(ub_add, ub_vconv, ub_quant_offset, repeat, 1, 1, 1, 8, 8, 8); is_after_mte2 = false; - + PipeBarrier(); WaitFlag(EVENT_ID2); Vmul(ub_output, ub_add, ub_quant_scale, repeat, 1, 1, 1, 8, 8, 8); @@ -2197,13 +2192,13 @@ private: auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)0); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)34048); auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)68096); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)102144); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)119168); auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); @@ -2444,16 +2439,16 @@ private: auto ub_quant_offset = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)0); auto ub_quant_scale = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)34048); auto ub_add = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)68096); - auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add + auto ub_vconv_f16 = reinterpret_cast<__ubuf__ float16_t *>((uintptr_t)68096); // multiplex ub_add auto ub_output = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)102144); auto ub_quant_offset_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)119168); auto ub_quant_scale_origin = reinterpret_cast<__ubuf__ bfloat16_t *>((uintptr_t)136192); auto ub_input = reinterpret_cast<__ubuf__ int8_t *>((uintptr_t)153216); auto ub_vconv_f32 = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); - auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 + auto ub_mul = reinterpret_cast<__ubuf__ float32_t *>((uintptr_t)162560); // multiplex ub_vconv_f32 int32_t group_block = Block32B::Count(group_size); - + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); SetFlag(EVENT_ID2); @@ -2477,7 +2472,7 @@ private: for (it.InitColLoop(max_cols_per_loop); !it.EndColLoop(); it.NextColLoop()) { auto src = gm_src + it.src_offset(); auto dst = gm_dst + it.dst_offset(); - + int32_t src_gap = n_cols - it.n_cols_this_loop; int32_t dst_gap = 
n_cols_aligned - it.n_cols_this_loop;
@@ -2572,6 +2567,7 @@ private:
    bool has_offset{ false };
};

+
template
class Preprocessor {
public:
@@ -2601,8 +2597,9 @@ public:
        }
        LcalWorkspaceInfo workspace_info = GetLcalWorkspaceInfo(gm_workspace, batch_size, m, k, n, m_align, k_align,
            n_align, trans_a, trans_b, is_int8 ? 1 : 2, has_a_align, has_b_align, 0, has_accum, 0, has_dequant_param,
-            hasFormatDequantScale,is_deterministic);
-
+            hasFormatDequantScale, is_deterministic, is_moe, is_alltoallvc, EP, local_expert_nums, m * EP * TP);
+
+
        if (this->is_int8) {
            switch (this->dequant_granularity) {
                case QuantGranularity::PER_TENSOR:
@@ -2629,22 +2626,22 @@ public:
            switch (this->dequant_granularity) {
                case QuantGranularity::PER_TENSOR:
                    dequant_per_tensor_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
-                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
-                        gm_dequant_scale, gm_dequant_offset);
+                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+                        gm_dequant_scale, gm_dequant_offset);
                    return;
                case QuantGranularity::PER_CHANNEL:
                    dequant_per_channel_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
-                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
-                        gm_dequant_scale, gm_dequant_offset);
+                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+                        gm_dequant_scale, gm_dequant_offset);
                    return;
                case QuantGranularity::PER_GROUP:
                    dequant_per_group_padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
-                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
-                        gm_dequant_scale, gm_dequant_offset, dequant_group_size);
+                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b,
+                        gm_dequant_scale, gm_dequant_offset, dequant_group_size);
                    return;
                default:
                    padder.SetArgs(gm_a, gm_b, workspace_info, batch_size, m, k, n,
-                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
+                        m_align, k_align, n_align, aligned_a, aligned_b, trans_a, trans_b);
                    return;
            }
        }
diff --git a/comm/lcal/src/kernels/coc_pure_matmul.cce b/comm/lcal/src/kernels/coc_pure_matmul.cce
new file mode 100644
index 00000000..9922690d
--- /dev/null
+++ b/comm/lcal/src/kernels/coc_pure_matmul.cce
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifdef __CCE_KT_TEST__
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+
+#include "coc_ppmatmul_switch.cce"
+
+#ifdef __DAV_C220_CUBE__
+// LcalPureMatmul
+#define COC_PURE_MATMUL_FUNC_AUTO_DEF(type) \
+extern "C" __global__ __aicore__ void LcalPureMatmul_##type##_mix_aic(COC_ARGS_FUN(type)) { \
+    CocPpmatmulSwitchAic(COC_ARGS_CALL()); \
+}
+
+
+#elif __DAV_C220_VEC__
+// LcalPureMatmul
+#define COC_PURE_MATMUL_FUNC_AUTO_DEF(type) \
+extern "C" __global__ __aicore__ void LcalPureMatmul_##type##_mix_aiv(COC_ARGS_FUN(type)) { \
+    CocPureMatmulAiv(COC_ARGS_CALL()); \
+}
+#endif
+
+#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B supports bf16
+#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t)
+
+COC_TYPE_FUNC(COC_PURE_MATMUL_FUNC_AUTO_DEF);
+#endif
\ No newline at end of file
diff --git a/comm/lcal/src/kernels/coc_reduce_scatter.cce b/comm/lcal/src/kernels/coc_reduce_scatter.cce
new file mode 100644
index 00000000..ecf35301
--- /dev/null
+++ b/comm/lcal/src/kernels/coc_reduce_scatter.cce
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#ifdef __DAV_C220_VEC__
+#include "coc_internal.cce"
+#include "coc_comm_base.cce"
+#include "kernel_operator.h"
+using namespace AscendC;
+
+template
+class ReduceScatter : public CocCommBase {
+public:
+    __aicore__ explicit ReduceScatter() {};
+
+    FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)) {
+        CocCommBase::SetArgsForReduce(COC_ARGS_CALL());
+        preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL());
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL());
+        }
+        int32_t tail_m = (m / rank_size) % m0;
+        m_loop = m / rank_size / m0;
+        if (tail_m) {
+            m_loop += 1;
+        }
+        m_loop *= rank_size;
+        core_loop = batch_size * m_loop * n_loop;
+        cal_count = (core_loop + loop_num_per_comm - 1) / loop_num_per_comm; // cal_count compute rounds, one communication per round
+
+        need_dequant = workspace_info.gm_accum;
+        if (need_dequant) {
+            fused_dequant_runner.SetArgs(reinterpret_cast<__gm__ bfloat16_t *>(buff[rank]), workspace_info,
+                reinterpret_cast<__gm__ int64_t *>(gm_dequant_scale),
+                reinterpret_cast<__gm__ int32_t *>(gm_dequant_offset), dequant_granularity,
+                batch_size, m, n, m0, n0, m_loop, n_loop, core_loop, swizzl_direct,
+                swizzl_count, p_value, rank_size);
+        }
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]),
+                reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n,
+                m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size);
+        }
+
+    }
+
+    FORCE_INLINE_AICORE void StartBeforeFisrtStep(bool needAdd)
+    {
+        if (needAdd) {
+            SetAtomicAdd();
+            PipeBarrier();
+        }
+
+        SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+        SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+    }
+
+    FORCE_INLINE_AICORE void EndFirstStep(bool needAdd) {
+        WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+        WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+
+        if (needAdd) {
+            SetFlag(EVENT_ID0); // Scalar waits for MTE3
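+            // This Set/Wait pair stalls the scalar pipe until MTE3 has drained, so the
+            // SetAtomicNone() below cannot clear atomic-add mode while copies are still
+            // in flight (intent inferred from the surrounding flag usage).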
+
+    FORCE_INLINE_AICORE void StartBeforeFirstStep(bool needAdd)
+    {
+        if (needAdd) {
+            SetAtomicAdd();
+            PipeBarrier<PIPE_ALL>();
+        }
+
+        SetFlag<HardEvent::MTE3_MTE2>(EVENT_ID0); // MTE2 waits for MTE3
+        SetFlag<HardEvent::MTE3_MTE2>(EVENT_ID1); // MTE2 waits for MTE3
+    }
+
+    FORCE_INLINE_AICORE void EndFirstStep(bool needAdd)
+    {
+        WaitFlag<HardEvent::MTE3_MTE2>(EVENT_ID0); // MTE2 waits for MTE3
+        WaitFlag<HardEvent::MTE3_MTE2>(EVENT_ID1); // MTE2 waits for MTE3
+
+        if (needAdd) {
+            SetFlag<HardEvent::MTE3_S>(EVENT_ID0); // scalar waits for MTE3
+            WaitFlag<HardEvent::MTE3_S>(EVENT_ID0);
+            SetAtomicNone();
+            PipeBarrier<PIPE_ALL>();
+        }
+    }
+
+    FORCE_INLINE_AICORE void UbufToGm(int32_t m_offset, int32_t actual_m,
+        int32_t& actual_move_m, int32_t left_m,
+        int64_t batch_idx, int64_t m_idx, int64_t n_idx,
+        __ubuf__ T *ub_buff, int32_t actual_n)
+    {
+        // m0 - m_offset is the small tail left in the current tile; skip it when it lies beyond actual_m
+        if (m_offset < actual_m) {
+            actual_move_m = actual_m < m_offset + left_m ? actual_m - m_offset : left_m;
+            // if left_m is large, finish this tile now and continue with the next tile on the next pass;
+            // if left_m is small, copy only the left_m rows that are available
+            int64_t out_buff_offset = batch_idx * m * n / rank_size +
+                (m_idx * m0 + m_offset) * n + n_idx * n0;
+            CopyUbufToGmUnknown(ALIGN, gm_out + out_buff_offset, ub_buff, actual_move_m,
+                actual_n * sizeof(T), (n0 - actual_n) * sizeof(T) / 32,
+                (n - actual_n) * sizeof(T));
+        }
+    }
+    /*
+    Accumulates peer-memory data from the src rank into local GM on this rank,
+    converting the tile layout on the fly.
+    int32_t data_size_remain: amount of data to copy
+    __gm__ T *input: source address on the src rank
+    int32_t gm_offset: offset on the src rank
+    int32_t move_offset: logical offset used to derive the re-layout position in the output
+    int32_t loop_idx_st: loop index before the offset, used to locate the output on this rank
+    */
+    FORCE_INLINE_AICORE void FirstStepInOut(int32_t data_size_remain, __gm__ T *input,
+        int32_t gm_offset, int32_t move_offset, int32_t loop_idx_st)
+    {
+        // max_ub_ping_pong_size is always a multiple of n0, but not necessarily of m0 * n0
+        int32_t ping_pong_move_count = (data_size_remain + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
+
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_size = max_ub_ping_pong_size;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_size = data_size_remain - move_idx * max_ub_ping_pong_size;
+            }
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag<HardEvent::MTE3_MTE2>(event_id);
+            // the source matrix is a sequence of small m0 * n0 tiles; writing back requires re-layout
+            CopyGmToUbuf(ub_buff_st, input + gm_offset + move_idx * max_ub_ping_pong_size, 1,
+                actual_move_size * sizeof(T) / 32, 0, 0);
+            SetFlag<HardEvent::MTE2_MTE3>(event_id);
+            WaitFlag<HardEvent::MTE2_MTE3>(event_id);
+
+            int32_t move_num_offset = move_offset + move_idx * max_ub_ping_pong_size;
+            auto ub_buff = ub_buff_st;
+            int32_t left_m = actual_move_size / n0;
+            while (left_m > 0) {
+                int32_t loop_idx = loop_idx_st + (move_num_offset / (m0 * n0)) * rank_size;
+                int64_t batch_idx = loop_idx / (m_loop * n_loop);
+                int32_t in_batch_idx = loop_idx % (m_loop * n_loop);
+                int32_t in_rank_idx = in_batch_idx / rank_size;
+                int64_t m_idx, n_idx;
+                GetBlockIdx(in_rank_idx, m_loop / rank_size, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx);
+                int32_t actual_m = (m_idx == (m_loop / rank_size - 1)) ? (m / rank_size - m_idx * m0) : m0;
+                int32_t actual_n = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0;
+                int32_t m_offset = (move_num_offset % (m0 * n0)) / n0; // row position of the current start inside its tile
+                int32_t actual_move_m = m0 < m_offset + left_m ? m0 - m_offset : left_m;
+                UbufToGm(m_offset, actual_m, actual_move_m, left_m, batch_idx, m_idx, n_idx, ub_buff, actual_n);
+                left_m -= actual_move_m;
+                move_num_offset += actual_move_m * n0;
+                ub_buff += actual_move_m * n0;
+            }
+            SetFlag<HardEvent::MTE3_MTE2>(event_id);
+        }
+    }
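FirstStepInOut is a double-buffered copy: even and odd move indices use alternating UB buffer/event pairs so the next GM-to-UB load can overlap the previous UB-to-GM store. A minimal host-side model of just the chunking and buffer alternation, with invented sizes:

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t data_size_remain = 1000, max_ub_ping_pong_size = 384; // elements, assumed
    int32_t moves = (data_size_remain + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
    for (int32_t move_idx = 0; move_idx < moves; ++move_idx) {
        int32_t actual = (move_idx == moves - 1)
            ? data_size_remain - move_idx * max_ub_ping_pong_size
            : max_ub_ping_pong_size;
        int buffer = move_idx & 1; // alternates between the two UB buffer/event pairs
        printf("move %d: %d elements via UB buffer %d\n", move_idx, actual, buffer);
    }
    return 0;
}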
+
+    FORCE_INLINE_AICORE void FirstStepInOutWithSplit(int32_t rank_total, int32_t rank_offset,
+        int32_t loop_idx_st, int32_t data_loop_idx, bool isSio)
+    {
+        int32_t rank_per_core = isSio ? rank_size / 2 / comm_npu_split : rank_size / comm_npu_split;
+        int32_t before_core_offset = data_loop_idx * comm_data_split * len_per_loop;
+        int32_t core_rank_offset = (core_idx / comm_data_split) * rank_per_core;
+        int32_t core_offset = core_idx % comm_data_split * len_per_loop;
+        int32_t loop_total = rank_total - before_core_offset;
+
+        int32_t rank_buff_offset = rank_offset + before_core_offset + core_offset;
+
+        int32_t m_in_core = (core_offset >= loop_total) ? 0 :
+            ((core_offset + len_per_loop) > loop_total ?
+            loop_total - core_offset : len_per_loop);
+
+        for (int32_t rank_idx = 0; rank_idx < rank_per_core; rank_idx++) {
+            // On some servers GM initially holds stale data. In the reduce-scatter per-token dequant
+            // scenario all matmul results are written to peer memory, and the AIVs write GM with
+            // atomic adds, so accumulating onto stale data corrupts the result. Therefore the first
+            // copy overwrites instead of accumulating; atomic add is enabled from the second source on.
+            if ((is_int8 && (dequant_granularity == QuantGranularity::PER_TOKEN || std::is_same::value)) && !isSio && (rank_idx == 1)) {
+                SetAtomicAdd();
+                PipeBarrier<PIPE_ALL>();
+            }
+            int32_t rank_idx_rot = (rank_idx + core_idx) % rank_per_core;
+            int32_t real_rank_idx = core_rank_offset + rank_idx_rot;
+
+            real_rank_idx = isSio ? 2 * real_rank_idx + (rank % 2) : real_rank_idx;
+
+            if (real_rank_idx == rank && !need_dequant && dequant_granularity != QuantGranularity::PER_TOKEN) {
+                continue;
+            }
+
+            FirstStepInOut(m_in_core, buff[real_rank_idx], rank_buff_offset,
+                before_core_offset + core_offset, loop_idx_st);
+        }
+
+        if ((is_int8 && (dequant_granularity == QuantGranularity::PER_TOKEN || std::is_same::value)) && !isSio) {
+            SetFlag<HardEvent::MTE3_S>(EVENT_ID0); // scalar waits for MTE3
+            WaitFlag<HardEvent::MTE3_S>(EVENT_ID0);
+            SetAtomicNone();
+            PipeBarrier<PIPE_ALL>();
+        }
+    }
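The `(rank_idx + core_idx) % rank_per_core` rotation above staggers each core's sweep over the source ranks, so at any instant the cores are pulling from different peers instead of all hammering the same one. A standalone illustration (peer count assumed):

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t rank_per_core = 8; // assumed: 8 peers visited per core
    for (int32_t core_idx = 0; core_idx < 4; ++core_idx) {
        printf("core %d reads peers in order:", core_idx);
        for (int32_t rank_idx = 0; rank_idx < rank_per_core; ++rank_idx) {
            printf(" %d", (rank_idx + core_idx) % rank_per_core);
        }
        printf("\n");
    }
    return 0;
}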
+
+    FORCE_INLINE_AICORE void RunLegacy()
+    {
+        // Padding
+        preprocessor.Run();
+
+        ResetIpcFlags(2);
+        PipeBarrier<PIPE_ALL>();
+
+        // initially notify the AIC that the shared-memory blocks are free
+        int32_t max_flag_id = cal_count < MAX_BLOCK_COUNT ? cal_count : MAX_BLOCK_COUNT;
+        for (int64_t cal_idx = 0; cal_idx < max_flag_id; ++cal_idx) {
+            if (cal_idx * loop_num_per_comm + core_idx < core_loop) {
+                SetAicSync(cal_idx);
+            }
+        }
+        for (int32_t cal_idx = 0; cal_idx < cal_count; ++cal_idx) {
+            uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT;
+            int32_t actual_loop_num =
+                (cal_idx == cal_count - 1) ? (core_loop - cal_idx * loop_num_per_comm) : loop_num_per_comm;
+
+            m_per_rank = actual_loop_num * m0 / rank_size;
+            // wait for the AIC
+            if (core_idx < actual_loop_num) {
+                WaitEvent(flag_idx);
+            }
+            if (need_dequant) {
+                fused_dequant_runner.RunDequantReduceScatter(cal_idx);
+            }
+            if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+                SetAndWaitAivSync(flag_idx);
+                fused_pertoken_dequant_runner.RunDequantReduceScatter(cal_idx);
+            }
+            // synchronize between AIVs
+            SetAndWaitAivSync(flag_idx);
+
+            CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1);
+
+            SetAndWaitAivSync(flag_idx);
+            bool needAdd = !(is_int8 && (dequant_granularity == QuantGranularity::PER_TOKEN || std::is_same::value));
+            StartBeforeFirstStep(needAdd);
+
+            int32_t m_per_core = (m_per_rank * n0) / comm_data_split;
+            int32_t data_split_num = DivCeil(m_per_core, len_per_loop);
+
+            int32_t rank_offset = flag_idx * m0 * n0 * loop_num_per_comm + rank * m_per_rank * n0;
+            for (int32_t loop_idx = 0; loop_idx < data_split_num; loop_idx++) {
+                if (aiv_idx == 0 && core_idx < comm_npu_split * comm_data_split) {
+                    FirstStepInOutWithSplit(m_per_rank * n0, rank_offset, cal_idx * loop_num_per_comm, loop_idx, false);
+                }
+            }
+
+            EndFirstStep(needAdd);
+            SetAndWaitAivSync(flag_idx);
+
+            CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1);
+            // synchronize between AIVs
+            SetAndWaitAivSync(flag_idx);
+
+            // notify the AIC
+            SetAicSync(flag_idx);
+        }
+
+        ResetIpcFlags(2);
+
+        if (aiv_idx == 1 && core_idx < rank_size) {
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0);
+        }
+        PipeBarrier<PIPE_ALL>();
+
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.Run();
+        }
+    }
+
+    FORCE_INLINE_AICORE void DataCopySio(int32_t cal_idx_sio, int32_t len_per_rank)
+    {
+        int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_3;
+        int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM;
+        int32_t sio_core_idx = core_idx - core_count;
+        int32_t core_offset = sio_core_idx * len_per_core;
+        int32_t sio_peer_rank = rank ^ 1;
+        int32_t size_per_rank = gm_c_pingpong_size / rank_size;
+        // loop over all ranks; e.g. rank 0 reads parts 0, 2, 4, 6 from rank 1
+        for (int32_t src_rank = rank % 2; src_rank < rank_size; src_rank += 2) {
+            int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * size_per_rank + core_offset;
+            if (src_rank == rank) { // e.g. rank 0 reads part 0 from rank 1 and stores it straight to local GM
+                StartBeforeFirstStep(true);
+                FirstStepInOut(len_per_core,
+                    buff[sio_peer_rank] + flag_idx_sio * gm_c_pingpong_size + src_rank * size_per_rank,
+                    core_offset, core_offset, cal_idx_sio * loop_num_per_comm);
+                EndFirstStep(true);
+            } else { // e.g. rank 0 reads parts 2, 4, 6 from rank 1 and stores them to the same spot in peer memory
+                FirstStepInPeerMem(len_per_core, buff[sio_peer_rank] + peer_offset, buff[rank] + peer_offset, true);
+            }
+        }
+    }
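In DataCopySio, rank r pairs with rank r ^ 1 over the SIO link and walks only the source slots that share its parity. A standalone sketch of that sweep (ring size assumed):

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t rank_size = 8; // assumed: 8 ranks
    for (int32_t rank = 0; rank < 2; ++rank) {
        printf("rank %d (SIO peer %d) reads parts:", rank, rank ^ 1);
        for (int32_t src_rank = rank % 2; src_rank < rank_size; src_rank += 2) {
            printf(" %d%s", src_rank, src_rank == rank ? "(->local)" : "(->peer mem)");
        }
        printf("\n");
    }
    return 0;
}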
+
+    FORCE_INLINE_AICORE void RunWithSio()
+    {
+        // Padding
+        preprocessor.Run();
+
+        ResetIpcFlags(2);
+        PipeBarrier<PIPE_ALL>();
+
+        // initially notify the AIC that the shared-memory blocks are free
+        int32_t max_flag_id = cal_count < BLOCK_COUNT_3 ? cal_count : BLOCK_COUNT_3;
+        int32_t size_per_rank = gm_c_pingpong_size / rank_size;
+        for (int64_t cal_idx = 0; cal_idx < max_flag_id; ++cal_idx) {
+            SetAicSync(cal_idx);
+        }
+        int32_t tile_per_rank = loop_num_per_comm / rank_size;
+        for (int32_t cal_idx = 0; cal_idx < cal_count + 1; ++cal_idx) {
+            uint64_t flag_idx = cal_idx % BLOCK_COUNT_3;
+            int32_t hccs_idx = cal_idx - 1; // SIO first, HCCS one round behind
+            int32_t flag_idx_hccs = hccs_idx % BLOCK_COUNT_3;
+            int32_t tile_per_rank_sio =
+                (cal_idx == cal_count - 1) ? (core_loop - cal_idx * loop_num_per_comm) / rank_size : tile_per_rank;
+            int32_t tile_per_rank_hccs =
+                (hccs_idx == cal_count - 1) ? (core_loop - hccs_idx * loop_num_per_comm) / rank_size : tile_per_rank;
+
+            // wait for the AIC
+            if (cal_idx < cal_count) {
+                WaitEvent(flag_idx);
+            }
+
+            // synchronize between AIVs
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+
+            CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1);
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+            // the last SIO_TOTAL_CORE_NUM cores handle the SIO copies
+            if (aiv_idx == 0 && core_idx >= core_count &&
+                core_idx < core_count + SIO_TOTAL_CORE_NUM && cal_idx < cal_count) { // MoveSio
+                DataCopySio(cal_idx, tile_per_rank_sio * m0 * n0);
+            }
+
+            StartBeforeFirstStep(true);
+            int32_t m_per_core = tile_per_rank_hccs * m0 * n0 / comm_data_split;
+            int32_t data_split_num = DivCeil(m_per_core, len_per_loop);
+
+            for (int32_t loop_idx = 0; loop_idx < data_split_num; loop_idx++) {
+                if (aiv_idx == 0 && core_idx < comm_npu_split * comm_data_split && cal_idx >= 1) { // HCCS copies start from the second round
+                    FirstStepInOutWithSplit(tile_per_rank_hccs * m0 * n0,
+                        flag_idx_hccs * gm_c_pingpong_size + rank * size_per_rank,
+                        hccs_idx * loop_num_per_comm, loop_idx, true);
+                }
+            }
+            EndFirstStep(true);
+
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+            CrossRankSyncV2(FLAG_ONE_IDX, cal_idx + 1);
+            // synchronize between AIVs
+            SetAndWaitAivSync(flag_idx, BLOCK_COUNT_3);
+
+            // notify the AIC
+            if (cal_idx >= 1) {
+                SetAicSync(flag_idx_hccs);
+            }
+        }
+
+        ResetIpcFlags(2);
+
+        if (aiv_idx == 1 && core_idx < rank_size) {
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0);
+        }
+        PipeBarrier<PIPE_ALL>();
+
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.Run();
+        }
+    }
+
+    FORCE_INLINE_AICORE void Run()
+    {
+        if (is_91093) {
+            RunWithSio();
+        } else {
+            RunLegacy();
+        }
+    }
+
+public:
+    using CocCommBase<T>::SetAicSync;
+    using CocCommBase<T>::SetAndWaitAivSync;
+    using CocCommBase<T>::SetBuffFlag;
+    using CocCommBase<T>::SetBuffFlagByAdd;
+    using CocCommBase<T>::CheckBuffFlag;
+    using CocCommBase<T>::FillZero;
+    using CocCommBase<T>::FirstStepInPeerMem;
+    using CocCommBase<T>::ResetIpcFlags;
+    using CocCommBase<T>::CrossRankSyncV1;
+    using CocCommBase<T>::CrossRankSyncV2;
+    using CocCommBase<T>::buff;
+    using CocCommBase<T>::gm_out;
+    using CocCommBase<T>::ctrl_flags_UB;
+    using CocCommBase<T>::output_UB_T;
+    using CocCommBase<T>::batch_size;
+    using CocCommBase<T>::m;
+    using CocCommBase<T>::k;
+    using CocCommBase<T>::n;
+    using CocCommBase<T>::m0;
+    using CocCommBase<T>::k0;
+    using CocCommBase<T>::n0;
+    using CocCommBase<T>::m_loop;
+    using CocCommBase<T>::n_loop;
+    using CocCommBase<T>::k_loop;
+    using CocCommBase<T>::core_loop;
+    using CocCommBase<T>::core_idx;
+    using CocCommBase<T>::rank;
+    using CocCommBase<T>::rank_size;
+    using CocCommBase<T>::tiling_key;
+    using CocCommBase<T>::swizzl_count;
+    using CocCommBase<T>::swizzl_direct;
+    using CocCommBase<T>::trans_a;
+    using CocCommBase<T>::trans_b;
+    using CocCommBase<T>::is_int8;
+    using CocCommBase<T>::is_91093;
+    using CocCommBase<T>::p_value;
+    using CocCommBase<T>::aiv_idx;
+    using CocCommBase<T>::other_rank;
+    using CocCommBase<T>::comm_npu_split;
+    using CocCommBase<T>::comm_data_split;
+    using CocCommBase<T>::comm_direct;
+    using CocCommBase<T>::len_per_loop;
+    using CocCommBase<T>::core_count;
+    using CocCommBase<T>::max_ub_single_dma_size;
+    using CocCommBase<T>::max_ub_ping_pong_size;
+    using CocCommBase<T>::loop_num_per_comm;
+    using CocCommBase<T>::gm_c_pingpong_size;
+    using CocCommBase<T>::dequant_granularity;
+    using CocCommBase<T>::dequant_group_size;
+    using CocCommBase<T>::quant_granularity;
+    using CocCommBase<T>::quant_group_size;
+    using CocCommBase<T>::workspace_info;
+    using CocCommBase<T>::local_expert_nums;
+    using CocCommBase<T>::is_moe;
+    using CocCommBase<T>::is_moe_averaged;
+    using CocCommBase<T>::is_alltoallvc;
+    using CocCommBase<T>::is_deterministic;
+    using CocCommBase<T>::weight_nz;
+    using CocCommBase<T>::EP;
+    using CocCommBase<T>::TP;
+    using CocCommBase<T>::flag_offset;
+
+    int32_t cal_count;
+    int32_t m_per_rank;
+    Preprocessor preprocessor;
+    MatmulReduceScatterBiasAdder add_bias_runner;
+    FusedPerTokenDequantRunner fused_pertoken_dequant_runner;
+    FusedDequantRunner fused_dequant_runner;
+    bool need_dequant;
+};
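The class above relies on `if constexpr (HAVE_BIAS)` so that the no-bias instantiation carries no bias-add code at all. A tiny self-contained illustration of that compile-time dispatch pattern (not the kernel template itself):

#include <cstdio>

template <bool HAVE_BIAS>
void RunTile() {
    printf("matmul tile\n");
    if constexpr (HAVE_BIAS) {
        printf("add bias\n"); // this branch only exists in the HAVE_BIAS=true instantiation
    }
}

int main() {
    RunTile<false>();
    RunTile<true>();
    return 0;
}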
+
+constexpr int32_t NO_BIAS_MASK2 = 0b000000 | 0b100000 | 0b010000 | 0b110000 |
+                                  0b001000 | 0b101000 | 0b011000 | 0b111000;
+constexpr int32_t BIAS_MASK2 = 0b000010 | 0b100010 | 0b010010 | 0b110010 |
+                               0b001010 | 0b101010 | 0b011010 | 0b111010;
+
+template <typename T>
+FORCE_INLINE_AICORE void RunReduceScatterAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) {
+    // 16-aligned case
+    ReduceScatter<T, true, false> reduce_scatter_align_16_without_bias;
+    ReduceScatter<T, true, true> reduce_scatter_align_16_with_bias;
+    switch (tiling_key) {
+        case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 :
+        case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 :
+        case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 :
+        case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 :
+            reduce_scatter_align_16_without_bias.SetArgs(COC_ARGS_CALL());
+            reduce_scatter_align_16_without_bias.Run();
+            break;
+        case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 :
+        case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 :
+        case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 :
+        case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 :
+            reduce_scatter_align_16_with_bias.SetArgs(COC_ARGS_CALL());
+            reduce_scatter_align_16_with_bias.Run();
+            break;
+        default :
+            break;
+    }
+}
+
+template <typename T>
+FORCE_INLINE_AICORE void RunReduceScatterUnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) {
+    // 16-unaligned case
+    ReduceScatter<T, false, false> reduce_scatter_unalign_16_without_bias;
+    ReduceScatter<T, false, true> reduce_scatter_unalign_16_with_bias;
+    switch (tiling_key) {
+        case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 :
+        case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 :
+        case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 :
+        case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 :
+            reduce_scatter_unalign_16_without_bias.SetArgs(COC_ARGS_CALL());
+            reduce_scatter_unalign_16_without_bias.Run();
+            break;
+        case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 :
+        case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 :
+        case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 :
+        case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 :
+            reduce_scatter_unalign_16_with_bias.SetArgs(COC_ARGS_CALL());
+            reduce_scatter_unalign_16_with_bias.Run();
+            break;
+        default :
+            break;
+    }
+}
+
+template <typename T>
+FORCE_INLINE_AICORE void CocMatmulReduceScatterAiv(COC_ARGS_FUN(T)) {
+    SetAtomicNone();
+    SetMaskNormImpl();
+    SetSyncBaseAddr((uint64_t)ffts_addr);
+    SetVectorMask((uint64_t)-1, (uint64_t)-1);
+
+    auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm);
+    auto cocTilingData = &para->cocTilingData;
+    int32_t n = cocTilingData->n;
+    int32_t tiling_key = cocTilingData->tilingKey;
+    if (n % BLOCK_SIZE_16 == 0) {
+        RunReduceScatterAlign16(tiling_key, COC_ARGS_CALL());
+    } else {
+        RunReduceScatterUnAlign16(tiling_key, COC_ARGS_CALL());
+    }
+    PipeBarrier<PIPE_ALL>();
+}
+
+#endif
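A quick check, outside the kernel, of the pattern behind the case lists above: every tiling_key routed to a with-bias variant has bit 0b000010 set, which is exactly what the BIAS_MASK2 / NO_BIAS_MASK2 constants encode.

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t bias_bit = 0b000010;
    const int32_t no_bias_cases[] = {0b000000, 0b100000, 0b010000, 0b110000,
                                     0b001000, 0b101000, 0b011000, 0b111000};
    const int32_t bias_cases[]    = {0b000010, 0b100010, 0b010010, 0b110010,
                                     0b001010, 0b101010, 0b011010, 0b111010};
    for (int32_t key : no_bias_cases) printf("key %#04x bias=%d\n", key, (key & bias_bit) != 0);
    for (int32_t key : bias_cases)    printf("key %#04x bias=%d\n", key, (key & bias_bit) != 0);
    return 0;
}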
diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp
index a4b3188d..77f88c5a 100644
--- a/comm/lcal/src/lcoc.cpp
+++ b/comm/lcal/src/lcoc.cpp
@@ -59,6 +59,12 @@ bool Check2DTPType(LcalType lcalType)
     return lcalType == LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER;
 }
 
+bool CheckMOEType(LcalType lcalType)
+{
+    return (lcalType >= LcalType::ALLTOALLV_ALLGATHER_MATMUL) &&
+           (lcalType <= LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN);
+}
+
 bool CheckCoCParamDesc(LcalType lcalType, const CoCParamDesc &paramDesc)
 {
     if (COC_TYPE2ELE_SIZE.find(paramDesc.dataTypeDesc) == COC_TYPE2ELE_SIZE.end()) {
@@ -86,7 +92,15 @@ bool CheckCoCParamDesc(LcalType lcalType, const CoCParamDesc &paramDesc)
         paramCheckList.emplace_back("agDim", agDim, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE);
         paramCheckList.emplace_back("rsDim", rsDim, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE);
     }
-
+    if (CheckMOEType(lcalType)) {
+        auto ep = paramDesc.moeInfo.EP;
+        auto tp = paramDesc.moeInfo.TP;
+        auto localExpertNums = paramDesc.moeInfo.local_expert_nums;
+        paramCheckList.emplace_back("ep", ep, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE);
+        paramCheckList.emplace_back("tp", tp, PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE);
+        paramCheckList.emplace_back("localExpertNums", localExpertNums,
+            PARAM_CHECK_MIN_VALUE_ONE, PARAM_CHECK_MAX_VALUE);
+    }
     return CheckParamScopeList(paramCheckList);
 }
 
@@ -134,6 +148,15 @@ CoCTilingFunc *CreateCoCTilingFunc(LcalType lcalType)
     }
     CoCTilingFunc *pTilingFunc = nullptr;
     switch (lcalType) {
+        case LcalType::ALL_GATHER_MATMUL:
+            pTilingFunc = new (std::nothrow) CoCAllGatherMatmulTilingFunc();
+            break;
+        case LcalType::ALL_GATHER_MATMUL_V2:
+            pTilingFunc = new (std::nothrow) CoCAllGatherMatmulV2TilingFunc();
+            break;
+        case LcalType::MATMUL_REDUCE_SCATTER:
+            pTilingFunc = new (std::nothrow) CoCMatmulReduceScatterTilingFunc();
+            break;
         case LcalType::MATMUL_ALL_REDUCE:
             if (isDeterministic) {
                 pTilingFunc = new (std::nothrow) CoCMatmulAllReduceDeterTilingFunc();
@@ -144,6 +167,15 @@ CoCTilingFunc *CreateCoCTilingFunc(LcalType lcalType)
         case LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER:
             pTilingFunc = new (std::nothrow) CoCAllgatherMatmulReduceScatterTilingFunc();
             break;
+        case LcalType::ALLTOALLV_ALLGATHER_MATMUL:
+            pTilingFunc = new (std::nothrow) CoCAllToAllAllGatherMatmulTilingFunc();
+            break;
+        case LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN:
+            pTilingFunc = new (std::nothrow) CoCAllToAllAllGatherMatmulHiddenTilingFunc();
+            break;
+        case LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN:
+            pTilingFunc = new (std::nothrow) CoCMatmulReduceScatterAllToAllHiddenTilingFunc();
+            break;
         default:
             pTilingFunc = new (std::nothrow) CoCTilingFunc();
     }
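CheckMOEType above assumes the three MOE operator tags sit contiguously and in order inside the LcalType enumeration. A self-contained model of that range test (stand-in enum with invented numeric values, not the real lcal_types.h):

#include <cstdio>

enum class LcalTypeModel {
    ALLTOALLV_ALLGATHER_MATMUL = 30,            // hypothetical values
    ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN = 31,
    MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN = 32
};

bool CheckMOETypeModel(LcalTypeModel t) {
    return t >= LcalTypeModel::ALLTOALLV_ALLGATHER_MATMUL &&
           t <= LcalTypeModel::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN;
}

int main() {
    printf("%d\n", CheckMOETypeModel(LcalTypeModel::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN)); // 1
    return 0;
}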
@@ -158,25 +190,34 @@ Lcoc::Lcoc(LcalComm &comm) : comm_(&comm) {}
 
 int Lcoc::SetParam(LcalType lcalType, const CoCTiling &tiling, const CoCParamDesc &paramDesc)
 {
+    // Validate the input parameters
     if (!CheckInputParam(lcalType, tiling, paramDesc)) {
         return LCAL_ERROR_PARA_CHECK_FAIL;
     }
+    // Record the LCOC initialization parameters
     SetLcocParam(lcalType, paramDesc);
+    // Create the tiling function
     CoCTilingFunc *pTilingFunc = CreateCoCTilingFunc(lcalType);
     if (pTilingFunc == nullptr) {
         PrintErrorLog(lcalType, "Create CoCTilingFunc failed!");
         return LCAL_ERROR_INTERNAL;
     }
+    // Generate the tiling parameters
     CoCTilingData tilingData = pTilingFunc->GenerateTiling(taskParam_, tiling);
+    // Verify that the generated tiling is valid
     bool tilingCheckRes = pTilingFunc->CheckTiling(taskParam_);
     if (!tilingCheckRes) {
         PrintErrorLog(lcalType, "Tiling check failed!");
+        // Release the tiling function
         delete pTilingFunc;
         pTilingFunc = nullptr;
         return LCAL_ERROR_INTERNAL;
     }
+    // Store the tiling result
     tiling_ = tilingData;
+    // Mark tiling as successful
     tilingSuccess_ = true;
+    // Release the tiling function
     delete pTilingFunc;
     pTilingFunc = nullptr;
     return LCAL_SUCCESS;
 }
@@ -226,6 +267,42 @@ bool Lcoc::CheckBasic(const CoCInputPkg &inputPkg, const CoCOutputPkg &outputPkg
     return true;
 }
 
+int Lcoc::AllGatherMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream)
+{
+    LcalType lcalType = LcalType::ALL_GATHER_MATMUL;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    ReportTiming report("LcocAllGatherMatmul", true);
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
+int Lcoc::AllGatherMatmulV2(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream)
+{
+    LcalType lcalType = LcalType::ALL_GATHER_MATMUL_V2;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    ReportTiming report("LcocAllGatherMatmulV2", true);
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
+int Lcoc::MatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream)
+{
+    LcalType lcalType = LcalType::MATMUL_REDUCE_SCATTER;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    if (taskParam_.cocParamDesc.mmInfo.m % taskParam_.rankSize != 0) {
+        if (taskParam_.rank == 0) {
+            MKI_LOG(ERROR) << "MatmulReduceScatter: m must be divisible by the world size, "
+                              "since each rank receives m / rankSize rows of the output";
+        }
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    ReportTiming report("LcocMatmulReduceScatter", true);
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
 int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream)
 {
     LcalType lcalType = LcalType::MATMUL_ALL_REDUCE;
@@ -236,6 +313,16 @@ int Lcoc::MatmulAllReduce(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *wo
     return LaunchOperator(inputPkg, outputPkg, workspace, stream);
 }
 
+int Lcoc::PureMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace, aclrtStream stream)
+{
+    LcalType lcalType = LcalType::PURE_MATMUL;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    ReportTiming report("LcocPureMatmul", true);
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
 int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace,
     aclrtStream stream)
 {
@@ -247,6 +334,35 @@ int Lcoc::AllGatherMatmulReduceScatter(CoCInputPkg inputPkg, CoCOutputPkg output
     return LaunchOperator(inputPkg, outputPkg, workspace, stream);
 }
 
+int Lcoc::AllToAllVAllGatherMatmul(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace,
+    aclrtStream stream)
+{
+    LcalType lcalType = LcalType::ALLTOALLV_ALLGATHER_MATMUL;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
+int Lcoc::MatmulReduceScatterAllToAllVHidden(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace,
+    aclrtStream stream)
+{
+    LcalType lcalType = LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
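A worked example of the MatmulReduceScatter shape constraint checked above: m must divide evenly by the rank count, because each rank keeps m / rankSize rows of the reduced result. Values here are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t m = 8192;      // assumed GEMM m dimension
    const int32_t rankSize = 8;  // assumed world size
    if (m % rankSize != 0) {
        printf("rejected: m=%ld is not divisible by rankSize=%d\n", (long)m, rankSize);
        return 1;
    }
    printf("each rank keeps %ld rows of the output\n", (long)(m / rankSize));
    return 0;
}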
+
+int Lcoc::AllToAllVAllGatherMatmulHidden(CoCInputPkg inputPkg, CoCOutputPkg outputPkg, void *workspace,
+    aclrtStream stream)
+{
+    LcalType lcalType = LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN;
+    if (!CheckBasic(inputPkg, outputPkg, lcalType)) {
+        return LCAL_ERROR_PARA_CHECK_FAIL;
+    }
+    return LaunchOperator(inputPkg, outputPkg, workspace, stream);
+}
+
 LcalComm *Lcoc::GetComm()
 {
     return comm_;
@@ -262,11 +378,9 @@ void Lcoc::GetTiling(CoCTiling &tiling)
     tiling = tiling_;
 }
 
+
 bool IsMatrixAligned(const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign)
 {
-    if (nElemAlign == 0) {
-        return false;
-    }
     return (transpose ? m : n) % nElemAlign == 0;
 }
 
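The workspace sizing below rounds matrix extents up to whole alignment granules. A sketch of that arithmetic, assuming the usual 32-byte granule so that nElemAlign = 32 / eleSize (AlignUp is re-implemented here only for the demo):

#include <cstdint>
#include <cstdio>

int32_t AlignUp(int32_t x, int32_t align) {
    return (x + align - 1) / align * align;
}

int main() {
    const int32_t eleSize = 2;               // fp16/bf16 element size in bytes
    const int32_t nElemAlign = 32 / eleSize; // 16 elements per 32-byte block
    printf("AlignUp(4000, %d) = %d\n", nElemAlign, AlignUp(4000, nElemAlign)); // already aligned
    printf("AlignUp(4001, %d) = %d\n", nElemAlign, AlignUp(4001, nElemAlign)); // rounds up to 4016
    return 0;
}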
@@ -278,6 +392,7 @@ int64_t Lcoc::GetWorkspaceSize()
     CoCDataTypeDesc dataType = cocParamDesc.dataTypeDesc;
     const MatMulInfo &mmInfo = cocParamDesc.mmInfo;
     const QuantInfo &quantInfo = cocParamDesc.quantInfo;
+    const MoeInfo &moeInfo = cocParamDesc.moeInfo;
     bool hasQuant = quantInfo.quantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED;
     bool hasDequant = quantInfo.dequantGranularity != QuantGranularity::QUANT_GRANULARITY_UNDEFINED;
     int32_t eleSize = COC_TYPE2ELE_SIZE.at(dataType);
@@ -285,35 +400,45 @@ int64_t Lcoc::GetWorkspaceSize()
     int32_t mAlign = AlignUp(mmInfo.m, nElemAlign);
     int32_t nAlign = AlignUp(mmInfo.n, nElemAlign);
     int32_t kAlign = AlignUp(mmInfo.k, nElemAlign);
+    int32_t maxOutputSize = moeInfo.maxOutputSize;
     bool hasAAlign = hasQuant || (!IsMatrixAligned(mmInfo.m, mmInfo.k, mmInfo.transA, nElemAlign) && mmInfo.m != 1);
     bool hasBAlign = (!mmInfo.weightNz) && ((hasDequant && !mmInfo.isInt8) ||
         (!IsMatrixAligned(mmInfo.k, mmInfo.n, mmInfo.transB, nElemAlign)));
-
-    int32_t accumRankSize = 0;
+
+    int32_t accumRankSize = taskParam_.lcalType == LcalType::ALL_GATHER_MATMUL ? taskParam_.rankSize : 0;
     bool hasAccum = dataType == CoCDataTypeDesc::INT8INT8_INT32_BF16;
     bool hasDequantParam = (quantInfo.dequantGranularity == QuantGranularity::PER_TOKEN ||
         quantInfo.dequantGranularity == QuantGranularity::PER_TENSOR);
     bool hasFormatDequantScale = (quantInfo.dequantGranularity == QuantGranularity::PER_CHANNEL);
+    bool isMoe = lcalType == LcalType::ALLTOALLV_ALLGATHER_MATMUL ||
+        lcalType == LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN ||
+        lcalType == LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN;
+    bool isAlltoallVc = isMoe;
     uint64_t dequantWorkSpaceSize = GetDequantWorkSpaceSize(lcalType, tiling_.withSerialMode, mmInfo.m, mmInfo.n,
-        tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim);
+        tiling_.m0, tiling_.n0, tiling_.pValue, tiling_.nLoop, taskParam_.rankSize, taskParam_.blockDim, maxOutputSize);
     LcalWorkspaceInfo lcalWorkspaceInfo = GetLcalWorkspaceInfo(0, mmInfo.batchSize, mmInfo.m, mmInfo.k, mmInfo.n,
         mAlign, kAlign, nAlign, mmInfo.transA, mmInfo.transB, eleSize, hasAAlign, hasBAlign,
-        accumRankSize, hasAccum, dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic);
-
+        accumRankSize, hasAccum, dequantWorkSpaceSize, hasDequantParam, hasFormatDequantScale, isDeterministic,
+        isMoe, isAlltoallVc, moeInfo.EP, moeInfo.local_expert_nums, maxOutputSize);
     MKI_LOG(DEBUG) << "[Lcoc Workspace]: " << "m=" << mmInfo.m << ", k=" << mmInfo.k << ", n=" << mmInfo.n
         << ", mAlign=" << mAlign << ", kAlign=" << kAlign << ", nAlign=" << nAlign
         << ", transA=" << mmInfo.transA << ", transB=" << mmInfo.transB << ", eleSize=" << eleSize
         << ", hasAAlign=" << hasAAlign << ", hasBAlign=" << hasBAlign << ", accumRankSize=" << accumRankSize
        << ", hasAccum=" << hasAccum << ", dequantWorkSpaceSize=" << dequantWorkSpaceSize << ", hasDequantParam=" << hasDequantParam
+        << ", hasFormatDequantScale=" << hasFormatDequantScale << ", isDeterministic=" << isDeterministic
-        << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize;
-
+        << ", isMoe=" << isMoe << ", isAlltoallVc=" << isAlltoallVc << ", moeInfo.EP=" << static_cast<int32_t>(moeInfo.EP)
+        << ", moeInfo.local_expert_nums=" << moeInfo.local_expert_nums
+        << ", maxOutputSize=" << maxOutputSize << ", workspaceSize=" << lcalWorkspaceInfo.workspaceSize;
     return lcalWorkspaceInfo.workspaceSize;
 }
-
 }
-
diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp
index ad7927b3..187fa934 100644
--- a/comm/lcal/src/lcoc_func.cpp
+++ b/comm/lcal/src/lcoc_func.cpp
@@ -7,12 +7,14 @@
  * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
  * See LICENSE in the root of the software repository for the full text of the License.
  */
+
 #include "lcoc_func.h"
 #include "lcoc_args.h"
 #include "mki/utils/log/log.h"
 using namespace std;
 
 namespace Lcal {
+    // Check that the value lies in [min, max]; max == -1 means the range is [min, +inf)
     bool CheckParamScope(const std::string &name, const int &value, const int &min, const int &max)
     {
         if (value < min || (max != PARAM_CHECK_MAX_VALUE && value > max)) {
@@ -45,9 +47,6 @@ namespace Lcal {
 
     bool CheckParamAlign(const std::string &name, const int &value, const int &align)
     {
-        if (align == 0) {
-            return false;
-        }
         if (value % align != 0) {
             MKI_LOG(ERROR) << "The " << name << ":" << value << " must be aligned by " << align << "!";
             return false;
@@ -76,12 +75,10 @@ namespace Lcal {
     int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n,
         const bool &transpose, int nElemAlign)
     {
-        if (nElemAlign == 0) {
-            return false;
-        }
         int64_t nRow = transpose ? n : m;
         int64_t nCol = transpose ? m : n;
         int64_t nColAlign = (nCol + nElemAlign - 1) / nElemAlign * nElemAlign;
         return batchSize * nRow * nColAlign;
     }
+
 }
\ No newline at end of file
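Usage sketch of the scope check above: with PARAM_CHECK_MAX_VALUE == -1 the upper bound is disabled, so the accepted range degenerates to [min, +inf). Self-contained demo with the function body re-declared locally:

#include <cstdio>
#include <string>

constexpr int PARAM_CHECK_MAX_VALUE = -1;

bool CheckParamScope(const std::string &name, int value, int min, int max) {
    if (value < min || (max != PARAM_CHECK_MAX_VALUE && value > max)) {
        printf("The %s:%d is out of range!\n", name.c_str(), value);
        return false;
    }
    return true;
}

int main() {
    CheckParamScope("ep", 4, 1, PARAM_CHECK_MAX_VALUE); // unbounded above: accepted
    CheckParamScope("tp", 0, 1, PARAM_CHECK_MAX_VALUE); // below min: rejected
    return 0;
}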
diff --git a/comm/lcal/src/tiling/allgather_tiling.cpp b/comm/lcal/src/tiling/allgather_tiling.cpp
new file mode 100644
index 00000000..d30ce0da
--- /dev/null
+++ b/comm/lcal/src/tiling/allgather_tiling.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include
+#include "tiling.h"
+#include "tiling_910B.h"
+#include "tiling_91093.h"
+#include "tiling_func.h"
+#include "lcoc_func.h"
+
+namespace Lcal {
+void CoCAllGatherMatmulTilingFunc::GetDefaultTiling(const TaskParam &taskParam)
+{
+    CoCTilingFunc::GetDefaultTiling(taskParam);
+    if (Is91093(taskParam.chipName)) {
+        if (cocTilingData.rankSize == RANKSIZE_EIGHT) {
+            AllGatherNPU91093EightRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_SIXTEEN) {
+            AllGatherNPU91093SixteenRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_TWO &&
+                   taskParam.cocParamDesc.mmInfo.isInt8) {
+            AllGatherNPU91093TwoRankINT8Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_TWO) {
+            AllGatherNPU91093TwoRankFP16Tiling(cocTilingData);
+            return;
+        }
+    } else if (Is910B(taskParam.chipName)) {
+        if (cocTilingData.rankSize == RANKSIZE_EIGHT) {
+            AllGatherEightRankFP16GetDefaultTiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_FOUR) {
+            AllGatherFourRankINT8Tiling(cocTilingData); // INT8
+            return;
+        }
+    }
+    AllGatherGetDefaultTiling(cocTilingData);
+}
+
+void CoCAllGatherMatmulV2TilingFunc::GetDefaultTiling(const TaskParam &taskParam)
+{
+    CoCTilingFunc::GetDefaultTiling(taskParam);
+    auto coreNum = cocTilingData.blockDim;
+    if (Is91093(taskParam.chipName)) {
+        if (cocTilingData.rankSize == RANKSIZE_EIGHT) {
+            AllGatherV2NPU91093EightRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_SIXTEEN) {
+            AllGatherV2NPU91093SixteenRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_TWO) {
+            AllGatherV2NPU91093TwoRankFP16Tiling(cocTilingData);
+            return;
+        }
+    }
+    if (coreNum >= ALLGATHERV2_CORENUM_SIXTEEN + cocTilingData.rankSize) {
+        AllGatherV2EightRankFP16Core16GetDefaultTiling(cocTilingData);
+        return;
+    }
+    AllGatherV2EightRankFP16GetDefaultTiling(cocTilingData);
+}
+
+bool CheckKValue(const TaskParam &taskParam, const CoCTilingData &data)
+{
+    auto blockCount = data.is91093 ? BLOCK_COUNT_3 : MAX_BLOCK_COUNT;
+    int32_t maxPeerMemPerRank = (taskParam.bufferSize * 1024 * 1024) / INPUT_DTYPE / data.rankSize / blockCount;
+    if (data.pValue * data.m0 * data.k0 * data.kLoop >= maxPeerMemPerRank) {
+        std::string str = "The k value is too large and is currently not supported. "
+            "pValue: " + std::to_string(data.pValue) + ", m0: " + std::to_string(data.m0) +
+            ", k0: " + std::to_string(data.k0) + ", kLoop: " + std::to_string(data.kLoop) +
+            ", maxPeerMemPerRank: " + std::to_string(maxPeerMemPerRank);
+        PrintErrorLog(taskParam.lcalType, str);
+        return false;
+    }
+    return true;
+}
+
+bool CoCAllGatherMatmulTilingFunc::CheckTiling(const TaskParam &taskParam)
+{
+    if (!CoCTilingFunc::CheckTiling(taskParam)) {
+        return false;
+    }
+    if (!CheckKValue(taskParam, cocTilingData)) {
+        return false;
+    }
+
+    auto rankSize = cocTilingData.rankSize;
+    auto commNpuSplit = cocTilingData.commNpuSplit;
+    auto commDataSplit = cocTilingData.commDataSplit;
+    auto coreNum = cocTilingData.blockDim;
+    auto is91093 = cocTilingData.is91093;
+    auto minCoreCount = is91093 ?
rankSize / A3_DIE_NUM : rankSize; + int32_t useCoreCount = commNpuSplit * commDataSplit; + + std::vector> paramCheckList = { + {"commNpuSplit * commDataSplit", useCoreCount, minCoreCount, coreNum} + }; + return CheckParamScopeList(paramCheckList); +} + +bool CoCAllGatherMatmulV2TilingFunc::CheckTiling(const TaskParam &taskParam) +{ + if (!CoCTilingFunc::CheckTiling(taskParam)) { + return false; + } + if (!CheckKValue(taskParam, cocTilingData)) { + return false; + } + + auto commNpuSplit = cocTilingData.commNpuSplit; + auto commDataSplit = cocTilingData.commDataSplit; + auto coreNum = cocTilingData.blockDim; + int32_t useCoreCount = commNpuSplit * commDataSplit; + + std::vector> paramCheckList = { + {"commNpuSplit * commDataSplit", useCoreCount, PARAM_CHECK_MIN_VALUE_ONE, coreNum-1} + }; + return CheckParamScopeList(paramCheckList); +} +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgather_tiling_91093.cpp b/comm/lcal/src/tiling/allgather_tiling_91093.cpp new file mode 100644 index 00000000..7d36783c --- /dev/null +++ b/comm/lcal/src/tiling/allgather_tiling_91093.cpp @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "tiling_91093.h" +#include "tiling_func.h" +namespace Lcal { + constexpr int32_t ALLGATHER_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; + constexpr int32_t ALLGATHER_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT = 12; + constexpr int32_t ALLGATHER_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 8; + constexpr int32_t ALLGATHER_91093_EIGHT_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLGATHER_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT = 30; + constexpr int32_t ALLGATHER_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; + constexpr int32_t ALLGATHER_91093_SIXTEEN_RANK_FP16_COMMDIRECT_DEFAULT = 1; + constexpr int32_t ALLGATHER_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT = 10; + constexpr int32_t ALLGATHER_91093_SIXTEEN_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLGATHER_91093_TWO_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t ALLGATHER_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 20; + constexpr int32_t ALLGATHER_91093_TWO_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t ALLGATHER_91093_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; + constexpr int32_t ALLGATHER_91093_TWO_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t ALLGATHER_91093_TWO_RANK_INT8_PVALUE_DEFAULT = 14; + constexpr int32_t ALLGATHER_91093_TWO_RANK_INT8_UBMOVENUM_DEFAULT = 40; + constexpr int32_t ALLGATHER_91093_TWO_RANK_INT8_COMMDATASPLIT_DEFAULT = 16; + + static std::map>> g_allgather91093EightRankFP16M0Map = { + {128, + {{-1, 1262, -1, 2147483647, -1, 1720}, {-1, 1262, -1, 2147483647, 1824, 3248}, + {1262, 2147483647, -1, 2147483647, -1, 3248}, {-1, 2274, -1, 6700, 3248, 5660}, + {-1, 2274, 6700, 2147483647, 3248, 6172}, {2274, 2147483647, -1, 2147483647, 3248, 5360}, + {-1, 2147483647, -1, 2147483647, 6934, 8446}, {-1, 2147483647, 9950, 2147483647, 8446, 8958}, + {-1, 2147483647, -1, 2147483647, 8958, 2147483647}}}, 
+ {256, + {{-1, 1262, -1, 2147483647, 1720, 1824}, {-1, 2274, -1, 6700, 5660, 6934}, + {-1, 2274, 6700, 2147483647, 6172, 6934}, {2274, 2147483647, -1, 2147483647, 5360, 6934}, + {-1, 2147483647, -1, 9950, 8446, 8958}}} + }; + + static std::map>> g_allgather91093EightRankFP16UbmovenumMap = { + {8.0, + {{-1, 1768, -1, 2147483647, -1, 1624}, {-1, 1768, 1774, 2147483647, 1624, 2274}, + {1768, 2147483647, -1, 4810, -1, 1200}, {1768, 2147483647, 4810, 2147483647, -1, 2274}}}, + {4.0, + {{-1, 1768, -1, 1774, 1624, 2274}}}, + {2.0, + {{1768, 2147483647, -1, 4810, 1200, 2274}, {-1, 768, 768, 2147483647, 2274, 4608}, + {-1, 768, -1, 2147483647, 4608, 2147483647}, {768, 2147483647, -1, 2147483647, 2274, 2147483647}}}, + {3.0, + {{-1, 768, -1, 768, 2274, 4608}}} + }; + + static std::map>> g_allgather91093EightRankFP16PvalueMap = { + {2, + {{-1, 2786, -1, 4608, -1, 1200}, {-1, 2786, -1, 8192, 1518, 1624}, + {768, 1262, -1, 2147483647, 1624, 1720}, {1262, 2786, 768, 2147483647, 1624, 2274}, + {2786, 2147483647, 4810, 9728, -1, 768}, {7168, 2147483647, 9728, 2147483647, -1, 768}, + {2786, 2147483647, 10720, 2147483647, 768, 1774}, {2786, 2147483647, 4810, 2147483647, 1774, 2274}, + {768, 1262, -1, 2147483647, 2274, 2912}, {1262, 2147483647, -1, 2147483647, 2274, 3248}, + {1262, 2147483647, -1, 2147483647, 3840, 4298}, {-1, 2147483647, -1, 2274, 4298, 5660}, + {1774, 2147483647, -1, 2274, 6684, 8958}}}, + {1, + {{-1, 2786, 4608, 2147483647, -1, 1200}, {-1, 2786, -1, 2147483647, 1200, 1518}, + {-1, 2786, 8192, 2147483647, 1518, 1624}, {-1, 768, -1, 2147483647, 1624, 2274}, + {768, 1262, -1, 2147483647, 1720, 2274}, {2786, 7168, 9728, 2147483647, -1, 768}, + {-1, 768, -1, 2147483647, 2274, 4298}, {768, 1262, -1, 2147483647, 2912, 4298}, + {1262, 2147483647, -1, 2147483647, 3248, 3840}, {-1, 2147483647, -1, 2274, 5660, 6684}, + {-1, 1774, -1, 2274, 6684, 8958}, {-1, 2147483647, 2274, 9728, 4298, 8958}, + {-1, 2147483647, 9728, 2147483647, 4298, 6934}, {-1, 768, -1, 2147483647, 8958, 11744}, + {768, 1262, -1, 8704, 8958, 11744}, {1262, 2147483647, -1, 8446, 10720, 11744}}}, + {4, + {{1262, 2786, -1, 768, 1624, 2274}, {2786, 2147483647, 1262, 4810, 768, 1774}, + {2786, 4608, -1, 4810, 1774, 2274}, {2786, 2147483647, 4810, 10720, 768, 1774}, + {-1, 2147483647, 9728, 2147483647, 6934, 8958}, {768, 1262, 8704, 2147483647, 8958, 11744}, + {1262, 2147483647, 8446, 2147483647, 8958, 11744}, {-1, 2000, -1, 2147483647, 11744, 2147483647}, + {2000, 2147483647, 8200, 2147483647, 11744, 2147483647}}}, + {12, + {{2786, 2147483647, -1, 1262, -1, 768}}}, + {6, + {{2786, 2147483647, 1262, 4810, -1, 768}, {4608, 2147483647, 768, 4810, 1774, 2274}, + {1262, 2147483647, -1, 8446, 8958, 10720}, {2000, 2147483647, -1, 8200, 11744, 2147483647}}}, + {10, + {{2786, 2147483647, -1, 1262, 768, 1774}, {4608, 2147483647, -1, 768, 1774, 2274}}} + }; + + static std::map>> g_allgather91093EightRankFP16CommdatasplitMap = { + {8, + {{-1, 1768, -1, 2147483647, -1, 832}, {1262, 1768, -1, 768, 832, 1624}, + {-1, 1768, 768, 2147483647, 832, 1624}, {-1, 1768, 1774, 4608, 1880, 2274}, + {-1, 1768, 4608, 2147483647, 1624, 2274}, {1768, 8958, -1, 4810, -1, 1200}, + {8958, 2147483647, -1, 4810, -1, 1536}, {1768, 2147483647, 4810, 2147483647, -1, 2274}, + {8100, 8600, 9728, 2147483647, 4636, 2147483647}}}, + {16, + {{-1, 1262, -1, 768, 832, 1624}, {-1, 1768, -1, 1774, 1624, 2274}, + {-1, 1768, 1774, 4608, 1624, 1880}, {1768, 8958, -1, 4810, 1200, 2274}, + {8958, 2147483647, -1, 4810, 1536, 2274}, {-1, 8100, -1, 2147483647, 2274, 2147483647}, + {8100, 
8600, -1, 9728, 2274, 2147483647}, {8100, 8600, 9728, 2147483647, 2274, 4636}, + {8600, 2147483647, -1, 2147483647, 2274, 2147483647}}} + }; + + static std::map>> g_allgather91093SixteenRankFP16M0Map = { + {128, + {{-1, 2274, -1, 2147483647, -1, 5552}, {2274, 2786, 8200, 2147483647, -1, 4000}, + {2274, 2786, 5100, 2147483647, 4000, 5552}, {2786, 2147483647, -1, 5360, -1, 5552}, + {2786, 2147483647, 5900, 2147483647, -1, 5552}, {-1, 2147483647, 5360, 2147483647, 5552, 6172}, + {-1, 8958, 5360, 2147483647, 6172, 6934}, {-1, 2147483647, -1, 2147483647, 6934, 2147483647}}}, + {256, + {{2274, 2786, -1, 8200, -1, 4000}, {2274, 2786, -1, 5100, 4000, 5552}, + {2786, 2147483647, 5360, 5900, -1, 5552}, {-1, 2147483647, -1, 5360, 5552, 6934}, + {8958, 2147483647, 5360, 2147483647, 6172, 6934}}} + }; + + static std::map>> g_allgather91093SixteenRankFP16PvalueMap = { + {10, + {{-1, 3798, -1, 1774, -1, 576}, {3798, 9728, -1, 1262, -1, 2274}, + {3798, 2147483647, 1262, 2274, -1, 768}}}, + {6, + {{-1, 3798, 1774, 4608, -1, 576}, {9728, 2147483647, -1, 1262, -1, 2274}}}, + {1, + {{-1, 3798, 4608, 7696, -1, 576}, {-1, 3798, -1, 2147483647, 576, 832}, + {-1, 2560, -1, 2147483647, 832, 1200}, {-1, 2786, 1774, 2147483647, 1200, 2274}, + {2786, 3798, 4298, 2147483647, 1200, 2274}, {-1, 3798, -1, 2147483647, 2274, 3248}, + {3798, 5900, -1, 2147483647, 2274, 2786}, {3798, 5900, 4608, 2147483647, 2786, 3248}, + {5900, 2147483647, 5360, 2147483647, 2274, 3248}, {-1, 2560, -1, 768, 3248, 8704}, + {-1, 7850, -1, 768, 8704, 2147483647}, {-1, 2147483647, 768, 2147483647, 3248, 11744}, + {-1, 1262, 768, 2147483647, 11744, 2147483647}, {2000, 2147483647, 6150, 2147483647, 11744, 2147483647}}}, + {2, + {{-1, 3798, 7696, 2147483647, -1, 576}, {2560, 3798, -1, 2147483647, 832, 1200}, + {-1, 2286, -1, 768, 1200, 2274}, {-1, 3798, 768, 1774, 1200, 2274}, + {2786, 3798, 1774, 4298, 1200, 2274}, {3798, 6700, 4810, 5360, -1, 2274}, + {3798, 8100, 5360, 2147483647, -1, 2274}, {8100, 2147483647, 4810, 6450, 768, 2274}, + {8100, 2147483647, 6450, 2147483647, -1, 2274}, {3798, 5900, -1, 4608, 2786, 3248}, + {5900, 2147483647, 2274, 5360, 2274, 3248}, {2560, 2147483647, -1, 768, 3248, 8704}, + {7850, 2147483647, -1, 768, 8704, 2147483647}, {1262, 2000, 768, 2147483647, 11744, 2147483647}}}, + {4, + {{2286, 3798, -1, 768, 1200, 2274}, {3798, 2147483647, 1262, 2274, 768, 2274}, + {3798, 2147483647, 2274, 4810, -1, 2274}, {6700, 8100, 4810, 5360, -1, 2274}, + {8100, 2147483647, 4810, 6450, -1, 768}, {5900, 2147483647, -1, 2274, 2274, 3248}, + {2000, 2147483647, 768, 6150, 11744, 2147483647}}} + }; + + static std::map>> g_allgather91093SixteenRankFP16CommdirectMap = { + {0, + {{-1, 2147483647, -1, 2147483647, -1, 1200}, {768, 8958, -1, 2147483647, 1200, 1438}, + {-1, 8958, -1, 2147483647, 1438, 2147483647}, {8958, 2147483647, -1, 2147483647, 1200, 2147483647}}}, + {1, + {{-1, 768, -1, 2147483647, 1200, 1438}}} + }; + + static std::map>> g_allgather91093SixteenRankFP16CommdatasplitMap = { + {16, + {{-1, 1262, -1, 2147483647, -1, 1624}, {-1, 1262, -1, 2147483647, 1720, 2626}, + {1262, 2147483647, -1, 2147483647, -1, 2626}, {-1, 768, -1, 3798, 2626, 2147483647}, + {2274, 2147483647, -1, 3798, 2626, 3798}}}, + {1, + {{-1, 1262, -1, 2147483647, 1624, 1720}, {768, 2274, -1, 3798, 2626, 2147483647}, + {2274, 2147483647, -1, 3798, 3798, 2147483647}, {-1, 2147483647, 3798, 2147483647, 2626, 2147483647}}} + }; + + static std::map>> g_allgather91093SixteenRankFP16UbmovenumMap = { + {20.0, + {{-1, 3286, -1, 2147483647, -1, 832}, {-1, 3286, -1, 1262, 
832, 2274}, + {-1, 3286, 1774, 2147483647, 832, 2274}, {-1, 3286, -1, 2147483647, 2274, 3248}, + {3286, 3798, -1, 2147483647, -1, 2000}, {3286, 3798, 6150, 2147483647, 2000, 3248}, + {3798, 2147483647, -1, 5360, -1, 2274}, {3798, 2147483647, 5360, 5900, -1, 2000}, + {3798, 2147483647, 5900, 2147483647, -1, 3248}, {-1, 1262, -1, 2147483647, 3542, 4298}, + {-1, 1518, 768, 2147483647, 4298, 6172}, {-1, 768, -1, 2147483647, 6172, 8704}, + {-1, 768, -1, 6656, 8704, 2147483647}}}, + {30.0, + {{-1, 3286, 1262, 1774, 832, 2274}}}, + {10.0, + {{3286, 3798, -1, 6150, 2000, 3248}, {3798, 2147483647, -1, 5360, 2274, 3248}, + {3798, 2147483647, 5360, 5900, 2000, 3248}, {-1, 3000, -1, 7200, 3248, 3542}, + {-1, 2147483647, 7200, 2147483647, 3248, 3542}, {1262, 2024, -1, 2147483647, 3542, 4298}, + {-1, 1006, -1, 768, 4298, 6172}, {-1, 768, 6656, 2147483647, 8704, 2147483647}, + {768, 1774, -1, 11264, 6172, 6684}}}, + {8.0, + {{3000, 2147483647, -1, 7200, 3248, 3542}, {2024, 2147483647, -1, 2147483647, 3542, 4298}, + {1006, 3584, -1, 768, 4298, 6172}, {768, 1774, -1, 11264, 6684, 2147483647}}}, + {6.0, + {{3584, 2147483647, -1, 768, 4298, 6172}, {1518, 2147483647, 768, 2147483647, 4298, 6172}, + {768, 1774, 11264, 2147483647, 6172, 2147483647}}}, + {4.0, + {{1774, 2274, -1, 2560, 6172, 2147483647}, {2274, 3286, -1, 2147483647, 6172, 8958}, + {3286, 2147483647, -1, 2147483647, 6172, 8446}}}, + {3.0, + {{1774, 2274, 2560, 2147483647, 6172, 2147483647}, {2274, 3286, -1, 2147483647, 8958, 2147483647}, + {3286, 2147483647, -1, 2147483647, 8446, 2147483647}}} + }; + + static std::map>> g_allgather91093TwoRankFP16CommdatasplitMap = { + {8, + {{-1, 1536, -1, 3584, -1, 1536}, {1536, 2560, -1, 8704, -1, 1536}, + {1536, 9728, 8704, 9728, -1, 1536}, {3584, 9728, 9728, 2147483647, -1, 1536}, + {9728, 2147483647, 768, 2560, -1, 1536}, {9728, 2147483647, 5120, 2147483647, -1, 1536}}}, + {16, + {{-1, 1536, 3584, 8704, -1, 1536}, {2560, 9728, -1, 8704, -1, 1536}, + {-1, 1536, 8704, 9728, -1, 1536}, {-1, 3584, 9728, 2147483647, -1, 1536}, + {9728, 2147483647, -1, 768, -1, 1536}, {9728, 2147483647, 2560, 5120, -1, 1536}, + {-1, 2147483647, -1, 2147483647, 1536, 2147483647}}} + }; + + static std::map>> g_allgather91093TwoRankFP16M0Map = { + {128, + {{-1, 4608, -1, 1280, -1, 1536}, {-1, 2560, 1280, 2147483647, -1, 1536}, + {2560, 4608, 5632, 2147483647, -1, 1536}, {4608, 5632, 7680, 2147483647, -1, 1536}, + {9728, 2147483647, 8192, 2147483647, -1, 1536}, {-1, 1536, -1, 3584, 1536, 2147483647}, + {1536, 2147483647, -1, 4608, 1536, 2147483647}, {-1, 2147483647, 4608, 2147483647, 1536, 7680}, + {3584, 2147483647, 4608, 2147483647, 7680, 2147483647}}}, + {256, + {{2560, 4608, 1280, 5632, -1, 1536}, {4608, 5632, -1, 7680, -1, 1536}, + {5632, 9728, -1, 2147483647, -1, 1536}, {9728, 2147483647, -1, 8192, -1, 1536}, + {-1, 1536, 3584, 4608, 1536, 2147483647}, {-1, 3584, 4608, 2147483647, 7680, 2147483647}}} + }; + + static std::map>> g_allgather91093TwoRankFP16UbmovenumMap = { + {10.0, + {{-1, 4608, -1, 1792, -1, 1536}}}, + {20.0, + {{-1, 4608, 1792, 2560, -1, 1536}}}, + {6.0, + {{-1, 4608, 2560, 2147483647, -1, 1536}, {4608, 2147483647, -1, 8704, -1, 1536}, + {5632, 2147483647, -1, 8704, 1536, 2560}, {4608, 2147483647, 8704, 2147483647, 1536, 3584}}}, + {4.0, + {{-1, 3584, -1, 6656, 1536, 2560}, {1536, 4608, 8704, 2147483647, 2560, 3584}, + {4608, 5632, -1, 8704, 1536, 2560}, {4608, 7680, 5632, 8704, 2560, 4608}, + {7680, 2147483647, -1, 8704, 2560, 4608}, {4608, 2147483647, 8704, 9728, 3584, 4608}}}, + {3.0, + {{3584, 4608, -1, 
6656, 1536, 2560}, {-1, 4608, 6656, 2147483647, 1536, 2560}, + {-1, 1536, -1, 4608, 2560, 3584}, {-1, 1536, -1, 1536, 3584, 4608}, + {1536, 4608, 5632, 2147483647, 3584, 4608}, {4608, 7680, -1, 5632, 2560, 4608}, + {4608, 2147483647, 9728, 2147483647, 3584, 4608}, {-1, 1536, 9728, 2147483647, 19456, 2147483647}, + {1536, 3584, 6656, 2147483647, 4608, 5632}, {5632, 2147483647, 3584, 2147483647, 4608, 5632}}}, + {2.0, + {{-1, 1536, 4608, 2147483647, 2560, 3584}, {1536, 4608, -1, 8704, 2560, 3584}, + {-1, 1536, 1536, 2147483647, 3584, 4608}, {1536, 4608, -1, 5632, 3584, 4608}, + {-1, 1536, -1, 2147483647, 4608, 15360}, {-1, 1536, -1, 9728, 15360, 2147483647}, + {-1, 1536, 9728, 2147483647, 15360, 19456}, {1536, 3584, -1, 6656, 4608, 5632}, + {1536, 3584, -1, 2147483647, 5632, 2147483647}, {3584, 5632, -1, 2147483647, 4608, 2147483647}, + {5632, 2147483647, -1, 3584, 4608, 5632}, {5632, 2147483647, -1, 2147483647, 5632, 2147483647}}}, + {16.0, + {{4608, 2147483647, 8704, 2147483647, -1, 1536}}} + }; + + static std::map>> g_allgather91093TwoRankFP16PvalueMap = { + {10, + {{-1, 2560, -1, 5632, -1, 1536}, {1536, 2560, -1, 2147483647, 1536, 2560}, + {-1, 2560, -1, 7680, 2560, 3584}, {3584, 7680, -1, 3584, -1, 1536}, + {1536, 2560, -1, 2147483647, 11264, 13312}, {2560, 3584, 9728, 2147483647, 9728, 11264}, + {2560, 3584, 8704, 2147483647, 17408, 2147483647}}}, + {4, + {{-1, 2560, 5632, 2147483647, -1, 1536}, {1536, 2560, 8704, 2147483647, 3584, 4608}}}, + {6, + {{-1, 1536, -1, 2147483647, 1536, 2560}, {-1, 1536, -1, 2147483647, 3584, 4608}, + {-1, 1536, 6656, 2147483647, 4608, 9728}, {1536, 2560, -1, 2147483647, 9728, 11264}, + {2560, 3584, 1792, 8704, 9728, 15360}}}, + {12, + {{-1, 2560, 7680, 2147483647, 2560, 3584}, {2560, 3584, -1, 3584, -1, 1536}, + {2560, 2147483647, 3584, 5632, 4608, 9728}, {3584, 2147483647, 6656, 2147483647, -1, 9728}, + {-1, 1536, 6656, 2147483647, 9728, 11264}, {2560, 3584, -1, 2560, 15360, 2147483647}, + {2560, 3584, 8704, 9728, 9728, 11264}, {3584, 8704, 3584, 2147483647, 9728, 11264}, + {3584, 7680, -1, 2147483647, 11264, 13312}, {3584, 4608, -1, 2147483647, 13312, 2147483647}, + {4608, 8704, -1, 8704, 13312, 2147483647}, {8704, 9728, -1, 1792, 9728, 2147483647}, + {8704, 9728, 2560, 2147483647, 9728, 2147483647}, {9728, 2147483647, 1280, 2147483647, 9728, 2147483647}}}, + {14, + {{1536, 2560, -1, 8704, 3584, 4608}, {-1, 1536, -1, 6656, 4608, 9728}, + {1536, 2560, -1, 5632, 4608, 9728}, {7680, 2147483647, -1, 3584, -1, 1536}, + {2560, 2147483647, -1, 3584, 1536, 9728}, {2560, 2147483647, 3584, 5632, -1, 4608}, + {2560, 3584, 5632, 8704, -1, 9728}, {-1, 1536, -1, 6656, 9728, 11264}, + {-1, 1536, -1, 2147483647, 11264, 2147483647}, {1536, 2560, -1, 2147483647, 13312, 2147483647}, + {2560, 3584, -1, 1792, 9728, 15360}, {2560, 3584, 2560, 8704, 15360, 2147483647}, + {3584, 8704, -1, 3584, 9728, 11264}, {7680, 8704, -1, 2147483647, 11264, 13312}, + {4608, 8704, 8704, 2147483647, 13312, 2147483647}, {8704, 9728, 1792, 2560, 9728, 2147483647}, + {9728, 2147483647, -1, 1280, 9728, 2147483647}}}, + {3, + {{1536, 2560, 5632, 2147483647, 4608, 9728}, {2560, 3584, 8704, 2147483647, -1, 9728}}}, + {8, + {{3584, 2147483647, 5632, 6656, -1, 9728}, {2560, 3584, 8704, 2147483647, 11264, 17408}}} + }; + + static std::map>> g_allgather91093TwoRankINT8CommdatasplitMap = { + {8, + {{-1, 1536, -1, 4608, -1, 1536}, {-1, 1536, -1, 3584, 1536, 15360}, + {-1, 1536, 3584, 4608, 1536, 6656}, {1536, 3584, 1280, 1792, -1, 7680}, + {-1, 1536, -1, 1280, 15360, 17408}, {-1, 1536, 8192, 9728, 15360, 
17408}, + {-1, 1536, -1, 2048, 17408, 2147483647}, {-1, 1536, 4608, 5632, 17408, 2147483647}}}, + {16, + {{-1, 1536, 4608, 2147483647, -1, 1536}, {-1, 1536, 3584, 4608, 6656, 15360}, + {-1, 1536, 4608, 2147483647, 1536, 15360}, {1536, 2147483647, -1, 1280, -1, 15360}, + {3584, 2147483647, 1280, 1792, -1, 7680}, {1536, 2147483647, 1280, 1792, 7680, 15360}, + {1536, 2147483647, 1792, 2147483647, -1, 15360}, {-1, 1536, 1280, 8192, 15360, 17408}, + {-1, 1536, 9728, 2147483647, 15360, 17408}, {-1, 1536, 2048, 4608, 17408, 2147483647}, + {-1, 1536, 5632, 2147483647, 17408, 2147483647}, {1536, 2147483647, -1, 2147483647, 15360, 2147483647}}} + }; + + static std::map>> g_allgather91093TwoRankINT8UbmovenumMap = { + {30.0, + {{-1, 1536, -1, 4608, -1, 1536}}}, + {10.0, + {{-1, 1536, 4608, 2147483647, -1, 5632}, {6656, 2147483647, -1, 768, -1, 4608}, + {1536, 2147483647, 768, 2147483647, -1, 5632}, {-1, 8704, -1, 2147483647, 5632, 9728}, + {-1, 5632, -1, 2147483647, 9728, 11264}, {-1, 6656, 768, 2147483647, 11264, 13312}, + {9728, 2147483647, -1, 2147483647, 11264, 13312}}}, + {40.0, + {{-1, 1536, -1, 768, 1536, 5632}, {1536, 6656, -1, 768, -1, 1536}}}, + {20.0, + {{-1, 1536, 768, 4608, 1536, 5632}, {-1, 6656, -1, 768, 11264, 13312}}}, + {12.0, + {{1536, 6656, -1, 768, 1536, 5632}, {6656, 2147483647, -1, 768, 4608, 5632}, + {8704, 2147483647, -1, 2147483647, 5632, 9728}, {5632, 2147483647, -1, 2147483647, 9728, 11264}, + {-1, 6656, -1, 2147483647, 13312, 2147483647}, {6656, 9728, -1, 6656, 11264, 2147483647}}}, + {16.0, + {{6656, 9728, 6656, 2147483647, 11264, 2147483647}, {9728, 2147483647, -1, 2147483647, 13312, 2147483647}}} + }; + + static std::map>> g_allgather91093TwoRankINT8PvalueMap = { + {6, + {{-1, 1536, -1, 4608, -1, 1536}, {1536, 9728, -1, 768, 2560, 13312}, + {9728, 2147483647, -1, 1280, 6656, 13312}, {8704, 9728, 1792, 2560, 13312, 2147483647}, + {9728, 2147483647, 2560, 7680, 2560, 5632}, {9728, 2147483647, 9728, 2147483647, 4608, 11264}}}, + {4, + {{-1, 1536, 4608, 2147483647, -1, 1536}, {9728, 2147483647, 1280, 2560, 6656, 13312}, + {-1, 8704, 1792, 2560, 13312, 2147483647}}}, + {10, + {{1536, 2560, -1, 2147483647, -1, 1536}, {-1, 2560, -1, 1280, 1536, 2560}, + {6656, 7680, 3072, 2147483647, 1536, 2560}, {-1, 6656, 768, 2560, 2560, 13312}, + {-1, 9728, 2560, 2147483647, 15360, 17408}}}, + {14, + {{2560, 6656, -1, 2147483647, -1, 1536}, {2560, 4608, -1, 1280, 1536, 2560}, + {-1, 1536, 1280, 2147483647, 1536, 2560}, {4608, 6656, -1, 2147483647, 1536, 2560}, + {7680, 2147483647, -1, 4608, -1, 2560}, {7680, 2147483647, 4608, 8704, -1, 1536}, + {7680, 8704, 8704, 2147483647, -1, 2560}, {9728, 2147483647, 8704, 2147483647, -1, 2560}, + {-1, 1536, -1, 768, 2560, 13312}, {9728, 2147483647, -1, 2560, 2560, 6656}, + {-1, 2147483647, 768, 1792, 17408, 2147483647}, {-1, 8704, 4608, 2147483647, 2560, 15360}, + {-1, 9728, 2560, 2147483647, 17408, 19456}, {9728, 2147483647, 2560, 7680, 5632, 2147483647}, + {9728, 2147483647, 7680, 9728, 2560, 2147483647}, {9728, 2147483647, 9728, 2147483647, 11264, 2147483647}}}, + {12, + {{6656, 7680, -1, 2147483647, -1, 1536}, {1536, 4608, 1280, 2147483647, 1536, 2560}, + {6656, 7680, -1, 3072, 1536, 2560}, {7680, 9728, 4608, 8704, 1536, 2560}, + {-1, 9728, 2560, 4608, 2560, 15360}, {8704, 9728, 4608, 2147483647, 2560, 15360}, + {-1, 9728, 2560, 2147483647, 19456, 2147483647}}}, + {3, + {{9728, 2147483647, 4608, 8704, 1536, 2560}, {-1, 2147483647, -1, 768, 17408, 2147483647}, + {9728, 2147483647, 1792, 2560, 13312, 2147483647}}}, + {8, + {{8704, 9728, 8704, 
2147483647, -1, 2560}, {6656, 9728, 768, 2560, 2560, 13312}, + {-1, 2560, -1, 1792, 13312, 17408}, {9728, 2147483647, 9728, 2147483647, 2560, 4608}}}, + {2, + {{2560, 2147483647, -1, 1792, 13312, 17408}}} + }; + + static std::map>> g_allgather91093TwoRankINT8M0Map = { + {128, + {{-1, 4608, -1, 2147483647, -1, 2560}, {9728, 2147483647, 8704, 2147483647, -1, 1536}, + {7680, 2147483647, 7680, 2147483647, 1536, 2560}, {-1, 2147483647, -1, 4608, 2560, 2147483647}, + {-1, 2147483647, 4608, 2147483647, 2560, 19456}, {5632, 2147483647, 4608, 2147483647, 19456, 2147483647}}}, + {256, + {{4608, 2147483647, -1, 8704, -1, 1536}, {4608, 9728, 8704, 2147483647, -1, 1536}, + {4608, 7680, -1, 2147483647, 1536, 2560}, {7680, 2147483647, -1, 7680, 1536, 2560}, + {-1, 5632, 4608, 2147483647, 19456, 2147483647}}} + }; + + void AllGatherNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {ALLGATHER_91093_EIGHT_RANK_FP16_M0_DEFAULT, + g_allgather91093EightRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgather91093EightRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHER_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_allgather91093EightRankFP16PvalueMap}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgather91093EightRankFP16CommdatasplitMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_NPU_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {ALLGATHER_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, + g_allgather91093SixteenRankFP16M0Map}}, + {&cocTilingData.pValue, + {ALLGATHER_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, + g_allgather91093SixteenRankFP16PvalueMap}}, + {&cocTilingData.commDirect, + {ALLGATHER_91093_SIXTEEN_RANK_FP16_COMMDIRECT_DEFAULT, + g_allgather91093SixteenRankFP16CommdirectMap}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_91093_SIXTEEN_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgather91093SixteenRankFP16CommdatasplitMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgather91093SixteenRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}} + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit == COMMDATASPLIT_ONE ? 
cocTilingData.rankSize : COMMNPUSPLIT_ONE; + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherNPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.commDataSplit, + {ALLGATHER_91093_TWO_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgather91093TwoRankFP16CommdatasplitMap}}, + {&cocTilingData.m0, + {ALLGATHER_91093_TWO_RANK_FP16_M0_DEFAULT, + g_allgather91093TwoRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgather91093TwoRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHER_91093_TWO_RANK_FP16_PVALUE_DEFAULT, + g_allgather91093TwoRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}} + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherNPU91093TwoRankINT8Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.commDataSplit, + {ALLGATHER_91093_TWO_RANK_INT8_COMMDATASPLIT_DEFAULT, + g_allgather91093TwoRankINT8CommdatasplitMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_91093_TWO_RANK_INT8_UBMOVENUM_DEFAULT, + g_allgather91093TwoRankINT8UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHER_91093_TWO_RANK_INT8_PVALUE_DEFAULT, + g_allgather91093TwoRankINT8PvalueMap}}, + {&cocTilingData.m0, + {ALLGATHER_91093_TWO_RANK_INT8_M0_DEFAULT, + g_allgather91093TwoRankINT8M0Map}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + DealTilingParamByBuffSize(cocTilingData); + } +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgather_tiling_910B.cpp b/comm/lcal/src/tiling/allgather_tiling_910B.cpp new file mode 100644 index 00000000..81b6ffec --- /dev/null +++ b/comm/lcal/src/tiling/allgather_tiling_910B.cpp @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include <cmath>  // assumed: the original header name was lost in patch extraction; sqrt()/exp() below need <cmath>
+#include "tiling_910B.h"
+#include "tiling_func.h"
+#include "lcal_types.h"
+
+namespace Lcal {
+    constexpr int32_t ALLGATHER_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 10;
+    constexpr int32_t ALLGATHER_EIGHT_RANK_FP16_PVALUE_DEFAULT = 8;
+    constexpr int32_t ALLGATHER_EIGHT_RANK_FP16_COMMDIRECT_DEFAULT = 1;
+    constexpr int32_t ALLGATHER_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 16;
+    constexpr int32_t ALLGATHER_EIGHT_RANK_FP16_M0_DEFAULT = 128;
+
+    constexpr int32_t ALLGATHER_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 4;
+    constexpr int32_t ALLGATHER_FOUR_RANK_INT8_COMMDATASPLIT_DEFAULT = 16;
+    constexpr int32_t ALLGATHER_FOUR_RANK_INT8_PVALUE_DEFAULT = 14;
+    constexpr int32_t ALLGATHER_FOUR_RANK_INT8_M0_DEFAULT = 128;
+
+    const int UBMOVE_MAX_M = 10096;
+    const int UBMOVE_MAX_K = 6656;
+    const int UBMOVE_MAX_N = 2336;
+    const int UBMOVE_MAX_M_HIGH = 14144;
+    const int UBMOVE_DEFAULT = 20;
+    const int UBMOVE_SMALL_M_SMALL_K_SMALL_N = 50;
+    const int UBMOVE_SMALL_M_LARGE_K_SMALL_N = 60;
+    const int UBMOVE_LARGE_M_SMALL_N = 80;
+    const int SCALE_FACTOR = 512;
+    const int PVALUE_M_SMALL = 6144;
+    const int PVALUE_K_SMALL = 10240;
+    const int PVALUE_N_MEDIUM = 9216;
+    const int PVALUE_M_MEDIUM = 14144;
+    const int PVALUE_M_LARGE = 10096;
+    const int PVALUE_ONE = 1;
+    const int PVALUE_TWO = 2;
+    const int PVALUE_THREE = 3;
+    const int PVALUE_FOUR = 4;
+
+    static std::vector g_allgatherSwizzldirectCoef = {
+        { -2.462e-04, -7.154e-06, 6.700e-05, 1.416e-06, 1.747e-04, 1.513e-07, 2.296e-02, -3.022e-04, -6.992e-03,
+          -1.865e-03, 8.685e-03, -2.039e-03, -1.701e-02, 1.805e-03, 1.174e-03, -5.262e-03, 3.752e-05, -1.539e-05,
+          -2.508e-02, -9.660e-05, 2.489e-03, -7.638e-03, -1.360e-03, -3.614e-04, -1.150e-03 }
+    };
+
+    static std::map>> g_allgatherEightRankFP16M0Map = {
+        {128,
+         {{-1, 1262, -1, 2147483647, -1, 576}, {-1, 1262, 5660, 2147483647, 576, 1200},
+          {-1, 1262, -1, 2147483647, 1200, 5552}, {1262, 8958, -1, 2274, 1888, 5552},
+          {8958, 2147483647, -1, 2147483647, 1536, 5552}, {-1, 2147483647, -1, 2274, 6684, 2147483647},
+          {9728, 2147483647, 2786, 3286, 5552, 2147483647}, {-1, 768, 3286, 7696, 5552, 2147483647}}},
+        {256,
+         {{-1, 1262, -1, 5660, 576, 1200}, {1262, 8958, -1, 2147483647, -1, 1888},
+          {1262, 8958, 2274, 2147483647, 1888, 5552}, {8958, 2147483647, -1, 2147483647, -1, 1536},
+          {-1, 2147483647, -1, 2274, 5552, 6684}, {-1, 2147483647, 2274, 2786, 5552, 2147483647},
+          {-1, 9728, 2786, 3286, 5552, 2147483647}, {-1, 768, 7696, 2147483647, 5552, 2147483647},
+          {768, 2147483647, 3286, 2147483647, 5552, 2147483647}}}
+    };
+
+    static std::map>> g_allgatherEightRankFP16CommdatasplitMap = {
+        {8,
+         {{-1, 1262, -1, 768, -1, 576}, {-1, 2274, 768, 2147483647, -1, 576},
+          {-1, 2274, -1, 2147483647, 576, 1624}, {-1, 1512, -1, 2147483647, 1624, 1720},
+          {-1, 2274, -1, 2147483647, 1720, 2274}, {2274, 9728, -1, 2147483647, -1, 768},
+          {9728, 2147483647, 768, 2147483647, -1, 768}, {2786, 2147483647, -1, 4298, 768, 1774},
+          {2274, 2147483647, 4298, 2147483647, 768, 1774}, {2274, 3584, -1, 1262, 1774, 2274},
+          {3584, 2147483647, 1774, 2147483647, 1774, 2274}, {-1, 768, 6700, 2147483647, 2626, 3248}}},
+        {16,
+         {{1262, 2274, -1, 768, -1, 576}, {1512, 2274, -1, 2147483647, 1624, 1720},
+          {9728, 2147483647, -1, 768, -1, 768}, {2274, 2786, -1, 4298, 768, 1774},
+          {2274, 3584, 2798, 2147483647, 1774, 2274}, {768, 2147483647, 6700, 2147483647, 2626, 2912}}},
+        {2,
+         {{2274, 3584, 1262, 2798, 1774, 2274}, {3584, 2147483647, -1, 1774, 1774, 2274},
+          {-1, 2147483647, -1, 2147483647, 2274, 2626}, {-1, 2147483647, -1,
6700, 2626, 3248}, + {768, 2147483647, 6700, 2147483647, 2912, 3248}, {-1, 2147483647, -1, 2147483647, 3248, 2147483647}}} + }; + + static std::map>> g_allgatherEightRankFP16CommdirectMap = { + {1, + {{-1, 768, -1, 2147483647, -1, 4608}, {-1, 768, 768, 2147483647, 4608, 5824}, + {768, 2147483647, -1, 2147483647, -1, 2912}, {1774, 2147483647, -1, 768, 2912, 3584}, + {768, 2147483647, -1, 768, 3584, 6172}, {768, 2147483647, 768, 2147483647, 2912, 6172}, + {-1, 6950, -1, 2147483647, 6172, 2147483647}, {6950, 7450, -1, 2560, 6172, 2147483647}, + {6950, 7450, 2560, 3584, 6172, 8704}, {6950, 7450, 3584, 2147483647, 6172, 2147483647}, + {7450, 2147483647, -1, 2147483647, 6172, 2147483647}}}, + {0, + {{-1, 768, -1, 768, 4608, 5824}, {-1, 768, -1, 2147483647, 5824, 6172}, + {768, 1774, -1, 768, 2912, 3584}, {6950, 7450, 2560, 3584, 8704, 2147483647}}} + }; + + static std::map>> g_allgatherEightRankFP16PvalueMap = { + {1, + {{-1, 768, -1, 2147483647, -1, 576}, {768, 1262, 7196, 2147483647, -1, 576}, + {1262, 4298, -1, 2147483647, -1, 576}, {-1, 4298, 768, 2147483647, 576, 1518}, + {-1, 4298, -1, 2147483647, 1518, 1984}, {-1, 2560, -1, 5660, 1984, 2274}, + {-1, 4298, 5660, 2147483647, 1984, 2274}, {4810, 2147483647, 7946, 10720, -1, 2274}, + {-1, 8958, -1, 2147483647, 2274, 2912}, {-1, 8958, 2560, 2147483647, 2912, 3248}, + {-1, 8958, -1, 1262, 3248, 5660}, {-1, 8958, 1262, 2147483647, 3248, 6450}, + {-1, 8958, -1, 2147483647, 6450, 2147483647}, {8958, 9728, -1, 2147483647, 2274, 5660}, + {8958, 9728, 1536, 2147483647, 5660, 6684}, {8958, 9728, -1, 2147483647, 6684, 2147483647}, + {9728, 2147483647, -1, 768, 6684, 2147483647}, {9728, 2147483647, 768, 3584, 4608, 2147483647}, + {9728, 2147483647, 3584, 2147483647, 2274, 2147483647}}}, + {6, + {{768, 1262, -1, 7196, -1, 576}, {-1, 4298, -1, 768, 576, 1518}, + {4810, 7850, -1, 1262, -1, 2274}, {4810, 7450, 1262, 3286, -1, 2274}}}, + {4, + {{2560, 4298, -1, 5660, 1984, 2274}, {7450, 2147483647, 1262, 3286, -1, 2274}, + {9728, 2147483647, -1, 768, 2274, 3584}}}, + {2, + {{4298, 4810, -1, 2147483647, -1, 2274}, {4810, 2147483647, 3286, 7946, -1, 2274}, + {4810, 2147483647, 10720, 2147483647, -1, 2274}, {-1, 8958, -1, 2560, 2912, 3248}, + {-1, 8958, -1, 1262, 5660, 6450}, {8958, 9728, -1, 1536, 5660, 6684}, + {9728, 2147483647, -1, 768, 3584, 6684}, {9728, 2147483647, 768, 3584, 2274, 4608}}}, + {8, + {{7850, 2147483647, -1, 1262, -1, 2274}}} + }; + + static std::map>> g_allgatherEightRankFP16UbmovenumMap = { + {3.0, + {{-1, 1262, -1, 2147483647, -1, 832}, {-1, 1262, 768, 2147483647, 832, 2400}, + {1262, 2147483647, 768, 2147483647, -1, 1624}, {1262, 2147483647, 1774, 2147483647, 1624, 2274}, + {1262, 2147483647, -1, 1262, 6684, 7434}, {7850, 2147483647, 1262, 1774, 5552, 7434}, + {1262, 2147483647, 1774, 2147483647, 5552, 7434}, {1262, 2147483647, -1, 768, 7434, 8704}, + {-1, 768, 1262, 3286, 7434, 9728}, {768, 1262, 3286, 2147483647, 7434, 9728}}}, + {2.0, + {{-1, 1262, -1, 768, 832, 2400}, {-1, 1262, 7696, 2147483647, 6684, 7434}, + {1262, 2147483647, -1, 768, -1, 1624}, {1262, 2147483647, -1, 1262, 5552, 6684}, + {1262, 7850, 1262, 1774, 5552, 7434}, {-1, 1262, -1, 768, 7434, 8704}, + {-1, 2147483647, -1, 768, 8704, 9728}, {768, 2147483647, 768, 3286, 7434, 9728}, + {1262, 2147483647, 3286, 2147483647, 7434, 9728}, {-1, 7200, -1, 2147483647, 9728, 2147483647}, + {7200, 2147483647, -1, 2147483647, 9728, 11744}, {7200, 2147483647, -1, 11744, 11744, 2147483647}}}, + {8.0, + {{-1, 1262, -1, 2147483647, 2400, 6172}, {1262, 2147483647, -1, 2147483647, 2274, 
3248}, + {-1, 768, 3286, 2147483647, 7434, 9728}}}, + {6.0, + {{-1, 768, -1, 2147483647, 6172, 6684}, {-1, 1262, -1, 7696, 6684, 7434}, + {1262, 2147483647, -1, 2147483647, 3248, 4298}, {1262, 2147483647, -1, 768, 4298, 5552}}}, + {4.0, + {{768, 1262, -1, 2147483647, 6172, 6684}, {1262, 2147483647, 768, 2147483647, 4298, 5552}, + {-1, 768, 768, 1262, 7434, 9728}, {7200, 2147483647, 11744, 2147483647, 11744, 2147483647}}}, + {10.0, + {{1262, 2147483647, -1, 1774, 1624, 2274}}} + }; + + static std::map>> g_allgatherFourRankINT8M0Map = { + {128, + {{-1, 2147483647, -1, 2147483647, -1, 3584}, {-1, 2147483647, -1, 4608, 3584, 8704}, + {-1, 2560, 4608, 2147483647, 3584, 8704}, {-1, 2147483647, -1, 2560, 8704, 2147483647}}}, + {256, + {{2560, 2147483647, 4608, 2147483647, 3584, 8704}, {-1, 2147483647, 2560, 2147483647, 8704, 2147483647}}} + }; + + static std::map>> g_allgatherFourRankINT8PvalueMap = { + {12, + {{-1, 5632, -1, 1792, -1, 1536}, {9728, 2147483647, -1, 8704, -1, 2560}}}, + {10, + {{-1, 5632, 1792, 3584, -1, 1536}, {2560, 5632, -1, 1280, 1536, 2560}, + {5632, 7680, -1, 2147483647, -1, 4608}, {7680, 9728, -1, 2147483647, -1, 2560}, + {7680, 9728, -1, 5632, 3584, 4608}, {9728, 2147483647, 8704, 2147483647, 1536, 2560}, + {9728, 2147483647, 1280, 2147483647, 3584, 4608}}}, + {6, + {{-1, 5632, 3584, 8704, -1, 1536}, {-1, 2560, -1, 1280, 1536, 2560}, + {3584, 5632, -1, 2147483647, 2560, 3584}, {1536, 5632, -1, 6656, 3584, 4608}, + {9728, 2147483647, 8704, 2147483647, -1, 1536}}}, + {3, + {{-1, 5632, 8704, 2147483647, -1, 1536}, {-1, 5632, 1280, 2147483647, 1536, 2560}, + {768, 2560, -1, 2147483647, 2560, 3584}, {1536, 5632, 6656, 2147483647, 3584, 4608}, + {7680, 9728, 5632, 2147483647, 3584, 4608}, {1536, 9728, -1, 1280, 4608, 8704}, + {9728, 2147483647, -1, 2560, 4608, 8704}, {1536, 2147483647, 2560, 6656, 4608, 5632}}}, + {1, + {{-1, 768, -1, 2147483647, 2560, 3584}, {-1, 1536, 4096, 2147483647, 3584, 4608}, + {-1, 768, -1, 2560, 4608, 8704}, {-1, 1536, 2560, 2147483647, 4608, 5632}, + {1536, 2147483647, 6656, 2147483647, 4608, 5632}, {-1, 2147483647, 2560, 2147483647, 5632, 8704}, + {-1, 768, -1, 2147483647, 8704, 2147483647}, {768, 2147483647, -1, 768, 9728, 11264}, + {768, 2147483647, 768, 2147483647, 8704, 11264}, {768, 1536, 4096, 2147483647, 11264, 2147483647}, + {1536, 2147483647, -1, 2147483647, 11264, 2147483647}}}, + {4, + {{2560, 3584, -1, 2147483647, 2560, 3584}}}, + {2, + {{-1, 1536, -1, 4096, 3584, 4608}, {768, 1536, -1, 2560, 4608, 8704}, + {1536, 9728, 1280, 2560, 4608, 8704}, {768, 2147483647, -1, 768, 8704, 9728}, + {768, 1536, -1, 4096, 11264, 2147483647}}}, + {14, + {{7680, 9728, -1, 2147483647, 2560, 3584}, {9728, 2147483647, -1, 1280, 2560, 4608}, + {9728, 2147483647, 1280, 2147483647, 2560, 3584}}} + }; + + static std::map>> g_allgatherFourRankINT8CommdatasplitMap = { + {16, + {{-1, 2147483647, -1, 2147483647, -1, 2147483647}}} + }; + + static std::map>> g_allgatherFourRankINT8UbmovenumMap = { + {4.0, + {{-1, 2560, -1, 2147483647, -1, 3584}, {-1, 2560, 1792, 2147483647, 3584, 4608}, + {2560, 2147483647, -1, 1792, -1, 2560}, {2560, 4608, -1, 1792, 2560, 4608}, + {2560, 2147483647, 1792, 2147483647, -1, 3584}, {9728, 2147483647, 1792, 2147483647, 3584, 4608}, + {-1, 768, -1, 4096, 4608, 5632}}}, + {2.0, + {{-1, 2560, -1, 1792, 3584, 4608}, {-1, 768, 4096, 2147483647, 4608, 5632}, + {-1, 768, -1, 2147483647, 5632, 2147483647}, {768, 2147483647, -1, 2147483647, 4608, 2147483647}}}, + {3.0, + {{4608, 2147483647, -1, 1792, 2560, 4608}, {2560, 9728, 1792, 2147483647, 
3584, 4608}}} + }; + + void AllGatherFourRankINT8Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {ALLGATHER_FOUR_RANK_INT8_M0_DEFAULT, + g_allgatherFourRankINT8M0Map}}, + {&cocTilingData.pValue, + {ALLGATHER_FOUR_RANK_INT8_PVALUE_DEFAULT, + g_allgatherFourRankINT8PvalueMap}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_FOUR_RANK_INT8_COMMDATASPLIT_DEFAULT, + g_allgatherFourRankINT8CommdatasplitMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_FOUR_RANK_INT8_UBMOVENUM_DEFAULT, + g_allgatherFourRankINT8UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ZERO}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_NPU_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {ALLGATHER_EIGHT_RANK_FP16_M0_DEFAULT, + g_allgatherEightRankFP16M0Map}}, + {&cocTilingData.commDataSplit, + {ALLGATHER_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgatherEightRankFP16CommdatasplitMap}}, + {&cocTilingData.commDirect, + {ALLGATHER_EIGHT_RANK_FP16_COMMDIRECT_DEFAULT, + g_allgatherEightRankFP16CommdirectMap}}, + {&cocTilingData.pValue, + {ALLGATHER_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_allgatherEightRankFP16PvalueMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHER_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgatherEightRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ZERO}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}} + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit >= COMMDATASPLIT_EIGHT ? 
COMMNPUSPLIT_ONE : cocTilingData.rankSize; + cocTilingData.commDataSplit = ClampValue(cocTilingData.commDataSplit, COMMDATASPLIT_ONE, + cocTilingData.blockDim / cocTilingData.commNpuSplit); + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } + + int AllGatherUbMoveNum(int m, int k, int n) + { + if (m <= UBMOVE_MAX_M) { + if (k <= UBMOVE_MAX_K) { + if (n <= UBMOVE_MAX_N) { + return UBMOVE_SMALL_M_SMALL_K_SMALL_N * SCALE_FACTOR; + } else { + return UBMOVE_DEFAULT * SCALE_FACTOR; + } + } else { + return UBMOVE_SMALL_M_LARGE_K_SMALL_N * SCALE_FACTOR; + } + } else { + if (n <= UBMOVE_MAX_N) { + if (m <= UBMOVE_MAX_M_HIGH) { + return UBMOVE_LARGE_M_SMALL_N * SCALE_FACTOR; + } else { + return UBMOVE_SMALL_M_LARGE_K_SMALL_N * SCALE_FACTOR; + } + } else { + return UBMOVE_DEFAULT * SCALE_FACTOR; + } + } + return UBMOVE_DEFAULT * SCALE_FACTOR; + } + + int AllGatherPValue(int m, int k, int n) + { + if (m <= PVALUE_M_SMALL) { + if (k <= PVALUE_K_SMALL) { + return PVALUE_ONE; + } else { + if (n <= PVALUE_N_MEDIUM) { + return PVALUE_ONE; + } else { + return PVALUE_TWO; + } + } + } else { + if (n <= PVALUE_N_MEDIUM) { + if (m <= PVALUE_M_MEDIUM) { + return PVALUE_ONE; + } else { + return PVALUE_THREE; + } + } else { + if (m <= PVALUE_M_LARGE) { + return PVALUE_THREE; + } else { + return PVALUE_FOUR; + } + } + } + } + + void AllGatherGetDefaultTiling(CoCTilingData &cocTilingData) + { + int32_t m = cocTilingData.m; + int32_t k = cocTilingData.k; + int32_t n = cocTilingData.n; + double mknGB = (1.0 * m / ONE_K) * (1.0 * k / ONE_K) * (1.0 * n / ONE_K); + double mkGB = (1.0 * m / ONE_K) * (1.0 * k / ONE_K); + double mnGB = (1.0 * m / ONE_K) * (1.0 * n / ONE_K); + double knGB = (1.0 * k / ONE_K) * (1.0 * n / ONE_K); + double c0 = sqrt(1.0 * m / k); + double c1 = 1.0 * m * k / n; + double c2 = sqrt(c1); + double c3 = sqrt(m * k) / n; + double c4 = sqrt(1.0 * k / n); + double swizzlDirectDouble = 0; + std::vector feats = { 1.0 * m, 1.0 / m, 1.0 * k, 1.0 / k, 1.0 * n, 1.0 / n, mknGB, + 1.0 / mknGB, mkGB, 1.0 / mkGB, mnGB, 1.0 / mnGB, knGB, 1.0 / knGB, + c0, 1.0 / c0, c1, 1.0 / c1, c2, 1.0 / c2, c3, + 1.0 / c3, c4, 1.0 / c4, 1 }; + for (uint32_t i = 0; i < feats.size(); i++) { + swizzlDirectDouble += feats[i] * g_allgatherSwizzldirectCoef[i]; + } + swizzlDirectDouble = 1.0 / (1.0 + exp(-swizzlDirectDouble)); + if (swizzlDirectDouble >= HALF_PROB) { + cocTilingData.swizzlDirect = 1; + } else { + cocTilingData.swizzlDirect = 0; + } + + cocTilingData.pValue = AllGatherPValue(m, k, n); + cocTilingData.ubMoveNum = AllGatherUbMoveNum(m, k, n); + cocTilingData.m0 = DEFAULT_ROW; + cocTilingData.n0 = DEFAULT_COL; + cocTilingData.k0 = DEFAULT_COL; + cocTilingData.kLoop = CeilDev(k, cocTilingData.k0); + + cocTilingData.write2OtherRank = 1; + cocTilingData.commDirect = COMM_DATA_DIRECT; + cocTilingData.commNpuSplit = cocTilingData.rankSize; + cocTilingData.commDataSplit = COMMDATASPLIT_ONE; + DealTilingParamByBuffSize(cocTilingData); + cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.k0 * cocTilingData.kLoop * cocTilingData.pValue; + } +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp b/comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp new file mode 100644 index 00000000..6bb50789 --- /dev/null +++ b/comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <map>  // assumed: the original header name was lost in patch extraction; std::map is used below
+#include "tiling_91093.h"
+#include "tiling_func.h"
+
+namespace Lcal {
+    constexpr int32_t ALLGATHERV2_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT = 6;
+    constexpr int32_t ALLGATHERV2_91093_EIGHT_RANK_FP16_M0_DEFAULT = 128;
+    constexpr int32_t ALLGATHERV2_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 190;
+    constexpr int32_t ALLGATHERV2_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 16;
+    constexpr int32_t ALLGATHERV2_91093_SIXTEEN_RANK_FP16_M0_DEFAULT = 128;
+    constexpr int32_t ALLGATHERV2_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT = 160;
+    constexpr int32_t ALLGATHERV2_91093_SIXTEEN_RANK_FP16_COMMNPUSPLIT_DEFAULT = 16;
+    constexpr int32_t ALLGATHERV2_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT = 12;
+    constexpr int32_t ALLGATHERV2_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 12;
+    constexpr int32_t ALLGATHERV2_91093_TWO_RANK_FP16_M0_DEFAULT = 128;
+    constexpr int32_t ALLGATHERV2_91093_TWO_RANK_FP16_PVALUE_DEFAULT = 14;
+
+    static std::map>> g_allgatherV291093EightRankFP16CommdatasplitMap = {
+        {8,
+         {{-1, 1518, -1, 2147483647, -1, 1624}, {-1, 1518, -1, 4608, 1880, 2274},
+          {-1, 1518, 4608, 2147483647, 1624, 2274}, {1518, 8600, -1, 2147483647, -1, 1200},
+          {8600, 8958, 10494, 2147483647, -1, 2274}, {8958, 2147483647, -1, 2147483647, -1, 2274},
+          {-1, 2147483647, -1, 768, 2912, 5660}}},
+        {16,
+         {{-1, 1518, -1, 4608, 1624, 1880}, {1518, 8600, -1, 2147483647, 1200, 2274},
+          {8600, 8958, -1, 10494, -1, 2274}, {-1, 2147483647, -1, 2147483647, 2274, 2912},
+          {-1, 2147483647, -1, 768, 5660, 6450}, {-1, 2147483647, 768, 2147483647, 2912, 6450},
+          {-1, 2147483647, -1, 2147483647, 6450, 2147483647}}}
+    };
+
+    static std::map>> g_allgatherV291093EightRankFP16UbmovenumMap = {
+        {160,
+         {{-1, 6950, -1, 2560, -1, 576}, {-1, 6950, -1, 2560, 832, 1200},
+          {1262, 6950, -1, 1774, 1200, 2274}, {6950, 2147483647, -1, 2274, -1, 1536},
+          {8858, 9728, -1, 2274, 1536, 2274}, {-1, 2147483647, -1, 768, 2912, 4608},
+          {-1, 8720, -1, 768, 4608, 5312}}},
+        {8,
+         {{-1, 6950, 2560, 2147483647, -1, 576}, {-1, 6950, -1, 2147483647, 576, 832},
+          {-1, 6950, 2560, 2147483647, 832, 1200}, {-1, 1262, -1, 2147483647, 1200, 2274},
+          {1262, 6950, 3798, 2147483647, 1200, 2274}, {6950, 2147483647, 2274, 2147483647, -1, 768},
+          {6950, 9728, 2274, 2147483647, 768, 1262}, {6950, 2147483647, 4810, 2147483647, 1262, 2274}}},
+        {2,
+         {{1262, 6950, 1774, 3798, 1200, 2274}, {6950, 8858, -1, 2274, 1536, 2274},
+          {9728, 2147483647, 1280, 2274, 1536, 2274}, {6950, 2147483647, 2274, 4810, 1262, 1774},
+          {-1, 2147483647, -1, 2147483647, 2274, 2912}, {8720, 2147483647, -1, 768, 4608, 5312},
+          {-1, 2147483647, 768, 2147483647, 2912, 5312}, {-1, 2147483647, -1, 2147483647, 5312, 8446},
+          {-1, 9728, -1, 2147483647, 8446, 10720}, {9728, 2147483647, -1, 9728, 8446, 10720},
+          {-1, 2147483647, -1, 2147483647, 10720, 2147483647}}},
+        {190,
+         {{9728, 2147483647, -1, 1280, 1536, 2274}}},
+        {20,
+         {{9728, 2147483647, 2274, 2147483647, 768, 1262}}},
+        {3,
+         {{6950, 2147483647, 2274, 4810, 1774, 2274}, {9728, 2147483647, 9728, 2147483647, 8446, 10720}}}
+    };
+
+    static
std::map>> g_allgatherV291093EightRankFP16M0Map = { + {128, + {{-1, 4810, -1, 2147483647, -1, 832}, {-1, 4810, -1, 768, 1536, 2274}, + {-1, 4810, 768, 2147483647, 832, 2274}, {4810, 2147483647, -1, 2147483647, -1, 2274}, + {-1, 2147483647, -1, 2274, 2274, 5660}, {-1, 2147483647, 2274, 6950, 2274, 5360}, + {-1, 2147483647, 6950, 7450, 2274, 5660}, {-1, 2147483647, 7450, 2147483647, 2274, 6934}, + {-1, 2147483647, -1, 2274, 6934, 2147483647}, {-1, 2147483647, 2274, 2786, 9470, 2147483647}, + {-1, 2147483647, 2786, 2147483647, 6934, 2147483647}}}, + {256, + {{-1, 4810, -1, 768, 832, 1536}, {-1, 2147483647, -1, 2274, 5660, 6934}, + {-1, 2147483647, 2274, 6950, 5360, 6934}, {-1, 2147483647, 6950, 7450, 5660, 6934}, + {-1, 2147483647, 2274, 2786, 6934, 9470}}} + }; + + static std::map>> g_allgatherV291093EightRankFP16PvalueMap = { + {1, + {{-1, 1774, -1, 2147483647, -1, 576}, {-1, 1262, -1, 2147483647, 576, 2274}, + {-1, 2147483647, -1, 2274, 5660, 6934}, {-1, 2147483647, 2274, 2147483647, 2274, 6934}, + {-1, 2147483647, 1774, 5900, 6934, 2147483647}}}, + {4, + {{1774, 2786, -1, 2147483647, -1, 576}, {2786, 5660, -1, 2274, -1, 2274}, + {2786, 2147483647, 2274, 2147483647, 768, 2274}, {-1, 2147483647, 5900, 2147483647, 6934, 2147483647}}}, + {2, + {{1262, 2786, -1, 2147483647, 576, 2274}, {2786, 2147483647, 2274, 2147483647, -1, 768}, + {-1, 2147483647, -1, 2274, 2274, 5660}, {-1, 2147483647, -1, 1774, 6934, 2147483647}}}, + {6, + {{5660, 2147483647, -1, 2274, -1, 2274}}} + }; + + static std::map>> g_allgatherV291093SixteenRankFP16PvalueMap = { + {4, + {{-1, 2786, -1, 768, -1, 768}, {-1, 2786, -1, 768, 1984, 2274}, + {4608, 6700, -1, 768, -1, 2274}, {3798, 4298, -1, 1006, 2274, 5312}, + {4298, 2147483647, -1, 1262, 2274, 4608}}}, + {2, + {{-1, 2786, -1, 768, 768, 1200}, {1262, 2786, 768, 1262, 1984, 2274}, + {2786, 9728, 768, 3286, -1, 768}, {2786, 2147483647, 768, 3286, 768, 2274}, + {2786, 5900, 3286, 5660, 1774, 2274}, {5900, 2147483647, 3286, 3798, -1, 2274}, + {768, 3798, -1, 1262, 2274, 3248}, {1774, 3798, -1, 768, 3248, 5312}, + {4298, 2147483647, -1, 1262, 4608, 5312}}}, + {1, + {{-1, 2786, 768, 2147483647, -1, 1200}, {-1, 2786, -1, 2147483647, 1200, 1984}, + {-1, 1262, 768, 1262, 1984, 2274}, {-1, 2786, 1262, 2147483647, 1984, 2274}, + {2786, 5900, 3286, 2147483647, -1, 1774}, {2786, 5900, 5660, 2147483647, 1774, 2274}, + {5900, 2147483647, 3798, 2147483647, -1, 2274}, {-1, 768, -1, 1262, 2274, 3248}, + {-1, 3798, 1262, 2147483647, 2274, 3248}, {-1, 1774, -1, 768, 3248, 5312}, + {-1, 3798, 768, 2147483647, 3248, 5312}, {3798, 4298, 1006, 2147483647, 2274, 5312}, + {4298, 2147483647, 1262, 2147483647, 2274, 5312}, {-1, 2147483647, -1, 2147483647, 5312, 2147483647}}}, + {6, + {{2786, 4608, -1, 768, -1, 2274}}}, + {10, + {{6700, 2147483647, -1, 768, -1, 768}}}, + {8, + {{6700, 2147483647, -1, 768, 768, 2274}}}, + {12, + {{9728, 2147483647, 768, 3286, -1, 768}}} + }; + + static std::map>> g_allgatherV291093SixteenRankFP16CommnpusplitMap = { + {8, + {{-1, 2274, -1, 1262, -1, 576}, {-1, 2274, -1, 768, 576, 1200}, + {2024, 2274, 6160, 7696, 1624, 2274}, {2274, 2147483647, -1, 1774, -1, 768}, + {2274, 2147483647, -1, 768, 768, 1262}, {2274, 2147483647, -1, 768, 1774, 2274}, + {2274, 3584, 5360, 2147483647, -1, 768}, {-1, 768, 5660, 7696, 2274, 2147483647}}}, + {1, + {{-1, 2274, 1262, 2147483647, -1, 576}, {-1, 2274, 768, 2147483647, 576, 1200}, + {-1, 2274, -1, 2147483647, 1200, 1624}, {-1, 1518, -1, 2147483647, 1824, 2274}, + {2024, 2274, -1, 6160, 1624, 2274}, {2024, 2274, 7696, 2147483647, 
1624, 2274}, + {2274, 2147483647, 1774, 5360, -1, 768}, {2274, 2147483647, 768, 5360, 768, 1262}, + {2274, 2147483647, 768, 5360, 1774, 2274}, {6950, 2147483647, 7696, 2147483647, 1774, 2274}, + {-1, 768, -1, 768, 2274, 4608}, {768, 6450, -1, 768, 2274, 5312}, + {6450, 2147483647, -1, 768, 2274, 5660}}}, + {16, + {{-1, 1518, -1, 2147483647, 1624, 1824}, {1518, 2024, -1, 2147483647, 1624, 2274}, + {2274, 2147483647, -1, 5360, 1262, 1774}, {3584, 6950, 5360, 2147483647, -1, 768}, + {2274, 6950, 5360, 2147483647, 768, 2274}, {6950, 2147483647, 5360, 2147483647, -1, 1774}, + {6950, 2147483647, 5360, 7696, 1774, 2274}, {-1, 768, -1, 768, 4608, 2147483647}, + {-1, 768, 768, 5660, 2274, 2147483647}, {-1, 768, 7696, 2147483647, 2274, 2147483647}, + {768, 6450, 768, 2147483647, 2274, 5312}, {768, 6450, -1, 2147483647, 5312, 2147483647}, + {6450, 2147483647, -1, 768, 5660, 2147483647}, {6450, 2147483647, 768, 2147483647, 2274, 2147483647}}} + }; + + static std::map>> g_allgatherV291093SixteenRankFP16UbmovenumMap = { + {160, + {{-1, 2274, -1, 768, -1, 1456}, {2274, 2147483647, -1, 768, -1, 2274}}}, + {16, + {{-1, 2274, 768, 2147483647, -1, 1456}, {-1, 2274, -1, 8704, 1456, 1824}, + {-1, 2274, -1, 2147483647, 1824, 2400}, {1262, 2274, 768, 2147483647, 2400, 3504}, + {2274, 2147483647, 768, 5900, -1, 2274}, {2274, 2147483647, 5900, 2147483647, 1774, 2274}, + {2274, 2147483647, 11744, 2147483647, 2274, 2786}, {2274, 2147483647, -1, 7696, 2786, 3286}}}, + {8, + {{-1, 2274, 8704, 2147483647, 1456, 1824}, {1262, 2274, 768, 2147483647, 3504, 5552}, + {2274, 2147483647, 768, 2147483647, 3286, 4298}, {768, 1262, 2560, 2147483647, 5552, 6684}, + {1262, 1518, -1, 2560, 5552, 6684}}}, + {18, + {{-1, 1262, -1, 2147483647, 2400, 2912}, {2274, 2147483647, 5900, 2147483647, -1, 1774}}}, + {14, + {{-1, 768, -1, 2147483647, 2912, 5552}}}, + {10, + {{768, 1262, -1, 2147483647, 2912, 5552}, {-1, 768, -1, 2560, 5552, 6684}, + {-1, 768, -1, 1262, 6684, 8704}, {-1, 768, 6656, 2147483647, 8704, 2147483647}, + {768, 1262, -1, 2560, 5552, 6684}, {1262, 1518, 2560, 2147483647, 5552, 6684}}}, + {2, + {{1262, 2274, -1, 768, 2400, 5552}, {2274, 2147483647, -1, 768, 3286, 5552}}}, + {12, + {{2274, 2147483647, -1, 11744, 2274, 2786}, {2274, 2147483647, 7696, 2147483647, 2786, 3286}, + {-1, 768, 2560, 2147483647, 5552, 6172}, {-1, 768, 1262, 2147483647, 6684, 8704}, + {-1, 768, -1, 6656, 8704, 2147483647}}}, + {6, + {{2274, 2786, 768, 2147483647, 4298, 5552}, {768, 1518, -1, 1774, 6684, 2147483647}, + {768, 1518, 5660, 11264, 6684, 2147483647}}}, + {5, + {{2786, 2147483647, 768, 2147483647, 4298, 5552}, {1518, 3286, -1, 2147483647, 5552, 5872}}}, + {80, + {{-1, 768, 2560, 2147483647, 6172, 6684}}}, + {4, + {{768, 1518, 1774, 5660, 6684, 2147483647}, {1518, 3286, -1, 2147483647, 5872, 8958}, + {2786, 3286, 6160, 2147483647, 8958, 2147483647}, {3286, 8958, 1774, 2274, 5552, 2147483647}, + {3286, 8958, 2274, 2147483647, 5552, 7434}, {8958, 9728, 2560, 2147483647, 5552, 2147483647}, + {9728, 2147483647, 9728, 2147483647, 5552, 2147483647}}}, + {3, + {{768, 1518, 11264, 2147483647, 6684, 2147483647}, {1518, 2786, -1, 2147483647, 8958, 2147483647}, + {2786, 3286, -1, 6160, 8958, 2147483647}, {3286, 8958, -1, 1774, 5552, 2147483647}, + {3286, 8958, 2274, 2147483647, 7434, 2147483647}, {8958, 9728, -1, 2560, 5552, 2147483647}, + {9728, 2147483647, -1, 9728, 5552, 2147483647}}} + }; + + static std::map>> g_allgatherV291093SixteenRankFP16M0Map = { + {256, + {{-1, 2024, -1, 7696, -1, 576}, {-1, 2024, -1, 4608, 576, 1200}, + {768, 2024, -1, 
2147483647, 1200, 1518}, {2024, 9728, 768, 2147483647, -1, 1518}, + {9728, 2147483647, 2560, 2147483647, -1, 1518}, {-1, 2147483647, -1, 2274, 5660, 6684}, + {2274, 8958, 2274, 2147483647, 1518, 9728}, {2274, 2147483647, 2274, 11744, 9728, 2147483647}}}, + {128, + {{-1, 2024, 7696, 2147483647, -1, 576}, {-1, 2024, 4608, 2147483647, 576, 1200}, + {-1, 768, -1, 2147483647, 1200, 1518}, {2024, 2147483647, -1, 768, -1, 1518}, + {9728, 2147483647, 768, 2560, -1, 1518}, {-1, 2147483647, -1, 2274, 1518, 5660}, + {-1, 2147483647, -1, 2274, 6684, 2147483647}, {-1, 2274, 2274, 2147483647, 1518, 2147483647}, + {8958, 2147483647, 2274, 2147483647, 1518, 9728}, {2274, 2147483647, 11744, 2147483647, 9728, 2147483647}}} + }; + + static std::map>> g_allgatherV291093TwoRankFP16PvalueMap = { + {3, + {{-1, 4608, -1, 3584, -1, 1536}}}, + {6, + {{-1, 4608, 3584, 4608, -1, 1536}, {4608, 6656, 4608, 2147483647, -1, 8704}, + {6656, 2147483647, 4608, 2147483647, -1, 7680}, {6656, 9728, -1, 2560, 9728, 15360}, + {9728, 2147483647, 1280, 2560, 9728, 2147483647}, {3584, 6656, 2560, 3584, 9728, 15360}}}, + {4, + {{-1, 4608, 4608, 2147483647, -1, 1536}, {-1, 2560, -1, 3584, 1536, 2560}, + {-1, 4608, 3584, 2147483647, 1536, 2560}}}, + {10, + {{2560, 4608, -1, 3584, 1536, 2560}, {3584, 4608, -1, 2147483647, 2560, 5632}, + {4608, 2147483647, 768, 1280, -1, 4608}, {4608, 2147483647, 1280, 2560, 1536, 2560}, + {-1, 1536, 7168, 2147483647, 9728, 11264}, {-1, 1536, 8704, 2147483647, 11264, 13312}, + {-1, 1536, 7680, 2147483647, 15360, 2147483647}, {1536, 2560, 8704, 2147483647, 11264, 2147483647}, + {4608, 9728, -1, 2560, 15360, 2147483647}}}, + {12, + {{-1, 1536, -1, 2147483647, 2560, 5632}, {2560, 3584, -1, 2147483647, 2560, 5632}, + {4608, 8704, -1, 768, -1, 9728}, {4608, 2147483647, 768, 1280, 4608, 9728}, + {4608, 2147483647, 1280, 2560, -1, 1536}, {1536, 2560, -1, 2147483647, 9728, 11264}, + {2560, 3584, 7680, 2147483647, 9728, 2147483647}, {3584, 6656, 2560, 3584, 15360, 2147483647}, + {6656, 2147483647, 2560, 3584, 9728, 2147483647}, {3584, 2147483647, 3584, 2147483647, 9728, 2147483647}}}, + {14, + {{1536, 2560, -1, 2147483647, 2560, 5632}, {-1, 4608, -1, 2147483647, 5632, 9728}, + {8704, 2147483647, -1, 768, -1, 9728}, {4608, 2147483647, 1280, 2560, 2560, 8704}, + {4608, 6656, 2560, 4608, -1, 9728}, {4608, 6656, 4608, 2147483647, 8704, 9728}, + {6656, 2147483647, 4608, 2147483647, 7680, 9728}, {-1, 1536, -1, 7168, 9728, 11264}, + {-1, 1536, -1, 8704, 11264, 13312}, {-1, 1536, -1, 7680, 13312, 2147483647}, + {-1, 1536, 7680, 2147483647, 13312, 15360}, {1536, 2560, -1, 8704, 11264, 2147483647}, + {2560, 3584, -1, 7680, 9728, 2147483647}, {3584, 6656, -1, 2560, 9728, 15360}, + {3584, 4608, -1, 2560, 15360, 2147483647}, {9728, 2147483647, -1, 1280, 9728, 2147483647}}}, + {8, + {{4608, 2147483647, 1280, 2560, 8704, 9728}, {6656, 2147483647, 2560, 4608, -1, 9728}}} + }; + + static std::map>> g_allgatherV291093TwoRankFP16M0Map = { + {128, + {{-1, 3584, -1, 2147483647, -1, 2560}, {-1, 3584, -1, 8704, 2560, 3584}, + {3584, 2147483647, -1, 2147483647, -1, 3584}, {4608, 2147483647, -1, 7680, 3584, 2147483647}, + {3584, 2147483647, 7680, 2147483647, 3584, 2147483647}}}, + {256, + {{-1, 3584, 8704, 2147483647, 2560, 3584}, {-1, 4608, -1, 7680, 3584, 2147483647}, + {-1, 3584, 7680, 2147483647, 3584, 2147483647}}} + }; + + static std::map>> g_allgatherV291093TwoRankFP16UbmovenumMap = { + {3.0, + {{-1, 4608, -1, 768, -1, 1536}, {-1, 4608, 2560, 2147483647, 5632, 7680}, + {-1, 1536, -1, 2147483647, 7680, 8704}, {4608, 7680, -1, 
2147483647, -1, 4608}, + {4608, 7680, 7680, 2147483647, 4608, 8704}, {7680, 2147483647, -1, 2147483647, -1, 1536}, + {7680, 2147483647, 6656, 2147483647, 1536, 8704}}}, + {4.0, + {{-1, 4608, 768, 2147483647, -1, 1536}, {9728, 2147483647, -1, 768, 8704, 18432}}}, + {6.0, + {{-1, 4608, -1, 2147483647, 1536, 5632}, {-1, 4608, -1, 2560, 5632, 7680}, + {1536, 4608, -1, 768, 8704, 19456}, {9728, 2147483647, -1, 768, 18432, 2147483647}}}, + {2.0, + {{1536, 4608, -1, 2147483647, 7680, 8704}, {4608, 7680, -1, 7680, 4608, 8704}, + {7680, 2147483647, -1, 6656, 1536, 8704}, {4608, 9728, -1, 768, 8704, 2147483647}, + {-1, 2147483647, 768, 2147483647, 8704, 2147483647}}}, + {8.0, + {{-1, 1536, -1, 768, 8704, 13312}}}, + {10.0, + {{-1, 1536, -1, 768, 13312, 2147483647}}}, + {12.0, + {{1536, 4608, -1, 768, 19456, 2147483647}}} + }; + + void AllGatherV2NPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLGATHERV2_91093_EIGHT_RANK_FP16_M0_DEFAULT, + g_allgatherV291093EightRankFP16M0Map}}, + {&cocTilingData.commDataSplit, + {ALLGATHERV2_91093_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgatherV291093EightRankFP16CommdatasplitMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHERV2_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgatherV291093EightRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHERV2_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_allgatherV291093EightRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.commDirect = + cocTilingData.commDataSplit == COMMDATASPLIT_ONE ? COMM_DATA_DIRECT : COMM_NPU_DIRECT; + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit == COMMDATASPLIT_ONE ? cocTilingData.rankSize : COMMNPUSPLIT_ONE; + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherV2NPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLGATHERV2_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, + g_allgatherV291093SixteenRankFP16M0Map}}, + {&cocTilingData.commNpuSplit, + {ALLGATHERV2_91093_SIXTEEN_RANK_FP16_COMMNPUSPLIT_DEFAULT, + g_allgatherV291093SixteenRankFP16CommnpusplitMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHERV2_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgatherV291093SixteenRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHERV2_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, + g_allgatherV291093SixteenRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + cocTilingData.commDirect = + cocTilingData.commNpuSplit <= COMMNPUSPLIT_EIGHT ? COMM_NPU_DIRECT : COMM_DATA_DIRECT; + cocTilingData.commDataSplit = + cocTilingData.commNpuSplit > COMMNPUSPLIT_ONE ? 
COMMDATASPLIT_ONE : COMMDATASPLIT_EIGHT; + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherV2NPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.pValue, + {ALLGATHERV2_91093_TWO_RANK_FP16_PVALUE_DEFAULT, + g_allgatherV291093TwoRankFP16PvalueMap}}, + {&cocTilingData.m0, + {ALLGATHERV2_91093_TWO_RANK_FP16_M0_DEFAULT, + g_allgatherV291093TwoRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLGATHERV2_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgatherV291093TwoRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}} + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp b/comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp new file mode 100644 index 00000000..416da4ee --- /dev/null +++ b/comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include <map>  // assumed: the original header name was lost in patch extraction; std::map is used below
+#include "tiling_910B.h"
+#include "tiling_func.h"
+#include "lcal_types.h"
+
+namespace Lcal {
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_PVALUE_DEFAULT = 6;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 3;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_M0_DEFAULT = 128;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 8;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_CORE16_PVALUE_DEFAULT = 6;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_CORE16_UBMOVENUM_DEFAULT = 8;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_CORE16_COMMDIRECT_DEFAULT = 1;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_CORE16_COMMDATASPLIT_DEFAULT = 16;
+    constexpr int32_t ALLGATHERV2_EIGHT_RANK_FP16_CORE16_M0_DEFAULT = 128;
+
+    static std::map>> g_allgatherV2EightRankFP16CorE16M0Map = {
+        {128,
+         {{-1, 3798, -1, 2147483647, -1, 1200}, {-1, 3798, -1, 10720, 1720, 2274},
+          {-1, 3798, 10720, 2147483647, 1200, 2274}, {3798, 4298, -1, 2786, -1, 2274},
+          {4810, 2147483647, -1, 2786, -1, 2274}, {3798, 2147483647, 2786, 2147483647, -1, 2274},
+          {-1, 2147483647, -1, 6950, 2912, 5360}, {-1, 2147483647, 6450, 6950, 5360, 6934},
+          {-1, 2147483647, -1, 6950, 6934, 2147483647}, {-1, 2147483647, 9950, 2147483647, 2274, 2626},
+          {-1, 2147483647, 6950, 2147483647, 2626, 2147483647}}},
+        {256,
+         {{-1, 3798, -1, 10720, 1200, 1720}, {4298, 4810, -1, 2786, -1, 2274},
+          {-1, 2147483647, -1, 6950, 2274, 2912}, {-1, 2147483647, -1, 6450, 5360, 6934},
+          {-1, 2147483647, 6950, 9950, 2274, 2626}}}
+    };
+
+    static std::map>> g_allgatherV2EightRankFP16CorE16CommdatasplitMap = {
+        {16,
+         {{-1, 2147483647, -1, 2147483647, -1, 2147483647}}}
+    };
+
+    static std::map>> g_allgatherV2EightRankFP16CorE16CommdirectMap = {
+        {1,
+         {{-1, 2147483647, -1, 2147483647, -1, 3248}, {-1, 2147483647, -1, 768, 3248, 5660},
+          {-1, 2147483647, -1, 768, 8704, 2147483647}, {-1, 2147483647, 768, 2147483647, 3248, 2147483647}}},
+        {0,
+         {{-1, 2147483647, -1, 768, 5660, 8704}}}
+    };
+
+    static std::map>> g_allgatherV2EightRankFP16CorE16UbmovenumMap = {
+        {2,
+         {{-1, 6450, -1, 2147483647, -1, 2912}, {-1, 2786, -1, 2147483647, 2912, 7434},
+          {2786, 6450, -1, 2147483647, 2912, 6934}, {2786, 6450, 768, 2147483647, 6934, 7434},
+          {6450, 7850, 1262, 1774, 768, 7434}, {7850, 2147483647, -1, 1774, 1536, 7434},
+          {6450, 8958, 1774, 10720, -1, 7434}, {6450, 8958, 10720, 2147483647, -1, 6150},
+          {8958, 2147483647, 1774, 2147483647, -1, 7434}, {-1, 1262, -1, 2147483647, 7434, 8958},
+          {1262, 1774, 1792, 2147483647, 7434, 8958}, {1774, 2147483647, -1, 2147483647, 7434, 8958},
+          {1774, 2147483647, -1, 768, 8958, 2147483647}, {-1, 2147483647, 768, 2147483647, 8958, 2147483647}}},
+        {3,
+         {{2786, 6450, -1, 768, 6934, 7434}, {6450, 7850, -1, 1262, -1, 7434},
+          {7850, 2147483647, -1, 1774, 768, 1536}, {1262, 1774, -1, 1792, 7434, 8958},
+          {1262, 1774, -1, 768, 8958, 2147483647}}},
+        {6,
+         {{6450, 7850, 1262, 1774, -1, 768}}},
+        {8,
+         {{7850, 2147483647, -1, 1774, -1, 768}}},
+        {4,
+         {{6450, 8958, 10720, 2147483647, 6150, 7434}, {-1, 1262, -1, 768, 8958, 2147483647}}}
+    };
+
+    static std::map>> g_allgatherV2EightRankFP16CorE16PvalueMap = {
+        {2,
+         {{-1, 3798, -1, 2147483647, -1, 832}, {768, 3798, 768, 2147483647, 832, 1200},
+          {2024, 2560, -1, 1262, 1200, 2274}, {2024, 3798, 1262, 2147483647, 1200, 2274},
+          {3798, 4608, 7946, 8446, -1, 2274}, {3798, 2147483647, 9728, 2147483647, -1, 768},
+          {-1, 1262, -1, 768, 2274, 5660}, {768, 1262, -1, 7696, 7680, 10752},
+          {1262, 2147483647, -1, 2147483647, 2912, 3248}, {8958,
9728, -1, 2147483647, 7680, 2147483647}}}, + {4, + {{-1, 3798, -1, 768, 832, 1200}, {2560, 3798, -1, 1262, 1200, 2274}, + {3798, 2147483647, -1, 1774, -1, 2274}, {3798, 2147483647, 8446, 9728, -1, 1262}, + {3798, 2147483647, 9728, 2147483647, 768, 1262}, {3798, 2147483647, 8446, 2147483647, 1262, 2274}, + {-1, 1262, 7696, 2147483647, 8704, 10752}, {-1, 1262, -1, 2147483647, 10752, 2147483647}, + {1262, 2147483647, -1, 2274, 3248, 6934}, {1262, 2147483647, 8958, 2147483647, 4298, 6934}, + {1262, 1774, 6700, 2147483647, 6934, 2147483647}, {1774, 8958, 6450, 2147483647, 6934, 2147483647}, + {8958, 2147483647, 6700, 2147483647, 6934, 7680}, {9728, 2147483647, -1, 2147483647, 7680, 2147483647}}}, + {1, + {{-1, 768, 768, 2147483647, 832, 1200}, {-1, 2024, -1, 2147483647, 1200, 2274}, + {-1, 1262, -1, 768, 5660, 7680}, {-1, 1262, 768, 2147483647, 2274, 7680}, + {-1, 768, -1, 7696, 7680, 10752}, {-1, 1262, 7696, 2147483647, 7680, 8704}, + {1262, 2147483647, -1, 2147483647, 2274, 2912}, {1262, 2147483647, 2274, 8958, 3248, 6934}, + {1262, 2147483647, 8958, 2147483647, 3248, 4298}, {1262, 1774, -1, 6700, 6934, 2147483647}, + {1774, 8958, -1, 6450, 6934, 2147483647}, {8958, 2147483647, -1, 6700, 6934, 7680}}}, + {6, + {{3798, 2147483647, 1774, 7946, -1, 2274}, {4608, 2147483647, 7946, 8446, -1, 2274}}} + }; + + static std::map>> g_allgatherV2EightRankFP16CommdatasplitMap = { + {8, + {{-1, 2274, -1, 2147483647, -1, 5312}, {2274, 2147483647, -1, 2147483647, -1, 4810}, + {5148, 2147483647, -1, 768, 4810, 5312}, {2274, 2147483647, 768, 2147483647, 4810, 5312}, + {-1, 2147483647, -1, 2147483647, 5312, 2147483647}}}, + {4, + {{2274, 5148, -1, 768, 4810, 5312}}} + }; + + static std::map>> g_allgatherV2EightRankFP16M0Map = { + {128, + {{-1, 2274, -1, 2147483647, -1, 6172}, {-1, 2274, 6700, 2147483647, 6172, 6934}, + {2274, 2786, 8200, 2147483647, -1, 6934}, {2786, 2147483647, -1, 6950, -1, 5360}, + {2786, 2147483647, 6950, 2147483647, -1, 6934}, {-1, 2147483647, -1, 2274, 6934, 2147483647}, + {-1, 2147483647, 2274, 4810, 6934, 7434}, {-1, 2147483647, 2274, 4810, 7946, 2147483647}, + {-1, 2147483647, 4810, 2147483647, 6934, 2147483647}}}, + {256, + {{-1, 2274, -1, 6700, 6172, 6934}, {2274, 2786, -1, 8200, -1, 6934}, + {2786, 2147483647, -1, 6950, 5360, 6934}, {-1, 2147483647, 2274, 4810, 7434, 7946}}} + }; + + static std::map>> g_allgatherV2EightRankFP16UbmovenumMap = { + {2.0, + {{-1, 768, -1, 2560, -1, 576}, {-1, 768, -1, 3584, 832, 2274}, + {768, 2147483647, -1, 1774, -1, 2274}, {-1, 2147483647, -1, 8200, 2274, 2626}, + {-1, 2147483647, -1, 2147483647, 2626, 2147483647}}}, + {3.0, + {{-1, 768, 2560, 2147483647, -1, 576}, {-1, 768, -1, 2147483647, 576, 832}, + {-1, 768, 3584, 2147483647, 832, 2274}, {768, 2147483647, 1774, 2147483647, -1, 2274}, + {-1, 2147483647, 8200, 2147483647, 2274, 2626}}} + }; + + static std::map>> g_allgatherV2EightRankFP16PvalueMap = { + {2, + {{-1, 2786, -1, 2560, -1, 576}, {1280, 2786, -1, 2147483647, 576, 832}, + {-1, 2786, -1, 4608, 832, 1200}, {1262, 2786, -1, 2147483647, 1720, 2274}, + {2786, 3286, -1, 7708, -1, 768}, {2786, 3286, -1, 2147483647, 768, 2274}, + {3286, 4298, -1, 2147483647, -1, 1262}, {4298, 7450, 4810, 2147483647, -1, 768}, + {-1, 2147483647, -1, 768, 2274, 3584}, {4608, 2147483647, -1, 768, 5660, 7680}, + {-1, 2147483647, 768, 5900, 2912, 3248}, {-1, 2147483647, 6450, 8446, 8446, 2147483647}, + {-1, 768, 11264, 2147483647, 2274, 7680}, {-1, 768, 8446, 2147483647, 7680, 2147483647}, + {768, 1262, 8446, 9728, 4636, 2147483647}, {768, 1262, 9728, 2147483647, 
6684, 2147483647}, + {1262, 2147483647, 8446, 9728, 6450, 2147483647}, {1262, 2147483647, 9728, 10720, 6684, 2147483647}, + {1262, 2147483647, 10720, 2147483647, 5104, 2147483647}}}, + {1, + {{-1, 2786, 2560, 2147483647, -1, 576}, {-1, 1280, -1, 2147483647, 576, 832}, + {-1, 2786, 4608, 2147483647, 832, 1200}, {-1, 2786, -1, 2147483647, 1200, 1720}, + {-1, 1262, -1, 2147483647, 1720, 2274}, {2786, 3286, 7708, 2147483647, -1, 768}, + {3286, 4298, -1, 2147483647, 1774, 2274}, {7450, 2147483647, 4810, 2147483647, 1774, 2274}, + {-1, 4608, -1, 768, 5660, 7680}, {-1, 2147483647, -1, 768, 7680, 2147483647}, + {-1, 2147483647, 768, 5900, 2274, 2912}, {-1, 2147483647, 768, 5900, 3248, 2147483647}, + {-1, 2147483647, 5900, 8446, 2274, 8446}, {-1, 2147483647, 5900, 6450, 8446, 2147483647}, + {-1, 768, 8446, 11264, 2274, 7680}, {768, 1262, 8446, 9728, 2274, 4636}, + {768, 1262, 9728, 2147483647, 2274, 6684}, {1262, 2147483647, 8446, 9728, 2274, 6450}, + {1262, 2147483647, 9728, 10720, 2274, 6684}, {1262, 2147483647, 10720, 2147483647, 2274, 5104}}}, + {4, + {{3286, 4298, -1, 2147483647, 1262, 1774}, {4298, 8958, -1, 4298, -1, 2274}, + {8958, 2147483647, -1, 4810, -1, 1536}, {4298, 7450, 4810, 2147483647, 768, 2274}, + {7450, 2147483647, 4810, 2147483647, -1, 1774}, {-1, 2147483647, -1, 768, 3584, 5660}}}, + {6, + {{4298, 8958, 4298, 4810, -1, 2274}, {8958, 2147483647, -1, 4810, 1536, 2274}}} + }; + + void AllGatherV2EightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.commDataSplit, + {ALLGATHERV2_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_allgatherV2EightRankFP16CommdatasplitMap}}, + {&cocTilingData.m0, + {ALLGATHERV2_EIGHT_RANK_FP16_M0_DEFAULT, + g_allgatherV2EightRankFP16M0Map}}, + {&cocTilingData.ubMoveNum, + {ALLGATHERV2_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_allgatherV2EightRankFP16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHERV2_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_allgatherV2EightRankFP16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}}, + {&cocTilingData.commDirect, {COMM_NPU_DIRECT}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + int32_t coreNum = cocTilingData.blockDim - cocTilingData.rankSize; + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit >= COMMDATASPLIT_EIGHT ? 
COMMNPUSPLIT_ONE : COMMNPUSPLIT_THREE; + cocTilingData.commNpuSplit = std::min(cocTilingData.commNpuSplit, cocTilingData.rankSize); + cocTilingData.commDataSplit = + ClampValue(cocTilingData.commDataSplit, COMMDATASPLIT_ONE, coreNum / cocTilingData.commNpuSplit); + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } + + void AllGatherV2EightRankFP16Core16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map tilingParamMap = { + {&cocTilingData.m0, + {ALLGATHERV2_EIGHT_RANK_FP16_CORE16_M0_DEFAULT, + g_allgatherV2EightRankFP16CorE16M0Map}}, + {&cocTilingData.commDataSplit, + {ALLGATHERV2_EIGHT_RANK_FP16_CORE16_COMMDATASPLIT_DEFAULT, + g_allgatherV2EightRankFP16CorE16CommdatasplitMap}}, + {&cocTilingData.commDirect, + {ALLGATHERV2_EIGHT_RANK_FP16_CORE16_COMMDIRECT_DEFAULT, + g_allgatherV2EightRankFP16CorE16CommdirectMap}}, + {&cocTilingData.ubMoveNum, + {ALLGATHERV2_EIGHT_RANK_FP16_CORE16_UBMOVENUM_DEFAULT, + g_allgatherV2EightRankFP16CorE16UbmovenumMap}}, + {&cocTilingData.pValue, + {ALLGATHERV2_EIGHT_RANK_FP16_CORE16_PVALUE_DEFAULT, + g_allgatherV2EightRankFP16CorE16PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}} + }; + SetTilingParam(cocTilingData, tilingParamMap); + + int32_t coreNum = cocTilingData.blockDim - cocTilingData.rankSize; + cocTilingData.commNpuSplit = + cocTilingData.commDataSplit >= COMMDATASPLIT_EIGHT ? COMMNPUSPLIT_ONE : cocTilingData.rankSize; + cocTilingData.commDataSplit = + ClampValue(cocTilingData.commDataSplit, COMMDATASPLIT_ONE, coreNum / cocTilingData.commNpuSplit); + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum * cocTilingData.commDataSplit; + + DealTilingParamByBuffSize(cocTilingData); + } +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp b/comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp new file mode 100644 index 00000000..afac5b04 --- /dev/null +++ b/comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+#include <string>  // assumed: the original header name was lost in patch extraction; std::to_string is used below
+#include "tiling.h"
+#include "tiling_910B.h"
+#include "tiling_91093.h"
+#include "tiling_func.h"
+#include "lcoc_func.h"
+
+namespace Lcal {
+void CoCAllToAllAllGatherMatmulHiddenTilingFunc::GetDefaultTiling(const TaskParam &tilingInfo)
+{
+    CoCTilingFunc::GetDefaultTiling(tilingInfo);
+    auto k = tilingInfo.cocParamDesc.mmInfo.k;
+    auto m = tilingInfo.cocParamDesc.mmInfo.m;
+    auto maxOutputSize = tilingInfo.cocParamDesc.moeInfo.maxOutputSize;
+
+    auto blockCount = MAX_BLOCK_COUNT;
+    int32_t maxPvalue = (k + 255) / 256;
+
+    cocTilingData.m0 = DEFAULT_ROW;
+    cocTilingData.n0 = DEFAULT_COL;
+    cocTilingData.k0 = DEFAULT_COL;
+    int32_t bufferSize = tilingInfo.bufferSize * 1024 * 1024;
+    int32_t maxPeerMemPerRank = bufferSize / INPUT_DTYPE / blockCount;
+    constexpr int32_t Seven = 7;
+    cocTilingData.pValue = Seven;
+    if (cocTilingData.pValue > maxPvalue) {
+        cocTilingData.pValue = maxPvalue;
+    }
+
+    if (m < DEFAULT_ROW) {
+        cocTilingData.pValue = (k + cocTilingData.k0 - 1) / cocTilingData.k0;
+    }
+
+    if (cocTilingData.pValue * cocTilingData.k0 * maxOutputSize > maxPeerMemPerRank) {
+        cocTilingData.pValue = maxPeerMemPerRank / maxOutputSize / cocTilingData.k0;
+    }
+    cocTilingData.ubMoveNum = AllTOAll_HIDDEN_UBMOVENUM;
+    constexpr int32_t two = 2;
+    int32_t maxUbPingPongSize = cocTilingData.ubMoveNum / two;
+    if (cocTilingData.pValue * cocTilingData.k0 > maxUbPingPongSize) {
+        cocTilingData.pValue = maxUbPingPongSize / cocTilingData.k0;
+    }
+
+    return;
+}
+
+bool CoCAllToAllAllGatherMatmulHiddenTilingFunc::CheckTiling(const TaskParam &tilingInfo)
+{
+    int32_t rankSize = cocTilingData.rankSize;
+    int32_t ep = tilingInfo.cocParamDesc.moeInfo.EP;
+    int32_t tp = tilingInfo.cocParamDesc.moeInfo.TP;
+    int32_t expertPerRank = tilingInfo.cocParamDesc.moeInfo.local_expert_nums;
+    int32_t k = tilingInfo.cocParamDesc.mmInfo.k;
+    auto maxOutputSize = tilingInfo.cocParamDesc.moeInfo.maxOutputSize;
+
+    auto blockCount = MAX_BLOCK_COUNT;
+    int32_t maxPeerMemPerRank = (tilingInfo.bufferSize * 1024 * 1024) / INPUT_DTYPE / blockCount;
+    if ((cocTilingData.pValue - 1) * cocTilingData.k0 > k) {
+        return false;
+    }
+    if (cocTilingData.pValue * cocTilingData.k0 * maxOutputSize > maxPeerMemPerRank) {
+        std::string str = "The k value is too large and is currently not supported. "
+            "pValue: " + std::to_string(cocTilingData.pValue) + ", k0: " +
+            std::to_string(cocTilingData.k0) + ", maxPeerMemPerRank: " + std::to_string(maxPeerMemPerRank);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+    constexpr int32_t Two = 2;
+    int32_t maxUbPingPongSize = cocTilingData.ubMoveNum / Two;
+    if (cocTilingData.pValue * cocTilingData.k0 > maxUbPingPongSize) {
+        std::string str = "The k value is too large and is currently not supported. "
+            "pValue: " + std::to_string(cocTilingData.pValue) + ", k0: " +
+            std::to_string(cocTilingData.k0) + ", maxUbPingPongSize: " + std::to_string(maxUbPingPongSize);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+
+    if (ep * tp != rankSize) {
+        std::string str = "The ep * tp != rankSize. "
+            "rankSize: " + std::to_string(rankSize) + ", ep: " + std::to_string(ep) +
+            ", tp: " + std::to_string(tp);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+
+    std::vector> paramCheckList = {
+        {"expertPerRank", expertPerRank, 1, 20}
+    };
+    return CheckParamScopeList(paramCheckList);
+}
+}
diff --git a/comm/lcal/src/tiling/alltoall_allgather_tiling.cpp b/comm/lcal/src/tiling/alltoall_allgather_tiling.cpp
new file mode 100644
index 00000000..e788eb2c
--- /dev/null
+++ b/comm/lcal/src/tiling/alltoall_allgather_tiling.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <string>  // assumed: the original header name was lost in patch extraction; std::to_string is used below
+#include "tiling.h"
+#include "tiling_910B.h"
+#include "tiling_91093.h"
+#include "tiling_func.h"
+#include "lcoc_func.h"
+#
+namespace Lcal {
+void CoCAllToAllAllGatherMatmulTilingFunc::GetDefaultTiling(const TaskParam &tilingInfo)
+{
+    CoCTilingFunc::GetDefaultTiling(tilingInfo);
+    cocTilingData.m0 = DEFAULT_ROW;
+    cocTilingData.n0 = DEFAULT_COL;
+    cocTilingData.k0 = DEFAULT_COL;
+    constexpr int32_t pValue = 1;
+    cocTilingData.pValue = pValue;
+    constexpr int32_t ubMove = 28672;
+    cocTilingData.ubMoveNum = ubMove;
+    return;
+}
+
+bool CheckPValue(const TaskParam &tilingInfo, const CoCTilingData &data)
+{
+    auto blockCount = MAX_BLOCK_COUNT;
+    int32_t bufferSize = tilingInfo.bufferSize * 1024 * 1024;
+    int32_t maxPeerMemPerRank = bufferSize / INPUT_DTYPE / data.rankSize / blockCount;
+    if (data.pValue * data.m0 * data.k0 * data.kLoop >= maxPeerMemPerRank) {
+        std::string str = "The k value is too large and is currently not supported. "
+            "pValue: " + std::to_string(data.pValue) + ", m0: " + std::to_string(data.m0) +
+            ", k0: " + std::to_string(data.k0) + ", kLoop: " + std::to_string(data.kLoop) +
+            ", maxPeerMemPerRank: " + std::to_string(maxPeerMemPerRank);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+    return true;
+}
+
+bool CoCAllToAllAllGatherMatmulTilingFunc::CheckTiling(const TaskParam &tilingInfo)
+{
+    if (!CoCTilingFunc::CheckTiling(tilingInfo)) {
+        return false;
+    }
+    if (!CheckPValue(tilingInfo, cocTilingData)) {
+        return false;
+    }
+
+    int32_t rankSize = cocTilingData.rankSize;
+    int32_t ep = tilingInfo.cocParamDesc.moeInfo.EP;
+    int32_t tp = tilingInfo.cocParamDesc.moeInfo.TP;
+    int32_t expertPerRank = tilingInfo.cocParamDesc.moeInfo.local_expert_nums;
+
+    if (ep * tp != rankSize) {
+        std::string str = "The ep * tp != rankSize. "
+            "rankSize: " + std::to_string(rankSize) + ", ep: " + std::to_string(ep) +
+            ", tp: " + std::to_string(tp);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+
+    std::vector> paramCheckList = {
+        {"expertPerRank", expertPerRank, 1, 20}
+    };
+    return CheckParamScopeList(paramCheckList);
+}
+}
\ No newline at end of file
diff --git a/comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp b/comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp
new file mode 100644
index 00000000..a1e9ffb6
--- /dev/null
+++ b/comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <string>  // assumed: the original header name was lost in patch extraction; std::to_string is used below
+#include "tiling.h"
+#include "tiling_910B.h"
+#include "tiling_91093.h"
+#include "tiling_func.h"
+#include "lcoc_func.h"
+#
+namespace Lcal {
+void CoCMatmulReduceScatterAllToAllHiddenTilingFunc::GetDefaultTiling(const TaskParam &tilingInfo)
+{
+    CoCTilingFunc::GetDefaultTiling(tilingInfo);
+    auto n = tilingInfo.cocParamDesc.mmInfo.n;
+    auto maxOutputSize = tilingInfo.cocParamDesc.moeInfo.maxOutputSize;
+    int32_t maxPvalue = (n + 255) / 256;
+
+    cocTilingData.m0 = DEFAULT_ROW;
+    cocTilingData.n0 = DEFAULT_COL;
+    cocTilingData.k0 = DEFAULT_COL;
+    int32_t m = tilingInfo.cocParamDesc.mmInfo.m;
+    auto blockCount = MAX_BLOCK_COUNT;
+    int32_t bufferSize = tilingInfo.bufferSize * 1024 * 1024;
+    int32_t maxPeerMemPerRank = bufferSize / INPUT_DTYPE / blockCount;
+    constexpr int32_t Four = 4;
+    cocTilingData.pValue = Four;
+    if (cocTilingData.pValue > maxPvalue) {
+        cocTilingData.pValue = maxPvalue;
+    }
+
+    if (m < DEFAULT_ROW) {
+        cocTilingData.pValue = (n + cocTilingData.n0 - 1) / cocTilingData.n0;
+    }
+
+    if (cocTilingData.pValue * cocTilingData.n0 * maxOutputSize > maxPeerMemPerRank) {
+        cocTilingData.pValue = maxPeerMemPerRank / maxOutputSize / cocTilingData.n0;
+    }
+
+    cocTilingData.ubMoveNum = AllTOAll_HIDDEN_UBMOVENUM;
+    constexpr int32_t two = 2;
+    int32_t maxUbPingPongSize = cocTilingData.ubMoveNum / two;
+    if (cocTilingData.pValue * cocTilingData.n0 > maxUbPingPongSize) {
+        cocTilingData.pValue = maxUbPingPongSize / cocTilingData.n0;
+    }
+    return;
+}
+bool CoCMatmulReduceScatterAllToAllHiddenTilingFunc::CheckTiling(const TaskParam &tilingInfo)
+{
+    int32_t rankSize = cocTilingData.rankSize;
+    int32_t ep = tilingInfo.cocParamDesc.moeInfo.EP;
+    int32_t tp = tilingInfo.cocParamDesc.moeInfo.TP;
+    int32_t expertPerRank = tilingInfo.cocParamDesc.moeInfo.local_expert_nums;
+    int32_t n = tilingInfo.cocParamDesc.mmInfo.n;
+    auto maxOutputSize = tilingInfo.cocParamDesc.moeInfo.maxOutputSize;
+
+    auto blockCount = MAX_BLOCK_COUNT;
+    int32_t maxPeerMemPerRank = (tilingInfo.bufferSize * 1024 * 1024) / INPUT_DTYPE / blockCount;
+    if ((cocTilingData.pValue - 1) * cocTilingData.n0 > n) {
+        return false;
+    }
+    if (cocTilingData.pValue * cocTilingData.n0 * maxOutputSize > maxPeerMemPerRank) {
+        std::string str = "The n value is too large and is currently not supported. "
+            "pValue: " + std::to_string(cocTilingData.pValue) + ", n0: " +
+            std::to_string(cocTilingData.n0) + ", maxPeerMemPerRank: " + std::to_string(maxPeerMemPerRank);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+    constexpr int32_t Two = 2;
+    int32_t maxUbPingPongSize = cocTilingData.ubMoveNum / Two;
+    if (cocTilingData.pValue * cocTilingData.n0 > maxUbPingPongSize) {
+        std::string str = "The n value is too large and is currently not supported. "
+            "pValue: " + std::to_string(cocTilingData.pValue) + ", n0: " +
+            std::to_string(cocTilingData.n0) + ", maxUbPingPongSize: " + std::to_string(maxUbPingPongSize);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+
+    if (ep * tp != rankSize) {
+        std::string str = "The ep * tp != rankSize. "
+            "rankSize: " + std::to_string(rankSize) + ", ep: " + std::to_string(ep) +
+            ", tp: " + std::to_string(tp);
+        PrintErrorLog(tilingInfo.lcalType, str);
+        return false;
+    }
+
+    std::vector> paramCheckList = {
+        {"expertPerRank", expertPerRank, 1, 20}
+    };
+    return CheckParamScopeList(paramCheckList);
+}
+}
diff --git a/comm/lcal/src/tiling/reducescatter_tiling.cpp b/comm/lcal/src/tiling/reducescatter_tiling.cpp
new file mode 100644
index 00000000..0afd9204
--- /dev/null
+++ b/comm/lcal/src/tiling/reducescatter_tiling.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include "tiling.h"
+#include "lcoc_func.h"
+#include "tiling_910B.h"
+#include "tiling_91093.h"
+#include "tiling_func.h"
+namespace Lcal {
+void CoCMatmulReduceScatterTilingFunc::GetDefaultTiling(const TaskParam &taskParam)
+{
+    CoCTilingFunc::GetDefaultTiling(taskParam);
+    if (Is91093(taskParam.chipName)) {
+        if (cocTilingData.rankSize == RANKSIZE_EIGHT) {
+            ReduceScatterNPU91093EightRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_SIXTEEN) {
+            ReduceScatterNPU91093SixteenRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_TWO &&
+                   taskParam.cocParamDesc.mmInfo.isInt8) {
+            ReduceScatterNPU91093TwoRankINT8Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_TWO) {
+            ReduceScatterNPU91093TwoRankFP16Tiling(cocTilingData);
+            return;
+        } else if (cocTilingData.rankSize == RANKSIZE_FOUR) {
+            ReduceScatterNPU91093FourRankFP16Tiling(cocTilingData);
+            return;
+        }
+    } else if (Is910B(taskParam.chipName)) {
+        if (cocTilingData.rankSize == RANKSIZE_FOUR) {
+            ReduceScatterFourRankINT8Tiling(cocTilingData); // INT8
+            return;
+        }
+    }
+    ReduceScatterEightRankFP16GetDefaultTiling(cocTilingData);
+}
+
+bool CoCMatmulReduceScatterTilingFunc::CheckTiling(const TaskParam &taskParam)
+{
+    if (!CoCTilingFunc::CheckTiling(taskParam)) {
+        return false;
+    }
+    auto pValue = cocTilingData.pValue;
+    auto rankSize = cocTilingData.rankSize;
+    auto blockDim = cocTilingData.blockDim;
+    if ((pValue * blockDim) % rankSize != 0) {
+        std::string str = "The product of pValue and blockDim must be divisible by rankSize."
+ " pValue: " + std::to_string(pValue) + " blockDim: " + std::to_string(blockDim) + + " rankSize: " + std::to_string(rankSize); + PrintErrorLog(taskParam.lcalType, str); + return false; + } + return true; +} +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/reducescatter_tiling_91093.cpp b/comm/lcal/src/tiling/reducescatter_tiling_91093.cpp new file mode 100644 index 00000000..c30c0c16 --- /dev/null +++ b/comm/lcal/src/tiling/reducescatter_tiling_91093.cpp @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "tiling_91093.h" +#include "tiling_func.h" +namespace Lcal { + constexpr int32_t REDUCESCATTER_91093_EIGHT_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t REDUCESCATTER_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 40; + constexpr int32_t REDUCESCATTER_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t REDUCESCATTER_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT = 40; + constexpr int32_t REDUCESCATTER_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT = 14; + constexpr int32_t REDUCESCATTER_91093_SIXTEEN_RANK_FP16_M0_DEFAULT = 128; + + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT = 16; + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_FP16_PVALUE_DEFAULT = 12; + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_INT8_PVALUE_DEFAULT = 14; + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t REDUCESCATTER_91093_TWO_RANK_INT8_UBMOVENUM_DEFAULT = 16; + constexpr int32_t REDUCESCATTER_91093_FOUR_RANK_FP16_UBMOVENUM_DEFAULT = 20; + constexpr int32_t REDUCESCATTER_91093_FOUR_RANK_FP16_PVALUE_DEFAULT = 12; + constexpr int32_t REDUCESCATTER_91093_FOUR_RANK_FP16_M0_DEFAULT = 128; + + static std::map>> g_reducescatter91093EightRankFP16PvalueMap = { + {4, + {{-1, 6656, -1, 2560, -1, 1536}, {2560, 6656, -1, 3584, 1536, 2560}, + {6656, 7680, 1536, 2560, -1, 3584}, {7680, 2147483647, -1, 2560, 1536, 3584}, + {-1, 2147483647, -1, 2560, 3584, 6656}, {-1, 2147483647, 1536, 2560, 8704, 11264}, + {-1, 3584, 1536, 2560, 11264, 13312}, {4608, 2147483647, 2560, 4608, 4608, 2147483647}, + {4608, 2147483647, 4608, 5632, 6656, 2147483647}, {-1, 2147483647, 5632, 6656, 6656, 2147483647}, + {3584, 2147483647, 6656, 7680, 3584, 2147483647}}}, + {2, + {{-1, 6656, 2560, 4608, -1, 1536}, {-1, 6656, 11264, 2147483647, -1, 1536}, + {-1, 2560, -1, 3584, 1536, 2560}, {-1, 6656, 2560, 4608, 2560, 3584}, + {6656, 8704, 2560, 2147483647, -1, 2560}, {8704, 9728, 2560, 2147483647, -1, 3584}, + {9728, 2147483647, 4608, 2147483647, -1, 2560}, {2560, 4608, 2560, 5632, 3584, 4608}, + {-1, 3584, 2560, 5632, 4608, 2147483647}, {4608, 2147483647, 4608, 5632, 3584, 6656}, + {-1, 3584, 6656, 7680, 3584, 2147483647}, {-1, 2560, 7680, 9728, 3584, 2147483647}}}, + {1, + {{-1, 6656, 4608, 11264, -1, 1536}, {-1, 6656, 3584, 2147483647, 1536, 2560}, + {-1, 4608, 4608, 9728, 2560, 3584}, {4608, 6656, 4608, 2147483647, 2560, 3584}, 
+ {9728, 2147483647, 13312, 2147483647, 2560, 3584}, {-1, 2560, 9728, 2147483647, 3584, 2147483647}}}, + {8, + {{-1, 6656, -1, 1536, 2560, 3584}, {-1, 2147483647, 1536, 2560, 6656, 8704}, + {-1, 4608, -1, 1536, 11264, 13312}, {3584, 2147483647, 1536, 2560, 11264, 13312}, + {7680, 2147483647, 1536, 2560, 13312, 2147483647}}}, + {3, + {{-1, 6656, 1536, 2560, 2560, 3584}, {-1, 4608, 9728, 2147483647, 2560, 3584}, + {6656, 8704, 2560, 2147483647, 2560, 3584}, {9728, 2147483647, 2560, 4608, -1, 2560}, + {9728, 2147483647, 2560, 13312, 2560, 3584}, {-1, 2560, 2560, 5632, 3584, 4608}, + {3584, 4608, 2560, 5632, 4608, 2147483647}, {4608, 2147483647, 2560, 4608, 3584, 4608}, + {-1, 2147483647, 5632, 6656, 3584, 6656}, {2560, 2147483647, 7680, 2147483647, 3584, 2147483647}}}, + {10, + {{6656, 7680, -1, 1536, -1, 3584}, {-1, 2147483647, -1, 1536, 6656, 8704}, + {-1, 4608, -1, 1536, 13312, 2147483647}}}, + {6, + {{7680, 2147483647, -1, 2560, -1, 1536}}}, + {12, + {{-1, 2147483647, -1, 1536, 8704, 11264}, {4608, 2147483647, -1, 1536, 13312, 2147483647}, + {-1, 7680, 1536, 2560, 13312, 2147483647}}}, + {14, + {{4608, 2147483647, -1, 1536, 11264, 13312}}} + }; + + static std::map>> g_reducescatter91093EightRankFP16UbmovenumMap = { + {8.0, + {{-1, 1536, -1, 7168, -1, 1536}, {9728, 2147483647, 4608, 7168, 1536, 2560}}}, + {12.0, + {{-1, 1536, 7168, 2147483647, -1, 1536}, {1536, 6656, -1, 2147483647, -1, 1536}, + {7680, 9728, -1, 2147483647, -1, 1536}, {-1, 9728, 3584, 2147483647, 1536, 2560}, + {-1, 9728, 2560, 2147483647, 2560, 6656}, {9728, 2147483647, -1, 4608, 1536, 2560}, + {9728, 2147483647, -1, 8704, 2560, 5632}, {9728, 2147483647, 2560, 11264, 5632, 6656}, + {-1, 2147483647, 2560, 2147483647, 6656, 13312}, {4608, 2147483647, 2560, 11264, 13312, 2147483647}}}, + {16.0, + {{6656, 7680, -1, 6144, -1, 1536}, {-1, 9728, 1536, 3584, 1536, 2560}, + {-1, 9728, -1, 2560, 3584, 6656}, {9728, 2147483647, 1536, 2147483647, -1, 1536}, + {9728, 2147483647, 7168, 2147483647, 1536, 2560}, {9728, 2147483647, 8704, 2147483647, 2560, 5632}, + {9728, 2147483647, 1536, 2560, 5632, 6656}, {-1, 2147483647, 1536, 2560, 6656, 8704}, + {2048, 2147483647, 1536, 2560, 8704, 9728}, {-1, 2147483647, 1536, 2560, 9728, 2147483647}, + {-1, 4608, 2560, 11264, 13312, 2147483647}, {2560, 2147483647, 11264, 2147483647, 13312, 2147483647}}}, + {10.0, + {{6656, 7680, 6144, 2147483647, -1, 1536}}}, + {20.0, + {{-1, 9728, -1, 1536, 1536, 2560}, {-1, 9728, -1, 2560, 2560, 3584}, + {9728, 2147483647, 11264, 2147483647, 5632, 6656}, {-1, 2560, -1, 1536, 11264, 13312}, + {-1, 2048, 1536, 2560, 8704, 9728}, {-1, 2560, 11264, 2147483647, 13312, 2147483647}}}, + {40.0, + {{9728, 2147483647, -1, 1536, -1, 1536}, {9728, 2147483647, -1, 1536, 5632, 6656}, + {-1, 8704, -1, 1536, 6656, 7680}, {-1, 3584, -1, 1536, 7680, 11264}, + {-1, 4608, -1, 1536, 13312, 2147483647}}}, + {30.0, + {{8704, 2147483647, -1, 1536, 6656, 7680}, {3584, 2147483647, -1, 1536, 7680, 11264}, + {2560, 2147483647, -1, 1536, 11264, 13312}, {4608, 2147483647, -1, 1536, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093EightRankFP16M0Map = { + {128, + {{-1, 1536, -1, 7168, -1, 1536}, {1536, 2560, -1, 9728, -1, 1536}, + {-1, 2560, -1, 2147483647, 1536, 3584}, {2560, 2147483647, -1, 2147483647, -1, 3584}, + {-1, 1536, -1, 2147483647, 3584, 11264}, {1536, 2560, -1, 2147483647, 3584, 5632}, + {2560, 2147483647, -1, 2147483647, 3584, 11264}, {-1, 1536, -1, 2147483647, 11264, 13312}, + {1536, 2147483647, -1, 2560, 11264, 13312}, {-1, 2147483647, -1, 3584, 13312, 
2147483647}, + {-1, 2147483647, 5632, 2147483647, 13312, 2147483647}}}, + {256, + {{-1, 1536, 7168, 2147483647, -1, 1536}, {1536, 2560, 9728, 2147483647, -1, 1536}, + {1536, 2560, -1, 2147483647, 5632, 11264}, {1536, 2147483647, 2560, 2147483647, 11264, 13312}, + {-1, 2147483647, 3584, 5632, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093SixteenRankFP16M0Map = { + {128, + {{-1, 6656, -1, 2147483647, -1, 1536}, {6656, 8704, -1, 3584, -1, 1536}, + {-1, 8704, -1, 2147483647, 1536, 3584}, {8704, 2147483647, -1, 2147483647, -1, 3584}, + {-1, 2147483647, -1, 3584, 3584, 2147483647}, {-1, 2560, 3584, 2147483647, 3584, 2147483647}, + {4608, 2147483647, 3584, 2147483647, 3584, 2147483647}}}, + {256, + {{6656, 8704, 3584, 2147483647, -1, 1536}, {2560, 4608, 3584, 2147483647, 3584, 2147483647}}} + }; + + static std::map>> g_reducescatter91093SixteenRankFP16PvalueMap = { + {8, + {{-1, 1536, -1, 7168, -1, 1536}, {1536, 2560, -1, 5632, -1, 1536}, + {5632, 6656, -1, 2560, 1536, 2560}, {7680, 9728, -1, 2560, -1, 3584}, + {-1, 2147483647, -1, 2560, 3584, 4608}, {2560, 2147483647, -1, 2560, 8704, 9728}}}, + {2, + {{-1, 1536, 7168, 2147483647, -1, 1536}, {1536, 2560, 5632, 2147483647, -1, 1536}, + {2560, 4608, -1, 2560, -1, 1536}, {2560, 6656, 2560, 2147483647, -1, 1536}, + {-1, 5632, -1, 2147483647, 1536, 2560}, {5632, 6656, 2560, 2147483647, 1536, 2560}, + {-1, 6656, 3584, 2147483647, 2560, 3584}, {6656, 7680, 4608, 2147483647, -1, 3584}, + {9728, 2147483647, -1, 3584, -1, 1536}, {7680, 2147483647, 3584, 2147483647, -1, 3584}, + {-1, 6656, 4608, 11264, 3584, 2147483647}, {6656, 2147483647, 9728, 11264, 3584, 2147483647}, + {-1, 2147483647, 11264, 2147483647, 3584, 2147483647}}}, + {4, + {{4608, 6656, -1, 2560, -1, 1536}, {-1, 5632, -1, 3584, 2560, 3584}, + {6656, 7680, -1, 4608, -1, 1536}, {6656, 7680, -1, 4608, 2560, 3584}, + {7680, 9728, 2560, 3584, -1, 3584}, {9728, 2147483647, -1, 3584, 1536, 3584}, + {-1, 2147483647, 2560, 4608, 3584, 7680}, {-1, 7680, 2560, 4608, 7680, 2147483647}, + {7680, 2147483647, 2560, 4608, 7680, 8704}, {6656, 2147483647, 4608, 9728, 3584, 2147483647}}}, + {6, + {{5632, 6656, -1, 3584, 2560, 3584}, {6656, 7680, -1, 4608, 1536, 2560}, + {-1, 2560, -1, 2560, 8704, 9728}, {7680, 2147483647, 2560, 4608, 8704, 2147483647}}}, + {10, + {{-1, 2147483647, -1, 2560, 4608, 8704}, {-1, 2147483647, -1, 2560, 9728, 13312}, + {-1, 2147483647, 1536, 2560, 13312, 2147483647}}}, + {14, + {{-1, 2147483647, -1, 1536, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093SixteenRankFP16UbmovenumMap = { + {16.0, + {{-1, 9728, -1, 2560, -1, 2560}, {9728, 2147483647, -1, 1536, -1, 2560}, + {9728, 2147483647, 9216, 2147483647, -1, 2560}, {9728, 2147483647, 7680, 2147483647, 2560, 5632}, + {-1, 2147483647, 1536, 2560, 5632, 9728}, {-1, 2147483647, 7680, 13312, 5632, 11264}, + {-1, 2147483647, 7680, 2147483647, 13312, 2147483647}}}, + {12.0, + {{-1, 9728, 2560, 2147483647, -1, 3584}, {-1, 9728, -1, 2147483647, 3584, 5632}, + {9728, 2147483647, 1536, 9216, -1, 2560}, {9728, 2147483647, -1, 7680, 2560, 5632}, + {-1, 2147483647, 2560, 7680, 5632, 11264}, {2560, 2147483647, 2560, 2147483647, 11264, 13312}, + {-1, 2147483647, 2560, 7680, 13312, 2147483647}}}, + {30.0, + {{-1, 9728, -1, 2560, 2560, 3584}, {-1, 2147483647, -1, 1536, 5632, 2147483647}}}, + {20.0, + {{-1, 2147483647, 1536, 2560, 9728, 2147483647}, {-1, 2147483647, 13312, 2147483647, 5632, 11264}}}, + {40.0, + {{-1, 2560, 2560, 2147483647, 11264, 13312}}} + }; + + static std::map>> 
g_reducescatter91093TwoRankFP16PvalueMap = { + {3, + {{-1, 9728, -1, 1536, -1, 1536}, {-1, 1536, 5632, 2147483647, 2560, 3584}, + {9728, 2147483647, -1, 2560, 1536, 2560}, {9728, 2147483647, 5120, 11264, 2560, 3584}, + {9728, 2147483647, 13312, 2147483647, 2560, 3584}, {2560, 3584, 2560, 2147483647, 3584, 4608}, + {7168, 2147483647, 2560, 3584, 9728, 2147483647}, {3584, 2147483647, 6656, 2147483647, 9728, 2147483647}}}, + {2, + {{-1, 9728, 1536, 2560, -1, 1536}, {-1, 9728, 2560, 3584, 1536, 2560}, + {1536, 9728, 2560, 3584, 2560, 3584}, {9728, 2147483647, 1536, 3584, -1, 1536}, + {9728, 2147483647, 2560, 9216, 1536, 2560}, {9728, 2147483647, 11264, 2147483647, 1536, 2560}, + {9728, 2147483647, 2560, 5120, 2560, 3584}, {-1, 1536, 1536, 2560, 3584, 7680}, + {3584, 2147483647, 2560, 3584, 3584, 9728}, {3584, 7168, 2560, 3584, 9728, 2147483647}, + {3584, 9728, 3584, 6656, 3584, 2147483647}, {3584, 2147483647, 6656, 2147483647, 3584, 9728}}}, + {1, + {{-1, 9728, 2560, 2147483647, -1, 1536}, {-1, 9728, 3584, 2147483647, 1536, 2560}, + {-1, 1536, 2560, 5632, 2560, 3584}, {1536, 9728, 3584, 2147483647, 2560, 3584}, + {9728, 2147483647, 3584, 2147483647, -1, 1536}, {9728, 2147483647, 9216, 11264, 1536, 2560}, + {9728, 2147483647, 11264, 13312, 2560, 3584}, {-1, 2560, 2560, 2147483647, 3584, 6656}, + {2560, 3584, 2560, 2147483647, 4608, 6656}, {-1, 3584, 2560, 2147483647, 6656, 2147483647}}}, + {4, + {{-1, 5632, -1, 2560, 1536, 2560}, {-1, 2147483647, -1, 2560, 2560, 3584}, + {-1, 3584, -1, 1536, 3584, 7680}, {1536, 6656, 1536, 2560, 3584, 7680}, + {-1, 3584, 1536, 2560, 7680, 2147483647}, {9728, 2147483647, 3584, 6656, 3584, 2147483647}}}, + {6, + {{5632, 9728, -1, 2560, 1536, 2560}, {9728, 2147483647, -1, 1536, -1, 1536}, + {-1, 3584, -1, 1536, 7680, 9728}, {9728, 2147483647, -1, 1536, 3584, 4608}, + {3584, 6656, 1536, 2560, 7680, 2147483647}, {6656, 2147483647, 1536, 2560, 3584, 2147483647}}}, + {8, + {{-1, 3584, -1, 1536, 9728, 2147483647}, {3584, 9728, -1, 1536, 3584, 4608}}}, + {12, + {{3584, 7680, -1, 1536, 4608, 2147483647}}}, + {10, + {{7680, 2147483647, -1, 1536, 4608, 2147483647}}} + }; + + static std::map>> g_reducescatter91093TwoRankFP16UbmovenumMap = { + {6.0, + {{-1, 1536, -1, 8704, -1, 1536}}}, + {8.0, + {{-1, 1536, 8704, 2147483647, -1, 1536}, {-1, 2560, -1, 1536, 9728, 2147483647}}}, + {10.0, + {{1536, 6656, -1, 1536, -1, 1536}, {6656, 7680, -1, 1536, -1, 2560}, + {8704, 2147483647, -1, 1536, -1, 2560}, {2560, 8704, -1, 1536, 2560, 3584}, + {2560, 2147483647, -1, 1536, 5632, 2147483647}}}, + {4.0, + {{1536, 6656, 1536, 2147483647, -1, 1536}, {-1, 8704, 1536, 2147483647, 1536, 2560}, + {-1, 2560, 4608, 2147483647, 2560, 7680}, {-1, 2560, 1536, 2147483647, 7680, 2147483647}, + {8704, 2147483647, -1, 1536, 2560, 3584}, {2560, 2147483647, 13312, 2147483647, 2560, 13312}}}, + {16.0, + {{-1, 3584, -1, 1536, 1536, 2560}}}, + {12.0, + {{3584, 6656, -1, 1536, 1536, 2560}, {7680, 8704, -1, 1536, -1, 2560}, + {-1, 2560, -1, 1536, 2560, 9728}, {2560, 2147483647, -1, 1536, 3584, 5632}}}, + {3.0, + {{6656, 2147483647, 1536, 2147483647, -1, 1536}, {8704, 2147483647, 1536, 2147483647, 1536, 2560}, + {-1, 2560, 1536, 4608, 2560, 7680}, {2560, 2147483647, 1536, 13312, 2560, 13312}, + {2560, 2147483647, 1536, 2147483647, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093TwoRankFP16M0Map = { + {256, + {{-1, 1536, -1, 7168, -1, 1536}, {-1, 1536, 6656, 2147483647, 5632, 2147483647}, + {1536, 3584, 4608, 2147483647, 5632, 2147483647}}}, + {128, + {{-1, 1536, 7168, 2147483647, -1, 1536}, 
{1536, 4608, -1, 2147483647, -1, 1536}, + {-1, 4608, -1, 2147483647, 1536, 5632}, {4608, 2147483647, -1, 2147483647, -1, 5632}, + {-1, 2147483647, -1, 4608, 5632, 2147483647}, {-1, 1536, 4608, 6656, 5632, 2147483647}, + {3584, 2147483647, 4608, 2147483647, 5632, 2147483647}}} + }; + + static std::map>> g_reducescatter91093TwoRankINT8UbmovenumMap = { + {16.0, + {{-1, 4608, -1, 2560, -1, 1536}}}, + {8.0, + {{-1, 4608, 2560, 4608, -1, 1536}, {-1, 4608, 1536, 3584, 3584, 4608}, + {4608, 8704, 1536, 2560, -1, 4608}, {-1, 1536, 1536, 2560, 4608, 2147483647}, + {1536, 4608, -1, 2560, 9728, 11264}}}, + {4.0, + {{-1, 4608, 4608, 2147483647, -1, 1536}, {-1, 4608, -1, 2147483647, 1536, 2560}, + {-1, 4608, 4608, 2147483647, 2560, 3584}, {-1, 2560, 3584, 2147483647, 3584, 4608}, + {3584, 4608, 3584, 2147483647, 3584, 4608}, {4608, 8704, 4608, 7680, -1, 4608}, + {-1, 1536, 3584, 2147483647, 4608, 2147483647}}}, + {3.0, + {{-1, 4608, -1, 4608, 2560, 3584}, {2560, 3584, 3584, 2147483647, 3584, 4608}, + {4608, 8704, 2560, 4608, -1, 4608}, {4608, 8704, 7680, 2147483647, -1, 4608}, + {8704, 2147483647, 2560, 2147483647, -1, 4608}, {-1, 1536, 2560, 3584, 6656, 2147483647}, + {1536, 4608, -1, 2147483647, 4608, 9728}, {1536, 4608, 2560, 2147483647, 9728, 11264}, + {1536, 4608, -1, 2147483647, 11264, 2147483647}, {4608, 2147483647, 2560, 2147483647, 4608, 13312}, + {4608, 9728, 1536, 2147483647, 13312, 2147483647}, {9728, 2147483647, 3072, 2147483647, 13312, 2147483647}}}, + {12.0, + {{-1, 4608, -1, 1536, 3584, 4608}, {4608, 7680, -1, 1536, -1, 4608}, + {-1, 1536, -1, 1536, 4608, 2147483647}}}, + {10.0, + {{7680, 2147483647, -1, 1536, -1, 4608}, {4608, 2147483647, -1, 1536, 4608, 2147483647}}}, + {6.0, + {{8704, 2147483647, 1536, 2560, -1, 4608}, {-1, 1536, 2560, 3584, 4608, 6656}, + {4608, 2147483647, 1536, 2560, 4608, 13312}, {9728, 2147483647, 1536, 3072, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093TwoRankINT8M0Map = { + {128, + {{-1, 1536, -1, 4096, -1, 1536}, {-1, 2560, -1, 2147483647, 1536, 9728}, + {2560, 3584, -1, 3584, 1536, 9728}, {3584, 2147483647, -1, 3584, -1, 9728}, + {3584, 2147483647, 3584, 2147483647, 6656, 9728}, {-1, 2147483647, -1, 2147483647, 9728, 11264}, + {3584, 2147483647, -1, 2147483647, 11264, 13312}, {-1, 3584, -1, 3584, 13312, 2147483647}, + {-1, 1536, 3584, 2147483647, 13312, 2147483647}, {3584, 2147483647, -1, 2147483647, 13312, 2147483647}}}, + {256, + {{-1, 1536, 4096, 2147483647, -1, 1536}, {1536, 2560, -1, 2147483647, -1, 1536}, + {2560, 3584, -1, 3584, -1, 1536}, {2560, 3584, 3584, 2147483647, -1, 9728}, + {3584, 2147483647, 3584, 2147483647, -1, 6656}, {-1, 3584, -1, 2147483647, 11264, 13312}, + {1536, 3584, 3584, 2147483647, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatter91093TwoRankINT8PvalueMap = { + {3, + {{-1, 3584, -1, 1536, -1, 1536}, {-1, 3584, 1536, 2560, 2560, 3584}, + {-1, 1536, 9216, 2147483647, 2560, 3584}, {3584, 4608, -1, 2560, -1, 1536}, + {7680, 8704, 4608, 5632, -1, 3584}, {-1, 1536, 1536, 3584, 3584, 2147483647}, + {2560, 3584, 3584, 4608, 3584, 2147483647}, {7680, 2147483647, 3584, 4608, 3584, 6656}, + {6656, 2147483647, 4608, 9728, 3584, 2147483647}}}, + {4, + {{-1, 3584, 1536, 2560, -1, 1536}, {4608, 2147483647, -1, 2560, -1, 1536}, + {3584, 2147483647, -1, 3584, 1536, 3584}, {-1, 1536, -1, 1536, 5632, 2147483647}, + {1536, 3584, 1536, 3584, 3584, 2147483647}, {3584, 2147483647, 3584, 4608, 6656, 2147483647}}}, + {2, + {{-1, 3584, 2560, 4096, -1, 1536}, {1536, 3584, -1, 4608, 1536, 2560}, + {-1, 3584, 2560, 4608, 
2560, 3584}, {3584, 2147483647, 2560, 4608, -1, 1536}, + {3584, 2147483647, 3584, 4608, 1536, 3584}, {8704, 2147483647, 4608, 5632, -1, 3584}, + {-1, 2560, 3584, 5632, 3584, 2147483647}, {2560, 3584, 4608, 5632, 3584, 2147483647}, + {3584, 7680, 3584, 4608, 3584, 6656}, {3584, 6656, 4608, 2147483647, 6656, 2147483647}, + {6656, 2147483647, 9728, 2147483647, 3584, 2147483647}}}, + {1, + {{-1, 3584, 4096, 2147483647, -1, 1536}, {-1, 1536, -1, 4608, 1536, 2560}, + {-1, 3584, 4608, 2147483647, 1536, 2560}, {-1, 1536, 4608, 9216, 2560, 3584}, + {1536, 3584, 4608, 2147483647, 2560, 3584}, {3584, 7680, 4608, 2147483647, -1, 3584}, + {7680, 2147483647, 5632, 2147483647, -1, 3584}, {-1, 3584, 5632, 2147483647, 3584, 2147483647}, + {3584, 6656, 4608, 2147483647, 3584, 6656}}}, + {14, + {{-1, 3584, -1, 1536, 2560, 3584}, {2560, 2147483647, -1, 1536, 9728, 2147483647}}}, + {8, + {{-1, 1536, -1, 1536, 3584, 5632}, {3584, 7680, 1536, 2560, 3584, 2147483647}, + {3584, 2147483647, 2560, 3584, 11264, 2147483647}}}, + {6, + {{1536, 2560, -1, 1536, 3584, 2147483647}, {3584, 2147483647, 2560, 3584, 3584, 11264}}}, + {10, + {{2560, 7680, -1, 1536, 3584, 7680}}}, + {12, + {{7680, 2147483647, -1, 1536, 3584, 7680}, {2560, 2147483647, -1, 1536, 7680, 9728}, + {7680, 2147483647, 1536, 2560, 3584, 2147483647}}} + }; + + static std::map>> g_reducescatter91093FourRankFP16M0Map = { + {256, + {{-1, 1536, -1, 4096, -1, 1536}, {1536, 6656, 1536, 2147483647, -1, 1536}, + {-1, 5632, 3584, 2147483647, 1536, 2560}, {5632, 6656, 2560, 2147483647, 1536, 2560}, + {6656, 7680, -1, 7680, -1, 2560}, {7680, 2147483647, 3584, 7680, -1, 2560}, + {6656, 7680, 7680, 8704, -1, 2560}, {-1, 4608, 4608, 2147483647, 2560, 11264}, + {-1, 4608, 3584, 2147483647, 11264, 2147483647}, {4608, 6656, 3584, 9728, 2560, 2147483647}, + {9728, 2147483647, -1, 1536, 5120, 2147483647}}}, + {128, + {{-1, 1536, 4096, 2147483647, -1, 1536}, {1536, 6656, -1, 1536, -1, 1536}, + {-1, 5632, -1, 3584, 1536, 2560}, {5632, 6656, -1, 2560, 1536, 2560}, + {7680, 2147483647, -1, 3584, -1, 2560}, {7680, 2147483647, 7680, 8704, -1, 2560}, + {6656, 2147483647, 8704, 2147483647, -1, 2560}, {-1, 4608, -1, 4608, 2560, 11264}, + {-1, 4608, -1, 3584, 11264, 2147483647}, {4608, 6656, -1, 3584, 2560, 2147483647}, + {4608, 6656, 9728, 2147483647, 2560, 2147483647}, {6656, 9728, -1, 2147483647, 2560, 2147483647}, + {9728, 2147483647, -1, 1536, 2560, 5120}, {9728, 2147483647, 1536, 2147483647, 2560, 2147483647}}} + }; + + static std::map>> g_reducescatter91093FourRankFP16PvalueMap = { + {4, + {{-1, 2560, -1, 1536, -1, 1536}, {-1, 3584, -1, 1536, 1536, 2560}, + {3584, 4608, -1, 1536, -1, 3584}, {9728, 2147483647, -1, 2560, -1, 3584}, + {-1, 3584, -1, 1536, 3584, 7680}, {4608, 2147483647, 1536, 2560, 3584, 2147483647}}}, + {2, + {{2560, 3584, -1, 1536, -1, 1536}, {-1, 3584, 1536, 2560, 1536, 2560}, + {1536, 3584, 1536, 3584, 2560, 3584}, {3584, 4608, 1536, 3584, -1, 3584}, + {4608, 8704, -1, 3584, -1, 3584}, {9728, 2147483647, 2560, 3584, -1, 3584}, + {3584, 2147483647, 3584, 4608, 1536, 3584}, {-1, 1536, 1536, 2560, 3584, 2147483647}, + {-1, 2147483647, 2560, 4608, 3584, 9728}, {-1, 7680, 2560, 4608, 9728, 2147483647}, + {7680, 2147483647, 3584, 4608, 9728, 2147483647}, {3584, 8704, 4608, 9728, 3584, 2147483647}, + {9728, 2147483647, 4608, 8704, 3584, 2147483647}, {9728, 2147483647, 9728, 2147483647, 3584, 2147483647}}}, + {1, + {{-1, 3584, 1536, 2147483647, -1, 1536}, {-1, 3584, 2560, 2147483647, 1536, 2560}, + {-1, 1536, 1536, 2147483647, 2560, 3584}, {1536, 3584, 3584, 
2147483647, 2560, 3584}, + {3584, 2147483647, 3584, 2147483647, -1, 1536}, {3584, 2147483647, 4608, 2147483647, 1536, 3584}, + {-1, 3584, 4608, 2147483647, 3584, 2147483647}, {8704, 9728, 4608, 9728, 3584, 2147483647}, + {9728, 2147483647, 8704, 9728, 3584, 2147483647}, {3584, 9728, 9728, 2147483647, 3584, 2147483647}}}, + {12, + {{-1, 3584, -1, 1536, 2560, 3584}}}, + {8, + {{8704, 9728, -1, 3584, -1, 3584}, {-1, 3584, -1, 1536, 7680, 2147483647}}}, + {10, + {{3584, 2147483647, -1, 1536, 3584, 2147483647}}}, + {3, + {{1536, 4608, 1536, 2560, 3584, 2147483647}, {7680, 2147483647, 2560, 3584, 9728, 2147483647}}} + }; + + static std::map>> g_reducescatter91093FourRankFP16UbmovenumMap = { + {12.0, + {{-1, 1536, -1, 4096, -1, 1536}, {1536, 2560, 2560, 4608, 1536, 2560}, + {2560, 3584, 3072, 7680, -1, 2560}, {2560, 3584, 9216, 2147483647, -1, 1536}, + {3584, 2147483647, 2560, 4608, -1, 2560}, {-1, 2560, 2560, 3584, 2560, 3584}, + {-1, 2560, 3584, 5120, 2560, 4608}, {-1, 1536, 5120, 2147483647, 2560, 4608}, + {2560, 2147483647, 1536, 2560, 2560, 7680}, {-1, 2147483647, 1536, 2560, 7680, 2147483647}}}, + {8.0, + {{-1, 1536, 4096, 2147483647, -1, 1536}, {-1, 2560, 4608, 7680, 1536, 2560}, + {-1, 2560, 8704, 2147483647, 1536, 2560}, {2560, 3584, 7680, 2147483647, 1536, 2560}, + {3584, 2147483647, 4608, 2147483647, -1, 2560}, {2560, 2147483647, 2560, 4608, 2560, 4608}, + {-1, 2560, 2560, 2147483647, 4608, 2147483647}, {2560, 2147483647, 2560, 4608, 4608, 2147483647}}}, + {16.0, + {{1536, 2560, -1, 2147483647, -1, 1536}, {-1, 1536, -1, 4608, 1536, 2560}, + {1536, 2560, -1, 2560, 1536, 2560}, {-1, 2560, 7680, 8704, 1536, 2560}, + {2560, 3584, -1, 3072, -1, 1536}, {3584, 2147483647, 1536, 2560, -1, 2560}, + {-1, 2560, -1, 2560, 2560, 3584}, {-1, 2560, -1, 3584, 3584, 4608}, + {-1, 2560, 1536, 2560, 4608, 7680}}}, + {20.0, + {{2560, 3584, -1, 3072, 1536, 2560}, {3584, 2147483647, -1, 1536, -1, 2560}, + {2560, 2147483647, -1, 1536, 2560, 4608}, {-1, 2147483647, -1, 1536, 4608, 2147483647}}}, + {10.0, + {{2560, 3584, 7680, 9216, -1, 1536}}}, + {4.0, + {{1536, 2560, 5120, 2147483647, 2560, 4608}, {2560, 2147483647, 13312, 2147483647, 3584, 4608}}}, + {6.0, + {{2560, 2147483647, 4608, 2147483647, 2560, 3584}, {2560, 2147483647, 4608, 13312, 3584, 4608}, + {2560, 2147483647, 4608, 2147483647, 4608, 2147483647}}} + }; + + void ReduceScatterNPU91093EightRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.pValue, + {REDUCESCATTER_91093_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_reducescatter91093EightRankFP16PvalueMap}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_91093_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_reducescatter91093EightRankFP16UbmovenumMap}}, + {&cocTilingData.m0, + {REDUCESCATTER_91093_EIGHT_RANK_FP16_M0_DEFAULT, + g_reducescatter91093EightRankFP16M0Map}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } + + void ReduceScatterNPU91093SixteenRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {REDUCESCATTER_91093_SIXTEEN_RANK_FP16_M0_DEFAULT, + g_reducescatter91093SixteenRankFP16M0Map}}, + {&cocTilingData.pValue, + {REDUCESCATTER_91093_SIXTEEN_RANK_FP16_PVALUE_DEFAULT, + 
g_reducescatter91093SixteenRankFP16PvalueMap}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_91093_SIXTEEN_RANK_FP16_UBMOVENUM_DEFAULT, + g_reducescatter91093SixteenRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } + + void ReduceScatterNPU91093TwoRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.pValue, + {REDUCESCATTER_91093_TWO_RANK_FP16_PVALUE_DEFAULT, + g_reducescatter91093TwoRankFP16PvalueMap}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_91093_TWO_RANK_FP16_UBMOVENUM_DEFAULT, + g_reducescatter91093TwoRankFP16UbmovenumMap}}, + {&cocTilingData.m0, + {REDUCESCATTER_91093_TWO_RANK_FP16_M0_DEFAULT, + g_reducescatter91093TwoRankFP16M0Map}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } + + void ReduceScatterNPU91093TwoRankINT8Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_91093_TWO_RANK_INT8_UBMOVENUM_DEFAULT, + g_reducescatter91093TwoRankINT8UbmovenumMap}}, + {&cocTilingData.m0, + {REDUCESCATTER_91093_TWO_RANK_INT8_M0_DEFAULT, + g_reducescatter91093TwoRankINT8M0Map}}, + {&cocTilingData.pValue, + {REDUCESCATTER_91093_TWO_RANK_INT8_PVALUE_DEFAULT, + g_reducescatter91093TwoRankINT8PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } + + void ReduceScatterNPU91093FourRankFP16Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {REDUCESCATTER_91093_FOUR_RANK_FP16_M0_DEFAULT, + g_reducescatter91093FourRankFP16M0Map}}, + {&cocTilingData.pValue, + {REDUCESCATTER_91093_FOUR_RANK_FP16_PVALUE_DEFAULT, + g_reducescatter91093FourRankFP16PvalueMap}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_91093_FOUR_RANK_FP16_UBMOVENUM_DEFAULT, + g_reducescatter91093FourRankFP16UbmovenumMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } +} diff --git a/comm/lcal/src/tiling/reducescatter_tiling_910B.cpp b/comm/lcal/src/tiling/reducescatter_tiling_910B.cpp new file mode 100644 index 00000000..f24cb898 --- /dev/null +++ b/comm/lcal/src/tiling/reducescatter_tiling_910B.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "tiling_910B.h" +#include "tiling_func.h" +namespace Lcal { + constexpr int32_t REDUCESCATTER_FOUR_RANK_INT8_PVALUE_DEFAULT = 14; + constexpr int32_t REDUCESCATTER_FOUR_RANK_INT8_UBMOVENUM_DEFAULT = 8; + constexpr int32_t REDUCESCATTER_FOUR_RANK_INT8_M0_DEFAULT = 128; + constexpr int32_t REDUCESCATTER_EIGHT_RANK_FP16_M0_DEFAULT = 128; + constexpr int32_t REDUCESCATTER_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT = 20; + constexpr int32_t REDUCESCATTER_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT = 16; + constexpr int32_t REDUCESCATTER_EIGHT_RANK_FP16_PVALUE_DEFAULT = 12; + + static std::map>> g_reducescatterFourRankINT8M0Map = { + {128, + {{-1, 2560, -1, 7680, -1, 1536}, {-1, 1536, 7680, 2147483647, -1, 1536}, + {1536, 2560, 8704, 2147483647, -1, 1536}, {3584, 2147483647, -1, 4608, -1, 1536}, + {8704, 2147483647, 4608, 5632, -1, 1536}, {2560, 3584, 5632, 2147483647, -1, 1536}, + {-1, 2147483647, -1, 2147483647, 1536, 2147483647}}}, + {256, + {{1536, 2560, 7680, 8704, -1, 1536}, {2560, 3584, -1, 4608, -1, 1536}, + {2560, 8704, 4608, 5632, -1, 1536}, {3584, 2147483647, 5632, 2147483647, -1, 1536}}} + }; + + static std::map>> g_reducescatterFourRankINT8UbmovenumMap = { + {8.0, + {{-1, 1536, -1, 7168, -1, 1536}, {-1, 1536, -1, 2560, 1536, 3584}, + {1536, 2147483647, -1, 1536, -1, 1536}, {1536, 2560, 1536, 4608, -1, 1536}}}, + {6.0, + {{-1, 1536, 7168, 2147483647, -1, 1536}, {-1, 1536, 2560, 2147483647, 1536, 3584}, + {-1, 1536, -1, 4608, 3584, 13312}, {1536, 2147483647, -1, 1536, 1536, 13312}, + {1536, 2560, 1536, 4608, 1536, 13312}, {2560, 2147483647, 1536, 4608, -1, 13312}, + {1536, 2560, 4608, 5632, -1, 6144}, {-1, 2147483647, -1, 4608, 13312, 2147483647}, + {5632, 6656, 9728, 2147483647, 13312, 2147483647}}}, + {4.0, + {{-1, 1536, 4608, 2147483647, 3584, 13312}, {1536, 2560, 4608, 5632, 6144, 13312}, + {2560, 2147483647, 4608, 5632, -1, 13312}, {1536, 2147483647, 5632, 2147483647, -1, 13312}, + {-1, 5632, 4608, 2147483647, 13312, 2147483647}, {5632, 2147483647, 4608, 9728, 13312, 2147483647}, + {6656, 2147483647, 9728, 2147483647, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatterFourRankINT8PvalueMap = { + {12, + {{-1, 1536, -1, 4096, -1, 1536}, {5632, 2147483647, -1, 2560, 3584, 5632}}}, + {1, + {{-1, 3584, 4096, 2147483647, -1, 1536}, {-1, 3584, 6656, 2147483647, 1536, 3584}, + {4608, 7680, 7680, 2147483647, -1, 3584}, {9728, 2147483647, 8192, 2147483647, -1, 1536}, + {-1, 1536, 6656, 9728, 3584, 2147483647}, {-1, 1536, 9728, 2147483647, 9728, 2147483647}, + {1536, 2560, 7680, 2147483647, 3584, 11264}}}, + {2, + {{1536, 3584, -1, 4096, -1, 1536}, {-1, 3584, -1, 6656, 1536, 3584}, + {3584, 4608, -1, 2147483647, -1, 2560}, {4608, 7680, 4608, 7680, -1, 3584}, + {7680, 9728, -1, 2147483647, -1, 1536}, {9728, 2147483647, -1, 8192, -1, 1536}, + {-1, 1536, 4608, 6656, 3584, 2147483647}, {-1, 1536, 9728, 2147483647, 3584, 9728}, + {1536, 2560, 5632, 7680, 3584, 2147483647}, {1536, 2560, 7680, 2147483647, 11264, 2147483647}}}, + {4, + {{3584, 
4608, -1, 6144, 2560, 3584}, {4608, 7680, 1536, 4608, -1, 3584}, + {-1, 1536, 1536, 4608, 3584, 2147483647}, {1536, 2560, -1, 4608, 4608, 7680}, + {5632, 6656, 4608, 5632, 3584, 2147483647}, {6656, 8704, 4608, 2147483647, 6656, 2147483647}}}, + {3, + {{3584, 4608, 6144, 2147483647, 2560, 3584}, {7680, 8704, 4608, 2147483647, 1536, 3584}, + {8704, 2147483647, 5632, 2147483647, 1536, 3584}, {1536, 2560, -1, 4608, 3584, 4608}, + {1536, 2560, 4608, 5632, 3584, 2147483647}, {2560, 5632, 4608, 2147483647, 3584, 2147483647}, + {5632, 6656, 5632, 2147483647, 3584, 2147483647}, {6656, 8704, 4608, 2147483647, 3584, 6656}, + {8704, 2147483647, 4608, 2147483647, 3584, 2147483647}}}, + {8, + {{4608, 7680, -1, 1536, -1, 3584}, {2560, 5632, -1, 2560, 3584, 7680}, + {4608, 5632, 2560, 4608, 3584, 2147483647}, {5632, 2147483647, 2560, 4608, 3584, 9728}}}, + {6, + {{7680, 8704, -1, 4608, 1536, 3584}, {8704, 2147483647, -1, 5632, 1536, 3584}, + {-1, 1536, -1, 1536, 3584, 2147483647}, {1536, 2560, 1536, 4608, 7680, 2147483647}, + {2560, 4608, 2560, 4608, 3584, 2147483647}}}, + {10, + {{1536, 2560, -1, 1536, 7680, 2147483647}}}, + {14, + {{2560, 5632, -1, 2560, 7680, 2147483647}, {5632, 2147483647, -1, 2560, 5632, 2147483647}, + {5632, 2147483647, 2560, 4608, 9728, 2147483647}}} + }; + + static std::map>> g_reducescatterEightRankFP16PvalueMap = { + {2, + {{-1, 1536, -1, 2147483647, -1, 1536}, {1536, 5632, 1536, 2147483647, -1, 1536}, + {-1, 1536, -1, 2147483647, 1536, 2560}, {1536, 5632, 1536, 2147483647, 1536, 2560}, + {5632, 6656, 1536, 2560, -1, 1536}, {5632, 2147483647, 2560, 2147483647, -1, 2560}, + {-1, 4608, 1536, 2560, 2560, 4608}, {-1, 2147483647, 2560, 2147483647, 2560, 2147483647}}}, + {4, + {{1536, 6656, -1, 1536, -1, 2560}, {5632, 6656, 1536, 2560, 1536, 2560}, + {6656, 2147483647, 1536, 2560, -1, 2560}, {-1, 4608, -1, 1536, 2560, 5632}, + {-1, 4608, 1536, 2560, 4608, 5632}, {4608, 8704, -1, 2560, 2560, 3584}, + {8704, 2147483647, 1536, 2560, 2560, 5632}, {-1, 2560, -1, 2560, 5632, 2147483647}, + {2560, 2147483647, 1536, 2560, 5632, 2147483647}}}, + {6, + {{6656, 8704, -1, 1536, -1, 2560}}}, + {8, + {{8704, 2147483647, -1, 1536, -1, 2560}, {4608, 8704, -1, 2560, 3584, 5632}, + {2560, 6656, -1, 1536, 5632, 2147483647}}}, + {10, + {{8704, 2147483647, -1, 1536, 2560, 5632}}}, + {12, + {{6656, 2147483647, -1, 1536, 5632, 2147483647}}} + }; + + static std::map>> g_reducescatterEightRankFP16CommdatasplitMap = { + {16, + {{-1, 9728, -1, 2147483647, -1, 1536}, {9728, 2147483647, -1, 9728, -1, 1536}, + {-1, 2147483647, -1, 2147483647, 1536, 2147483647}}}, + {8, + {{9728, 2147483647, 9728, 2147483647, -1, 1536}}} + }; + + static std::map>> g_reducescatterEightRankFP16UbmovenumMap = { + {8.0, + {{-1, 1536, -1, 4096, -1, 1536}, {-1, 1536, 7168, 8704, -1, 1536}, + {1536, 2560, -1, 7680, -1, 1536}, {-1, 2560, 8704, 2147483647, -1, 1536}, + {2560, 2147483647, -1, 1536, -1, 1536}, {3584, 2147483647, 7680, 8704, -1, 1536}, + {6144, 2147483647, 8704, 9728, -1, 1536}, {2560, 3584, 9728, 2147483647, -1, 1536}, + {-1, 1536, -1, 3584, 1536, 2560}, {-1, 1536, -1, 5120, 5632, 7680}, + {1536, 2560, -1, 1536, 1536, 2147483647}, {1536, 2560, 9728, 2147483647, 11264, 2147483647}}}, + {10.0, + {{-1, 1536, 4096, 7168, -1, 1536}, {1536, 2560, 7680, 8704, -1, 1536}, + {2560, 2147483647, 1536, 7680, -1, 1536}, {2560, 3584, 7680, 8704, -1, 1536}, + {2560, 6144, 8704, 9728, -1, 1536}, {3584, 9728, 9728, 2147483647, -1, 1536}, + {-1, 1536, -1, 3584, 2560, 5632}, {-1, 1536, 3584, 2147483647, 1536, 5632}, + {-1, 1536, -1, 5120, 7680, 
13312}, {-1, 1536, 5120, 2147483647, 5632, 13312}, + {-1, 1536, -1, 5120, 13312, 2147483647}, {-1, 1536, 7680, 2147483647, 13312, 2147483647}, + {2560, 2147483647, -1, 1536, 1536, 2147483647}, {1536, 2147483647, 1536, 9728, 1536, 2147483647}, + {1536, 2147483647, 9728, 2147483647, 1536, 11264}, {2560, 2147483647, 9728, 2147483647, 11264, 2147483647}}}, + {20.0, + {{9728, 2147483647, 9728, 2147483647, -1, 1536}}}, + {6.0, + {{-1, 1536, 5120, 7680, 13312, 2147483647}}} + }; + + static std::map>> g_reducescatterEightRankFP16M0Map = { + {128, + {{-1, 5632, -1, 2147483647, -1, 1536}, {5632, 8704, -1, 3584, -1, 1536}, + {-1, 8704, -1, 2147483647, 1536, 7680}, {8704, 2147483647, -1, 2147483647, -1, 7680}, + {-1, 2147483647, -1, 3584, 7680, 2147483647}, {-1, 1536, 3584, 2147483647, 7680, 2147483647}, + {2560, 2147483647, 3584, 2147483647, 7680, 2147483647}}}, + {256, + {{5632, 8704, 3584, 2147483647, -1, 1536}, {1536, 2560, 3584, 2147483647, 7680, 2147483647}}} + }; + + void ReduceScatterFourRankINT8Tiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.m0, + {REDUCESCATTER_FOUR_RANK_INT8_M0_DEFAULT, + g_reducescatterFourRankINT8M0Map}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_FOUR_RANK_INT8_UBMOVENUM_DEFAULT, + g_reducescatterFourRankINT8UbmovenumMap}}, + {&cocTilingData.pValue, + {REDUCESCATTER_FOUR_RANK_INT8_PVALUE_DEFAULT, + g_reducescatterFourRankINT8PvalueMap}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {DEFAULT_SWIZZLE_COUNT}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + {&cocTilingData.commDataSplit, {COMMDATASPLIT_SIXTEEN}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } + + void ReduceScatterEightRankFP16GetDefaultTiling(CoCTilingData &cocTilingData) + { + std::map TilingParamMap = { + {&cocTilingData.pValue, + {REDUCESCATTER_EIGHT_RANK_FP16_PVALUE_DEFAULT, + g_reducescatterEightRankFP16PvalueMap}}, + {&cocTilingData.commDataSplit, + {REDUCESCATTER_EIGHT_RANK_FP16_COMMDATASPLIT_DEFAULT, + g_reducescatterEightRankFP16CommdatasplitMap}}, + {&cocTilingData.ubMoveNum, + {REDUCESCATTER_EIGHT_RANK_FP16_UBMOVENUM_DEFAULT, + g_reducescatterEightRankFP16UbmovenumMap}}, + {&cocTilingData.m0, + {REDUCESCATTER_EIGHT_RANK_FP16_M0_DEFAULT, + g_reducescatterEightRankFP16M0Map}}, + {&cocTilingData.swizzlDirect, {SWIZZLE_DIRECT_ONE}}, + {&cocTilingData.swizzlCount, {SWIZZLE_COUNT_FOUR}}, + {&cocTilingData.commDirect, {COMM_DATA_DIRECT}}, + {&cocTilingData.commNpuSplit, {COMMNPUSPLIT_ONE}}, + }; + SetTilingParam(cocTilingData, TilingParamMap); + + cocTilingData.lenPerLoop = cocTilingData.ubMoveNum; + } +} \ No newline at end of file diff --git a/comm/lcal/src/tiling/tiling.cpp b/comm/lcal/src/tiling/tiling.cpp index 9e439491..7c9c7d14 100644 --- a/comm/lcal/src/tiling/tiling.cpp +++ b/comm/lcal/src/tiling/tiling.cpp @@ -10,7 +10,6 @@ #include "tiling_func.h" #include "mki/utils/log/log.h" #include "tiling.h" - namespace Lcal { CoCTilingData CoCTilingFunc::GenerateTiling(const TaskParam &taskParam, const CoCTiling &tiling) { @@ -20,6 +19,7 @@ CoCTilingData CoCTilingFunc::GenerateTiling(const TaskParam &taskParam, const Co this->GetDefaultTiling(taskParam); + // 设置Tiling策略参数 SetTilingData(taskParam, tiling, cocTilingData); return cocTilingData; @@ -32,6 +32,7 @@ bool CoCTilingFunc::CheckTiling(const TaskParam &taskParam) void CoCTilingFunc::GetDefaultTiling(const TaskParam &taskParam) { + // 
No usage scenario at the moment
     cocTilingData.ubMoveNum = VALID_UB_MOVE_NUM;
     cocTilingData.commNpuSplit = cocTilingData.rankSize;
     cocTilingData.commDataSplit = COMMDATASPLIT_ONE;
diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/comm/lcal/src/tiling/tiling_args.cpp
index 5b02b0da..abe4dde6 100644
--- a/comm/lcal/src/tiling/tiling_args.cpp
+++ b/comm/lcal/src/tiling/tiling_args.cpp
@@ -60,4 +60,4 @@ namespace Lcal {
         is91093 = false;
         tag = 0;
     }
-}
+}
\ No newline at end of file
diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/comm/lcal/src/tiling/tiling_func.cpp
index 593736a0..d99cb3f2 100644
--- a/comm/lcal/src/tiling/tiling_func.cpp
+++ b/comm/lcal/src/tiling/tiling_func.cpp
@@ -39,6 +39,7 @@ namespace Lcal {
     double GetMTETime(double mknGB, int32_t m0, int32_t n0, double aBindWidth, double bBindWidth)
     {
+        // Estimate the MTE2 data-move time of the matmul computation
         return DOUBLE * mknGB * (SECOND_TO_MS / ONE_K) * (1.0 / (n0 * aBindWidth) + 1.0 / (m0 * bBindWidth));
     }

@@ -73,12 +74,12 @@
     uint32_t GetTilingKey(const MatMulInfo &mmInfo, CoCTilingData &tilingData)
     {
-        uint32_t tilingKey = static_cast<uint32_t>(tilingData.swizzlDirect);
-        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.transA);
-        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.transB);
-        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.isInt8);
-        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.withBias);
-        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(tilingData.splitK);
+        uint32_t tilingKey = static_cast<uint32_t>(tilingData.swizzlDirect); // 32
+        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.transA); // 16
+        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.transB); // 8
+        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.isInt8); // 4
+        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(mmInfo.withBias); // 2
+        tilingKey = (static_cast<uint32_t>(tilingKey) << 1) + static_cast<uint32_t>(tilingData.splitK); // 1
         return tilingKey;
     }

@@ -90,7 +91,7 @@
         int maxPValue = maxPeerMemPerRank / cocTilingData.m0 / cocTilingData.k0 / cocTilingData.kLoop;
         cocTilingData.pValue = ClampValue(cocTilingData.pValue, MIN_P_VALUE, maxPValue);

-        if (cocTilingData.m0 == DEFAULT_COL
+        if (cocTilingData.m0 == DEFAULT_COL &&
             cocTilingData.pValue * cocTilingData.m0 * cocTilingData.k0 * cocTilingData.kLoop >= maxPeerMemPerRank) {
             cocTilingData.m0 = DEFAULT_ROW;
             cocTilingData.n0 = DEFAULT_COL;
@@ -252,12 +253,13 @@
     void CalTilingParam(const MatMulInfo &mmInfo, CoCTilingData &tilingData)
     {
+        // Compute the loop counts
         tilingData.mLoop = CeilDev(tilingData.m, tilingData.m0);
         tilingData.kLoop = CeilDev(tilingData.k, tilingData.k0);
         tilingData.nLoop = CeilDev(tilingData.n, tilingData.n0);
         tilingData.coreLoop = tilingData.batchSize * tilingData.mLoop * tilingData.nLoop;
         tilingData.tilingKey = GetTilingKey(mmInfo, tilingData);
-
+        // Align the move sizes
         tilingData.ubMoveNum = RoundNum(tilingData.ubMoveNum, HALF_KBYTE);
         tilingData.lenPerLoop = RoundNum(tilingData.lenPerLoop, HALF_KBYTE);
         tilingData.extraUbMoveNum = RoundNum(tilingData.extraUbMoveNum, HALF_KBYTE);
@@ -278,7 +280,9 @@
     void SetTilingData(const TaskParam &taskParam, const CoCTiling &tiling, CoCTilingData &tilingData)
     {
+        // Assign the input tiling to the tiling-strategy parameters
         TransformCoCTiling(tiling, tilingData);
+        // Compute mLoop and the other derived parameters from the final tiling-strategy parameters
         CalTilingParam(taskParam.cocParamDesc.mmInfo, tilingData);
     }
 }
\ No newline at end of file
--
Gitee

From d5f5740852d44f14aef10ece37ea2f8a89fc4bac Mon Sep 17 00:00:00 2001
From: He Changcheng
Date: Mon, 1 Sep 2025 19:59:20 +0800
Subject: [PATCH 406/414] clean code --- comm/lcal/src/lcal_comm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/lcal_comm.cpp b/comm/lcal/src/lcal_comm.cpp index 9cc7b8d0..b54380a0 100644 --- a/comm/lcal/src/lcal_comm.cpp +++ b/comm/lcal/src/lcal_comm.cpp @@ -131,8 +131,8 @@ bool SkipUnusedChannel910B2C(int curRank, int peerRank, ChipName chipName) { if (chipName == ChipName::CHIP_910B2C) { constexpr int rankSizePerNode = 8; - if ((curRank / rankSizePerNode != peerRank / rankSizePerNode) - && (std::abs(curRank - peerRank) != rankSizePerNode)) { + if ((curRank / rankSizePerNode != peerRank / rankSizePerNode) && + (std::abs(curRank - peerRank) != rankSizePerNode)) { return true; } } -- Gitee From a382d2aee42ce966593435899b01b38dda4f6b4d Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 11:33:54 +0800 Subject: [PATCH 407/414] add --- .../kernels/coc_alltoall_allgather_hidden.cce | 600 +++++++++ .../coc_alltoall_reduce_scatter_hidden.cce | 350 +++++ .../src/kernels/coc_alltoallv_allgather.cce | 642 +++++++++ .../coc_alltoallv_allgather_matmul.cce | 44 + .../coc_matmul_reduce_scatter_alltoallv.cce | 28 + comm/lcal/src/kernels/coc_matmulmoe.cce | 1171 +++++++++++++++++ 6 files changed, 2835 insertions(+) create mode 100644 comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce create mode 100644 comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce create mode 100644 comm/lcal/src/kernels/coc_alltoallv_allgather.cce create mode 100644 comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce create mode 100644 comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce create mode 100644 comm/lcal/src/kernels/coc_matmulmoe.cce diff --git a/comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce b/comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce new file mode 100644 index 00000000..dcfdeeb1 --- /dev/null +++ b/comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce @@ -0,0 +1,600 @@ +#ifdef __DAV_C220_VEC__ +#include + +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class AllToAllvAllGatherHiddenSplit: public CocCommBase{ +public: + FORCE_INLINE_AICORE AllToAllvAllGatherHiddenSplit(){}; + FORCE_INLINE_AICORE void SetArgs(COC_ARGS_FUN(T)){ + CocCommBase::SetArgs(COC_ARGS_CALL()); + preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL()); + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); + this->gm_out = aligned_a ? 
reinterpret_cast<__gm__ T *>(workspace_info.gm_a_align) : gm_a; + this -> expert_nums = local_expert_nums * EP; + comm_k = p_value * k0; + comm_count = DivCeil(k, comm_k);//28 + this->gm_quant_scale = reinterpret_cast<__gm__ float32_t *>(gm_quant_scale); + int32_t output_num = m; + if (!is_moe_averaged) { + output_num = 0; + this -> global_tokens_per_expert_matrix = global_tokens_per_expert_matrix; + for (int32_t i = 0 ; i < EP; i++) { + for (int32_t j = 0; j < local_expert_nums; j++) { + output_num += this->global_tokens_per_expert_matrix[i * expert_nums + j + rank * local_expert_nums]; + } + } + } + if (maxOutputSize > 0 && output_num >= maxOutputSize) { + output_num = maxOutputSize; + } + if(dequant_granularity == QuantGranularity::PER_TOKEN) { + serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ MatType *>(gm_out), reinterpret_cast<__gm__ float32_t*>(workspace_info.gm_dequant_param), output_num, n, m0, n0); + } + } + + + inline __attribute__((always_inline)) __aicore__ void ScaleAllToAll(){ + int32_t usable_buff = 200 * 1024 * 1024 / 4 / 2; + int32_t max_move_num = usable_buff / rank_size; + int32_t scale_pingpang_size = usable_buff; + + int32_t cal_count = 0; + if(is_moe_averaged) { + cal_count = DivCeil(m / EP, max_move_num); + } else { + for(int32_t ep_idx = 0; ep_idx < EP; ep_idx ++) { + int32_t in_num = 0; + int32_t out_num = 0; + for(int32_t j = 0; j < local_expert_nums; j++) { + out_num += global_tokens_per_expert_matrix[rank * expert_nums + j + ep_idx * local_expert_nums]; + } + for(int32_t j = 0; j < local_expert_nums; j++) { + in_num += global_tokens_per_expert_matrix[ep_idx * expert_nums + j + rank * local_expert_nums];// + } + cal_count = max(cal_count, max(in_num, out_num)); + } + cal_count = DivCeil(cal_count, max_move_num); + } + + PipeBarrier(); + + int64_t sum_out = 0, sum_in = 0; + int32_t received_loop_number = 0; + int32_t ep_idx = real_core_idx; + + int32_t out_num, in_num; + if(is_moe_averaged) { + out_num = m / EP; + in_num = m / EP; + } else { + out_num = 0; + in_num = 0; + for(int32_t j = 0; j < local_expert_nums; j++) { + out_num += global_tokens_per_expert_matrix[rank * expert_nums + j + real_core_idx * local_expert_nums]; + } + for(int32_t j = 0; j < local_expert_nums; j ++) { + in_num += global_tokens_per_expert_matrix[real_core_idx * expert_nums + rank * local_expert_nums + j]; + } + } + + max_ub_ping_pong_size = max_ub_ping_pong_size / 2; // + int32_t receive_expert_id = 0; + int32_t receive_expert_token_nums; + int32_t last_ep_local = 0; + if (is_moe_averaged) { + receive_expert_token_nums = m / EP / local_expert_nums; + last_ep_local = (m / EP) * real_core_idx; + } else { + receive_expert_token_nums = global_tokens_per_expert_matrix[real_core_idx * expert_nums + rank * local_expert_nums]; + for(int32_t i = 0; i < real_core_idx * local_expert_nums; i++) { + last_ep_local += global_tokens_per_expert_matrix[rank * expert_nums + i]; + } + } + + + + for(int32_t cal_idx = 0; cal_idx < cal_count; cal_idx ++) { + int32_t flag_idx = cal_idx % MAX_BLOCK_COUNT; + + SetAndWaitAivSync(flag_idx, gm_a_pingpong_num); + int32_t received_rank_num = 0; + if (is_moe_averaged){ + received_rank_num = rank_size; + } else { + for(int32_t i = 0; i < EP; i++) { + int32_t in_num_tmp = 0; + for(int32_t j = 0; j < local_expert_nums; j++) { + in_num_tmp += global_tokens_per_expert_matrix[i * expert_nums + rank * local_expert_nums + j];// + } + if(cal_idx * max_move_num < in_num_tmp) { + received_rank_num += 1; + } + } + } + + received_loop_number += received_rank_num; + 
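+            // received_loop_number accumulates, across all chunks so far, how many peer
+            // ranks still had scale data in flight; the CheckBuffFlag(..., FLAG_ADD_IDX,
+            // FLAG_VALUE * received_loop_number) call below therefore blocks until every
+            // producer of chunks 0..cal_idx has bumped this rank's FLAG_ADD_IDX counter once.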
+            if (real_core_idx < rank_size) {
+                if (real_core_idx == rank) {
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_TWO_IDX, FLAG_VALUE);
+                }
+                if (is_moe_averaged || cal_idx * max_move_num < out_num) {
+                    int32_t data_len = ((cal_idx + 1) * max_move_num >= out_num) ? (out_num - cal_idx * max_move_num) : max_move_num;
+                    __gm__ float32_t *src_address;
+                    __gm__ float32_t *dst_address = (__gm__ float32_t *)buff[real_core_idx] + flag_idx * scale_pingpang_size + max_move_num * rank;
+                    // same source offset in both layouts; last_ep_local already encodes the mode
+                    src_address = gm_quant_scale + 1LL * last_ep_local + sum_out;
+
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_TWO_IDX, FLAG_VALUE * (cal_idx + 1));
+
+                    SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                    MoveResultFromSrcToDstv2(src_address, dst_address, data_len, 0);
+                    WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+                    sum_out += data_len;
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_ADD_IDX, FLAG_VALUE);
+                }
+                CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                    FLAG_ADD_IDX, FLAG_VALUE * received_loop_number);
+
+                if (is_moe_averaged || cal_idx * max_move_num < in_num) {
+                    int32_t data_len = ((cal_idx + 1) * max_move_num >= in_num) ? (in_num - cal_idx * max_move_num) : max_move_num;
+                    __gm__ float32_t *src_address;
+                    __gm__ float32_t *dst_address;
+                    src_address = (__gm__ float32_t *)buff[rank] + flag_idx * scale_pingpang_size + max_move_num * real_core_idx;
+
+                    while (receive_expert_id < local_expert_nums && data_len > 0) {
+                        int32_t move_data_len;
+                        if (data_len >= receive_expert_token_nums) {
+                            move_data_len = receive_expert_token_nums;
+                        } else {
+                            move_data_len = data_len;
+                        }
+
+                        if (is_moe_averaged) {
+                            dst_address = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_dequant_param) +
+                                1LL * (m / local_expert_nums) * receive_expert_id + 1LL * (m / expert_nums) * real_core_idx + sum_in;
+                        } else {
+                            int32_t before_expert_sum = 0;
+                            for (int32_t i = 0; i < receive_expert_id; i++) {
+                                for (int32_t j = 0; j < EP; j++) {
+                                    before_expert_sum += global_tokens_per_expert_matrix[j * expert_nums + i + rank * local_expert_nums];
+                                }
+                            }
+                            int32_t before_rank_in_expert_sum = 0;
+                            for (int32_t i = 0; i < real_core_idx; i++) {
+                                before_rank_in_expert_sum += global_tokens_per_expert_matrix[i * expert_nums + rank * local_expert_nums + receive_expert_id];
+                            }
+                            dst_address = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_dequant_param) +
+                                1LL * before_expert_sum + 1LL * before_rank_in_expert_sum + sum_in;
+                        }
+
+                        SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                        SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                        MoveResultFromSrcToDstv2(src_address, dst_address, move_data_len, 0);
+                        WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                        WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+
+                        if (data_len >= receive_expert_token_nums) {
+                            receive_expert_id += 1;
+                            data_len -= receive_expert_token_nums;
+                            if (receive_expert_id >= local_expert_nums) {
+                                break;
+                            }
+                            if (is_moe_averaged) {
+                                receive_expert_token_nums = m / EP / local_expert_nums;
+                            } else {
+                                receive_expert_token_nums = global_tokens_per_expert_matrix[real_core_idx * expert_nums + receive_expert_id + rank * local_expert_nums];
+                            }
+                            sum_in = 0;
+                        } else {
+                            sum_in += data_len;
+                            receive_expert_token_nums -= data_len;
+                            data_len = 0;
+                        }
+                        src_address += move_data_len;
+                    }
+                }
+            }
+        }
+
+        max_ub_ping_pong_size = max_ub_ping_pong_size * 2; // restore the full UB ping-pong size halved earlier in this routine
+        if (real_core_idx < rank_size) {
+            if (real_core_idx == rank) {
+                SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + FLAG_TWO_IDX, 0);
+            }
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_TWO_IDX, 0);
+        }
+        PipeBarrier();
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void AllGatherGlobalTokensMatrix() {
+        int32_t usable_buff = 100 * 1024 * 1024 / 4;
+        // first copy num_local_tokens_per_expert into this rank's shared (IPC) buffer
+        PipeBarrier();
+        SetAndWaitAivSync(0);
+        if (real_core_idx < rank_size) {
+            int32_t data_len = expert_nums;
+            __gm__ int32_t *src_address = num_local_tokens_per_expert;
+            __gm__ int32_t *dst_address = (__gm__ int32_t *)buff[rank];
+            if (real_core_idx == rank) {
+                SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                MoveResultFromSrcToDst(src_address, dst_address, 1, data_len, 0);
+                WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+                SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                    FLAG_TWO_IDX, FLAG_VALUE);
+            }
+
+            // then gather every peer's row of the token matrix
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                FLAG_TWO_IDX, FLAG_VALUE);
+
+            src_address = (__gm__ int32_t *)buff[real_core_idx];
+            dst_address = global_tokens_per_expert_matrix + real_core_idx * data_len;
+            SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+            SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+            MoveResultFromSrcToDst(src_address, dst_address, 1, data_len, 0);
+            WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+            WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+        }
+    }
+
+    template <typename CommType>
+    inline __attribute__((always_inline)) __aicore__ void MoveResultFromSrcToDst(__gm__ CommType *gm_src, __gm__ CommType *gm_dst,
+        int32_t m_actual, int32_t n_actual, bool is_align)
+    {
+        // two fixed UB ping-pong buffers (byte offsets 32 and 97440)
+        __ubuf__ CommType *output_UB_T[2] = {(__ubuf__ CommType *)(32), (__ubuf__ CommType *)(97440)};
+        int32_t max_move_m = (max_ub_ping_pong_size / Block32B::AlignUp(n_actual));
+        if (max_move_m > 4095) {
+            max_move_m = 4095; // burst-count field limit per DMA
+        }
+        int32_t ping_pong_move_count = DivCeil(m_actual, max_move_m);
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_m = max_move_m;
+            int32_t actual_move_n = n_actual;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_m = m_actual - move_idx * max_move_m;
+            }
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyGmToUbuf(ub_buff_st, gm_src, actual_move_m, actual_move_n * sizeof(CommType) / 32, (k_align - actual_move_n) * sizeof(CommType) / 32, 0);
+            } else {
+                CopyGmToUbufAlignB16(ub_buff_st, gm_src, actual_move_m, actual_move_n * sizeof(CommType), (k_align - actual_move_n) * sizeof(CommType), 0);
+            }
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyUbufToGm(gm_dst, ub_buff_st, actual_move_m, actual_move_n * sizeof(CommType) / 32, 0, 0);
+            } else {
+                CopyUbufToGmAlignB16(gm_dst, ub_buff_st, actual_move_m, actual_move_n * sizeof(CommType), 0, 0);
+            }
+            gm_src += actual_move_m * k_align;
+            gm_dst += actual_move_m * actual_move_n;
+            SetFlag(event_id);
+        }
+    }
+
+    template <typename CommType>
+    inline __attribute__((always_inline)) __aicore__ void MoveResultFromSrcToDstv2(__gm__ CommType *gm_src, __gm__ CommType *gm_dst,
+        int32_t len, bool is_align)
+    {
+        __ubuf__ CommType *output_UB_T[2] = {(__ubuf__ CommType *)(32), (__ubuf__ CommType *)(97440)};
+        int32_t ping_pong_move_count = (len + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_size = max_ub_ping_pong_size;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_size = len - move_idx * max_ub_ping_pong_size;
+            }
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyGmToUbuf(ub_buff_st, gm_src, 1, actual_move_size * sizeof(CommType) / 32, 0, 0);
+            } else {
+                CopyGmToUbufAlignB16(ub_buff_st, gm_src, 1, actual_move_size * sizeof(CommType), 0, 0);
+            }
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyUbufToGm(gm_dst, ub_buff_st, 1, actual_move_size * sizeof(CommType) / 32, 0, 0);
+            } else {
+                CopyUbufToGmAlignB16(gm_dst, ub_buff_st, 1, actual_move_size * sizeof(CommType), 0, 0);
+            }
+            gm_src += max_ub_ping_pong_size;
+            gm_dst += max_ub_ping_pong_size;
+            SetFlag(event_id);
+        }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void EndFlagsAndBias()
+    {
+        ResetIpcFlags(4);
+        if (real_core_idx < rank_size) {
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_ZERO_IDX, 0);
+        }
+        PipeBarrier();
+        // if constexpr (HAVE_BIAS) {
+        //     add_bias_runner.Run();
+        // }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void Run() {
+        preprocessor.Run(local_expert_nums);
+        if (is_moe_averaged) {
+            max_m = m;
+        } else {
+            if (maxOutputSize == -1) {
+                max_m = 0;
+                for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) {
+                    int32_t sum_m_ep = 0;
+                    for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) {
+                        int32_t expert_id = local_expert_id + ep_idx * local_expert_nums;
+                        for (int32_t i = 0; i < EP; i++) {
+                            sum_m_ep += global_tokens_per_expert_matrix[i * expert_nums + expert_id];
+                        }
+                    }
+                    max_m = max(max_m, sum_m_ep);
+                }
+            } else {
+                max_m = maxOutputSize;
+            }
+        }
+        gm_a_pingpong_size = comm_k * max_m; // e.g. 8192
+        gm_a_pingpong_num = buffer_size * 1024 * 1024 / sizeof(T) / gm_a_pingpong_size;
+        if (gm_a_pingpong_num > 8) {
+            gm_a_pingpong_num = 8;
+        }
+
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            ScaleAllToAll();
+        }
+
+        withSerialMode = 1;
+        int64_t dst_before_expert_sum[16] = {0};           // destination offset of the current expert in the peer buffer
+        int32_t sum_num_local_tokens_per_expert[16] = {0}; // source offset of the current expert in gm_out
+        int32_t gmm_ep_idx = real_core_idx < rank_size ? real_core_idx : rank_size - 1;
+        if (!is_moe_averaged) {
+            int32_t hcumsum = 0;
+            for (int32_t j = 0; j <= gmm_ep_idx; j++) {
+                for (int32_t i = 0; i < local_expert_nums; i++) {
+                    if (j == gmm_ep_idx) {
+                        sum_num_local_tokens_per_expert[i] = hcumsum;
+                    }
+                    hcumsum += global_tokens_per_expert_matrix[rank * expert_nums + i + j * local_expert_nums];
+                }
+            }
+
+            int32_t cumsum = 0;
+            for (int32_t i = 0; i < local_expert_nums; i++) {
+                for (int32_t j = 0; j < rank_size; j++) {
+                    if (j == rank) {
+                        dst_before_expert_sum[i] = cumsum;
+                    }
+                    cumsum += global_tokens_per_expert_matrix[j * expert_nums + i + gmm_ep_idx * local_expert_nums];
+                }
+            }
+        } else {
+            for (int32_t i = 0; i < local_expert_nums; i++) {
+                sum_num_local_tokens_per_expert[i] = (m / expert_nums) * (gmm_ep_idx * local_expert_nums + i);
+                dst_before_expert_sum[i] = (m / expert_nums) * (EP * i + rank);
+            }
+            // dst_before_expert_sum = token_per_expert * rank_size * local_expert_id;
+            // dst_in_expert_sum = token_per_expert * rank;
+        }
+
+        for (int32_t comm_idx = 0; comm_idx < comm_count + gm_a_pingpong_num; comm_idx++) {
+            uint64_t flag_idx = comm_idx % gm_a_pingpong_num;
+
+            if (comm_idx > gm_a_pingpong_num - 1) {
+                WaitEvent(flag_idx);
+            }
+            SetAndWaitAivSync(flag_idx, gm_a_pingpong_num);
+            if (real_core_idx < rank_size && comm_idx < comm_count) {
+                if (real_core_idx == rank) {
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_ZERO_IDX, FLAG_VALUE);
+                }
+                int32_t k_len;
+                if (comm_idx == comm_count - 1) {
+                    k_len = k - comm_idx * comm_k;
+                } else {
+                    k_len = comm_k;
+                }
+
+                CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                    FLAG_ZERO_IDX, FLAG_VALUE * (comm_idx + 1));
+
+                int32_t m_len = 0;
+                for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) {
+                    int32_t expert_id = real_core_idx * local_expert_nums + local_expert_id;
+                    if (is_moe_averaged) {
+                        m_len = m / EP / local_expert_nums;
+                    } else {
+                        m_len = global_tokens_per_expert_matrix[rank * expert_nums + expert_id];
+                    }
+                    if (maxOutputSize > 0 && m_len > maxOutputSize - dst_before_expert_sum[local_expert_id]) {
+                        m_len = maxOutputSize - dst_before_expert_sum[local_expert_id];
+                    }
+                    if (m_len <= 0) {
+                        continue;
+                    }
+                    __gm__ T *src_address, *dst_address;
+                    src_address = gm_out + 1LL * k_align * sum_num_local_tokens_per_expert[local_expert_id] + comm_idx * comm_k;
+                    dst_address = buff[real_core_idx] + 1LL * flag_idx * gm_a_pingpong_size + 1LL * k_len * dst_before_expert_sum[local_expert_id];
+
+                    SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                    MoveResultFromSrcToDst(src_address, dst_address, m_len, k_len, 0);
+                    WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+                }
+                SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                    FLAG_ONE_IDX, FLAG_VALUE);
+                if (real_core_idx == rank) {
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_ONE_IDX, FLAG_VALUE * (comm_idx + 1) * rank_size);
+                }
+            }
+            SetAndWaitAivSync(flag_idx, gm_a_pingpong_num);
+            if (comm_idx < comm_count) {
+                SetAicSync(flag_idx);
+            }
+        }
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            serial_pertoken_dequant_runner.Run();
+        }
+        EndFlagsAndBias();
+    }
+
+public:
+    using CocCommBase::SetAicSync;
+    using CocCommBase::SetAndWaitAivSync;
+
+    using CocCommBase::SetBuffFlag;
+    using CocCommBase::SetBuffFlagByAdd;
+    using CocCommBase::CheckBuffFlag;
+    using CocCommBase::ResetIpcFlags;
+    using CocCommBase::CrossRankSyncV1;
+
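+    // Editor's note: the Run() loop above synchronizes ranks with monotonically
+    // increasing flag counters rather than flags that are reset each round. A
+    // minimal sketch of the pattern, assuming only that SetBuffFlagByAdd does an
+    // atomic add on a GM int32 and CheckBuffFlag spins until the expected value
+    // is reached (both names from this file); illustrative comment, not compiled:
+    //     // producer side, iteration i (0-based):
+    //     SetBuffFlagByAdd(ctrl_flags_UB, flag_addr, FLAG_VALUE);  // flag == (i + 1) * FLAG_VALUE
+    //     // consumer side, waiting for iteration i of all rank_size producers:
+    //     CheckBuffFlag(ctrl_flags_UB, flag_addr, FLAG_VALUE * (i + 1) * rank_size);
+    // Because the counter only grows during a launch, a consumer can never match
+    // a stale value from an earlier iteration; ResetIpcFlags(4) in
+    // EndFlagsAndBias() clears the flags once the kernel is done.
+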
using CocCommBase::CrossRankSyncV2; + + using CocCommBase::buff; + using CocCommBase::gm_out; + using CocCommBase::ctrl_flags_UB; + using CocCommBase::output_UB_T; + using CocCommBase::batch_size; + using CocCommBase::m; + using CocCommBase::k; + using CocCommBase::n; + using CocCommBase::m0; + using CocCommBase::k0; + using CocCommBase::n0; + using CocCommBase::m_loop; + using CocCommBase::n_loop; + using CocCommBase::k_loop; + using CocCommBase::core_loop; + using CocCommBase::real_core_idx; + using CocCommBase::core_num; + using CocCommBase::rank; + using CocCommBase::rank_size; + using CocCommBase::buffer_size; + using CocCommBase::tiling_key; + using CocCommBase::swizzl_direct; + using CocCommBase::swizzl_count; + using CocCommBase::trans_a; + using CocCommBase::trans_b; + using CocCommBase::is_int8; + using CocCommBase::p_value; + using CocCommBase::aiv_idx; + using CocCommBase::other_rank; + using CocCommBase::max_ub_single_dma_size; + using CocCommBase::max_ub_ping_pong_size; + using CocCommBase::dequant_granularity; + using CocCommBase::dequant_group_size; + using CocCommBase::quant_granularity; + using CocCommBase::quant_group_size; + using CocCommBase::workspace_info; + using CocCommBase::withSerialMode; + using CocCommBase::is_moe; + using CocCommBase::is_moe_averaged; + using CocCommBase::is_alltoallvc; + using CocCommBase::is_deterministic; + using CocCommBase::weight_nz; + + using CocCommBase::global_tokens_per_expert_matrix; + using CocCommBase::num_local_tokens_per_expert; + + + using CocCommBase::local_expert_nums; + using CocCommBase::TP; + using CocCommBase::EP; + using CocCommBase::maxOutputSize; + using CocCommBase::flag_offset; + int32_t max_m; + int32_t comm_k; + int32_t comm_count; + int32_t gm_a_pingpong_size; + int32_t expert_nums; + int32_t gm_a_pingpong_num; + int32_t m_align; + int32_t k_align; + int32_t n_align; + int32_t aligned_a; + int32_t aligned_b; + Preprocessor preprocessor; + + //AllGatherMatmulBiasAdder add_bias_runner; + FusedPerTokenDequantRunner fused_pertoken_dequant_runner; + SerialPerTokenDequantRunner serial_pertoken_dequant_runner; + __gm__ float32_t *gm_quant_scale; +}; + + + +template +inline __aicore__ void CocAllToAllVAllGatherHiddenAiv(COC_ARGS_FUN(T)){ + AllToAllvAllGatherHiddenSplit alltoall_allgather_without_bias; + AllToAllvAllGatherHiddenSplit alltoall_allgather_with_bias; + AllToAllvAllGatherHiddenSplit alltoall_allgather_int8_without_bias; + AllToAllvAllGatherHiddenSplit alltoall_allgather_int8_with_bias; + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t write_to_other_rank = cocTilingData->write2OtherRank; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + alltoall_allgather_without_bias.SetArgs(COC_ARGS_CALL()); + alltoall_allgather_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + alltoall_allgather_with_bias.SetArgs(COC_ARGS_CALL()); + alltoall_allgather_with_bias.Run(); + break; + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + 
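// Editor's note: of the six tiling_key bits, only bits 1 and 2 select a
+        // variant in this switch (0b010 -> with bias, 0b100 -> int8); the three
+        // high bits take all combinations in the case labels and are ignored
+        // here. A hedged, illustrative decoding of the label sets, not code
+        // from this patch:
+        //     bool with_bias = (tiling_key >> 1) & 1;
+        //     bool int8_path = (tiling_key >> 2) & 1;
+        //     // this branch handles int8_path && !with_bias.
+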
alltoall_allgather_int8_without_bias.SetArgs(COC_ARGS_CALL_INT8()); + alltoall_allgather_int8_without_bias.Run(); + break; + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + alltoall_allgather_int8_with_bias.SetArgs(COC_ARGS_CALL_INT8()); + alltoall_allgather_int8_with_bias.Run(); + break; + default : + break; + } + PipeBarrier(); +} + +#endif diff --git a/comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce b/comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce new file mode 100644 index 00000000..7bea5dc4 --- /dev/null +++ b/comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce @@ -0,0 +1,350 @@ +#ifdef __DAV_C220_VEC__ +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class AllToAllVReduceScatterHiddenSplit : public CocCommBase { +public: + __aicore__ explicit AllToAllVReduceScatterHiddenSplit(){}; + + inline __attribute__((always_inline)) __aicore__ void SetArgs(COC_ARGS_FUN(T)) + { + CocCommBase::SetArgs(COC_ARGS_CALL()); + preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL()); + this->gm_out = gm_out; + expert_nums = local_expert_nums * EP; + if (!is_moe_averaged) { + this->global_tokens_per_expert_matrix = global_tokens_per_expert_matrix; + } + m_align = Block512B::AlignUp(m); + k_align = Block512B::AlignUp(k); + n_align = Block512B::AlignUp(n); + if(dequant_granularity == QuantGranularity::PER_TOKEN) { + fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]), workspace_info, reinterpret_cast<__gm__ float32_t*>(gm_quant_scale), + batch_size, m, k, n, m0,k0, n0, m_loop, n_loop, core_loop, rank, swizzl_direct, + swizzl_count, p_value, EP, TP, local_expert_nums, is_moe_averaged, 1, maxOutputSize, buffer_size, global_tokens_per_expert_matrix); + } + } + + inline __attribute__((always_inline)) __aicore__ void EndFlagsAndBias() + { + ResetIpcFlags(2); + if (real_core_idx < rank_size) { + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_ZERO_IDX, 0); + } + PipeBarrier(); + } + + + + + template + inline __attribute__((always_inline)) __aicore__ void MoveResultFromSrcToDst(__gm__ CommType *gm_src, __gm__ CommType *gm_dst, + int32_t m_actual,int32_t n_actual) + { + __ubuf__ CommType *output_UB_T[2] = {(__ubuf__ CommType *)(32), (__ubuf__ CommType *)(97440)}; + int32_t max_move_m = (max_ub_ping_pong_size / Block32B::AlignUp(n_actual)); + if (max_move_m > 4095) + max_move_m = 4095; + int32_t ping_pong_move_count = DivCeil(m_actual, max_move_m); + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { + int32_t actual_move_m = max_move_m; // 4 + int32_t actual_move_n = n_actual;//3584 + if(move_idx == ping_pong_move_count - 1) { + actual_move_m = m_actual - move_idx * max_move_m; + } + + auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; + auto ub_buff_st = (move_idx & 1) ? 
output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            CopyGmToUbufAlignB16(ub_buff_st, gm_src, actual_move_m, actual_move_n * sizeof(CommType), 0, 0);
+
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            CopyUbufToGmAlignB16(gm_dst, ub_buff_st, actual_move_m, actual_move_n * sizeof(CommType), 0, (n - actual_move_n) * sizeof(CommType));
+            gm_src += actual_move_m * actual_move_n;
+            gm_dst += actual_move_m * n;
+            SetFlag(event_id);
+        }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void Run()
+    {
+        preprocessor.Run(local_expert_nums);
+        if (is_moe_averaged) {
+            max_m = m;
+        } else {
+            if (maxOutputSize == -1) {
+                max_m = 0;
+                for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) {
+                    int32_t sum_m_ep = 0;
+                    for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) {
+                        int32_t expert_id = local_expert_id + ep_idx * local_expert_nums;
+                        for (int32_t i = 0; i < EP; i++) {
+                            sum_m_ep += global_tokens_per_expert_matrix[i * expert_nums + expert_id];
+                        }
+                    }
+                    max_m = max(max_m, sum_m_ep);
+                }
+            } else {
+                max_m = maxOutputSize;
+            }
+        }
+
+        comm_n = p_value * n0;
+        gm_a_pingpong_size = max_m * comm_n;
+        gm_a_pingpong_num = buffer_size * 1024 * 1024 / 2 / gm_a_pingpong_size;
+        if (gm_a_pingpong_num > 8) {
+            gm_a_pingpong_num = 8;
+        }
+
+        cal_count = DivCeil(n, comm_n);
+        int32_t max_flag_id = cal_count < gm_a_pingpong_num ? cal_count : gm_a_pingpong_num;
+        for (int64_t cal_idx = 0; cal_idx < max_flag_id; ++cal_idx) {
+            SetAicSync(cal_idx);
+        }
+
+        int64_t dst_before_expert_sum[16] = {0};           // copy start of the current expert on the source (IPC buffer) side
+        int32_t sum_num_local_tokens_per_expert[16] = {0}; // copy offset of the current expert on the destination (gm_out) side
+        int32_t gmm_ep_idx = real_core_idx < rank_size ? real_core_idx : rank_size - 1;
+        if (!is_moe_averaged) {
+            int32_t hcumsum = 0;
+            for (int32_t j = 0; j <= gmm_ep_idx; j++) {
+                for (int32_t i = 0; i < local_expert_nums; i++) {
+                    if (j == gmm_ep_idx) {
+                        sum_num_local_tokens_per_expert[i] = hcumsum;
+                    }
+                    hcumsum += global_tokens_per_expert_matrix[rank * expert_nums + i + j * local_expert_nums];
+                }
+            }
+
+            int32_t cumsum = 0;
+            for (int32_t i = 0; i < local_expert_nums; i++) {
+                for (int32_t j = 0; j < rank_size; j++) {
+                    if (j == rank) {
+                        dst_before_expert_sum[i] = cumsum;
+                    }
+                    cumsum += global_tokens_per_expert_matrix[j * expert_nums + i + gmm_ep_idx * local_expert_nums];
+                }
+            }
+        } else {
+            for (int32_t i = 0; i < local_expert_nums; i++) {
+                sum_num_local_tokens_per_expert[i] = (m / expert_nums) * (gmm_ep_idx * local_expert_nums + i);
+                dst_before_expert_sum[i] = (m / expert_nums) * (EP * i + rank);
+            }
+        }
+
+        for (int32_t cal_idx = 0; cal_idx < cal_count; ++cal_idx) {
+            uint64_t flag_idx = cal_idx % gm_a_pingpong_num;
+            WaitEvent(flag_idx);
+
+            if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+                SetAndWaitAivSync(flag_idx, gm_a_pingpong_num);
+                fused_pertoken_dequant_runner.DequantPerTokenMatmulAllToAllHidden(cal_idx);
+            }
+            SetAndWaitAivSync(flag_idx, gm_a_pingpong_num);
+            int64_t n_len, m_len;
+            if (cal_idx == cal_count - 1) {
+                n_len = n - cal_idx * comm_n;
+            } else {
+                n_len = comm_n;
+            }
+            int32_t n_loop_cal = DivCeil(n_len, n0);
+
+            if (real_core_idx < rank_size) {
+                if (real_core_idx == rank) {
+                    SetBuffFlagByAdd(
+                        ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + FLAG_ZERO_IDX, FLAG_VALUE);
+                }
+                CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_ZERO_IDX, FLAG_VALUE * (cal_idx + 1));
+                for (int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) {
+                    int32_t expert_id = real_core_idx * local_expert_nums + local_expert_id;
+                    if (is_moe_averaged) {
+                        m_len = m / EP / local_expert_nums;
+                    } else {
+                        m_len = global_tokens_per_expert_matrix[rank * expert_nums + expert_id];
+                    }
+
+                    if (maxOutputSize > 0 && m_len > maxOutputSize - dst_before_expert_sum[local_expert_id]) {
+                        m_len = maxOutputSize - dst_before_expert_sum[local_expert_id];
+                    }
+
+                    if (m_len <= 0) {
+                        continue;
+                    }
+                    int64_t buff_offset = flag_idx * gm_a_pingpong_size + 1LL * dst_before_expert_sum[local_expert_id] * n_len;
+                    int64_t gm_offset = 1LL * sum_num_local_tokens_per_expert[local_expert_id] * n + 1LL * cal_idx * comm_n;
+                    __gm__ T *src_address, *dst_address;
+                    src_address = buff[real_core_idx] + buff_offset;
+                    dst_address = gm_out + gm_offset;
+                    SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                    MoveResultFromSrcToDst(src_address, dst_address, m_len, n_len);
+                    WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+                }
+                SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_ONE_IDX, FLAG_VALUE);
+                if (real_core_idx == rank) {
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + FLAG_ONE_IDX,
+                        FLAG_VALUE * (cal_idx + 1) * EP);
+                }
+            }
+            SetAndWaitAivSync(flag_idx, gm_a_pingpong_num);
+            SetAicSync(flag_idx);
+        }
+        EndFlagsAndBias();
+    }
+
+    using CocCommBase::SetAicSync;
+    using CocCommBase::SetAndWaitAivSync;
+    using CocCommBase::SetBuffFlag;
+    using CocCommBase::SetBuffFlagByAdd;
+    using CocCommBase::CheckBuffFlag;
+    using CocCommBase::FillZero;
+    using CocCommBase::FirstStepInPeerMem;
+    using CocCommBase::ResetIpcFlags;
+    using CocCommBase::CrossRankSyncV1;
+    using CocCommBase::CrossRankSyncV2;
+    using CocCommBase::buff;
+    using CocCommBase::gm_out;
+    using CocCommBase::ctrl_flags_UB;
+    using CocCommBase::output_UB_T;
+    using CocCommBase::batch_size;
+    using CocCommBase::m;
+    using CocCommBase::k;
+    using CocCommBase::n;
+    using CocCommBase::m0;
+    using CocCommBase::k0;
+    using CocCommBase::n0;
+    using CocCommBase::m_loop;
+    using CocCommBase::n_loop;
+    using CocCommBase::k_loop;
+    using CocCommBase::core_loop;
+    using CocCommBase::real_core_idx;
+    using CocCommBase::rank;
+    using CocCommBase::rank_size;
+    using CocCommBase::buffer_size;
+    using CocCommBase::tiling_key;
+    using CocCommBase::swizzl_count;
+    using CocCommBase::swizzl_direct;
+    using CocCommBase::trans_a;
+    using CocCommBase::trans_b;
+    using CocCommBase::is_int8;
+    using CocCommBase::p_value;
+    using CocCommBase::aiv_idx;
+    using CocCommBase::other_rank;
+    using CocCommBase::max_ub_single_dma_size;
+    using CocCommBase::max_ub_ping_pong_size;
+    using CocCommBase::loop_num_per_comm;
+    using CocCommBase::dequant_granularity;
+    using CocCommBase::dequant_group_size;
+    using CocCommBase::quant_granularity;
+    using CocCommBase::quant_group_size;
+    using CocCommBase::workspace_info;
+    using CocCommBase::maxOutputSize;
+    using CocCommBase::is_moe;
+    using CocCommBase::is_moe_averaged;
+    using CocCommBase::is_alltoallvc;
+    using CocCommBase::is_deterministic;
+    using CocCommBase::flag_offset;
+    using CocCommBase::weight_nz;
+
+    int32_t gm_a_pingpong_size;
+    int32_t gm_a_pingpong_num;
+    int32_t cal_count;
+    int32_t comm_n;
+    int32_t max_m;
+
+    int32_t m_align;
+    int32_t k_align;
+    int32_t n_align;
+
+    using CocCommBase::global_tokens_per_expert_matrix;
+    using CocCommBase::expert_nums;
+    using CocCommBase::local_expert_nums;
+    using CocCommBase::TP;
+    using CocCommBase::EP;
+    Preprocessor preprocessor;
+    FusedPerTokenDequantRunner 
fused_pertoken_dequant_runner; +}; + + +template +inline __attribute__((always_inline)) __aicore__ void RunAllToAllVReduceScatterHiddenAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) +{ + // 16 align + AllToAllVReduceScatterHiddenSplit all_to_allv_reduce_scatter_align_16_without_bias; + //AllToAllVReduceScatterHiddenSplit all_to_allv_reduce_scatter_align_16_with_bias; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + all_to_allv_reduce_scatter_align_16_without_bias.SetArgs(COC_ARGS_CALL()); + all_to_allv_reduce_scatter_align_16_without_bias.Run(); + break; + // case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + // case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + // case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + // case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + // all_to_allv_reduce_scatter_align_16_with_bias.SetArgs(COC_ARGS_CALL()); + // all_to_allv_reduce_scatter_align_16_with_bias.Run(); + // break; + default: + break; + } +} + +template +inline __attribute__((always_inline)) __aicore__ void RunAllToAllVReduceScatterHiddenUnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) +{ + // 16 unalign + AllToAllVReduceScatterHiddenSplit all_to_allv_reduce_scatter_unalign_16_without_bias; + AllToAllVReduceScatterHiddenSplit all_to_allv_reduce_scatter_unalign_16_with_bias; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + all_to_allv_reduce_scatter_unalign_16_without_bias.SetArgs(COC_ARGS_CALL()); + all_to_allv_reduce_scatter_unalign_16_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + all_to_allv_reduce_scatter_unalign_16_with_bias.SetArgs(COC_ARGS_CALL()); + all_to_allv_reduce_scatter_unalign_16_with_bias.Run(); + break; + default: + break; + } +} + +template +inline __attribute__((always_inline)) __aicore__ void CocMatmulAllToAllVReduceScatterHiddenAiv(COC_ARGS_FUN(T)) +{ + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + int32_t n = cocTilingData->n; + int32_t tiling_key = cocTilingData->tilingKey; + if (n % BLOCK_SIZE_16 == 0) { + RunAllToAllVReduceScatterHiddenAlign16(tiling_key, COC_ARGS_CALL()); + } else { + RunAllToAllVReduceScatterHiddenUnAlign16(tiling_key, COC_ARGS_CALL()); + } + + PipeBarrier(); +} + +#endif diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather.cce b/comm/lcal/src/kernels/coc_alltoallv_allgather.cce new file mode 100644 index 00000000..0b70df2a --- /dev/null +++ b/comm/lcal/src/kernels/coc_alltoallv_allgather.cce @@ -0,0 +1,642 @@ +#ifdef __DAV_C220_VEC__ +#include + +#include "coc_internal.cce" +#include "coc_comm_base.cce" +#include "kernel_operator.h" +using namespace AscendC; + +template +class 
AllToAllvAllGather: public CocCommBase{
+public:
+    __aicore__ explicit AllToAllvAllGather(){};
+    inline __attribute__((always_inline)) __aicore__ void SetArgs(COC_ARGS_FUN(T)) {
+        CocCommBase::SetArgs(COC_ARGS_CALL());
+        preprocessor.SetArgs(PP_MATMUL_AIV_PADDING_ARGS_CALL());
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL());
+        }
+        m_align = Block512B::AlignUp(m);
+        k_align = Block512B::AlignUp(k);
+        n_align = Block512B::AlignUp(n);
+
+        AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b);
+        this->gm_out = aligned_a ? reinterpret_cast<__gm__ T *>(workspace_info.gm_a_align) : gm_a;
+        this->gm_quant_scale = reinterpret_cast<__gm__ float32_t *>(gm_quant_scale);
+        this->expert_nums = local_expert_nums * EP;
+        is_moe_averaged = 0;
+        if (global_tokens_per_expert_matrix == nullptr) {
+            is_moe_averaged = 1;
+        }
+        this->global_tokens_per_expert_matrix = reinterpret_cast<__gm__ int32_t *>(global_tokens_per_expert_matrix);
+        gm_a_pingpong_size = m0 * k_align * p_value * rank_size;
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            int32_t output_num = m;
+            if (!is_moe_averaged) {
+                output_num = 0;
+                for (int32_t i = 0; i < EP; i++) {
+                    for (int32_t j = 0; j < local_expert_nums; j++) {
+                        output_num += global_tokens_per_expert_matrix[i * expert_nums + j + rank * local_expert_nums];
+                    }
+                }
+                if (maxOutputSize > 0 && output_num >= maxOutputSize) {
+                    output_num = maxOutputSize;
+                }
+            }
+            serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ MatType *>(gm_out), reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_dequant_param), output_num, n, m0, n0);
+        }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void ScaleAllToAll() {
+        int32_t usable_buff = 200 * 1024 * 1024 / 4 / 2;
+        int32_t max_move_num = usable_buff / rank_size;
+        int32_t scale_pingpang_size = usable_buff;
+
+        int32_t cal_count = 0;
+        if (is_moe_averaged) {
+            cal_count = DivCeil(m / EP, max_move_num);
+        } else {
+            for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) {
+                int32_t in_num = 0;
+                int32_t out_num = 0;
+                for (int32_t j = 0; j < local_expert_nums; j++) {
+                    out_num += global_tokens_per_expert_matrix[rank * expert_nums + j + ep_idx * local_expert_nums];
+                }
+                for (int32_t j = 0; j < local_expert_nums; j++) {
+                    in_num += global_tokens_per_expert_matrix[ep_idx * expert_nums + j + rank * local_expert_nums];
+                }
+                cal_count = max(cal_count, max(in_num, out_num));
+            }
+            cal_count = DivCeil(cal_count, max_move_num);
+        }
+
+        PipeBarrier();
+
+        int64_t sum_out = 0, sum_in = 0;
+        int32_t received_loop_number = 0;
+        int32_t ep_idx = real_core_idx;
+
+        int32_t out_num, in_num;
+        if (is_moe_averaged) {
+            out_num = m / EP;
+            in_num = m / EP;
+        } else if (real_core_idx < rank_size) {
+            out_num = 0;
+            in_num = 0;
+            for (int32_t j = 0; j < local_expert_nums; j++) {
+                out_num += global_tokens_per_expert_matrix[rank * expert_nums + j + real_core_idx * local_expert_nums];
+            }
+            for (int32_t j = 0; j < local_expert_nums; j++) {
+                in_num += global_tokens_per_expert_matrix[real_core_idx * expert_nums + rank * local_expert_nums + j];
+            }
+        }
+
+        max_ub_ping_pong_size = max_ub_ping_pong_size / 2; // halved for the scale pass, restored below
+        int32_t receive_expert_id = 0;
+        int32_t receive_expert_token_nums;
+        int32_t last_ep_local = 0;
+        if (is_moe_averaged) {
+            receive_expert_token_nums = m / EP / local_expert_nums;
+            last_ep_local = (m / EP) * real_core_idx;
+        } else if (real_core_idx < rank_size) {
+            receive_expert_token_nums = global_tokens_per_expert_matrix[real_core_idx * expert_nums + rank * local_expert_nums];
+            for (int32_t i = 0; i < real_core_idx * local_expert_nums; i++) {
+                last_ep_local += global_tokens_per_expert_matrix[rank * expert_nums + i];
+            }
+        }
+
+        for (int32_t cal_idx = 0; cal_idx < cal_count; cal_idx++) {
+            int32_t flag_idx = cal_idx % MAX_BLOCK_COUNT;
+
+            SetAndWaitAivSync(flag_idx);
+            int32_t received_rank_num = 0;
+            if (is_moe_averaged) {
+                received_rank_num = rank_size;
+            } else {
+                for (int32_t i = 0; i < EP; i++) {
+                    int32_t in_num_tmp = 0;
+                    for (int32_t j = 0; j < local_expert_nums; j++) {
+                        in_num_tmp += global_tokens_per_expert_matrix[i * expert_nums + rank * local_expert_nums + j];
+                    }
+                    if (cal_idx * max_move_num < in_num_tmp) {
+                        received_rank_num += 1;
+                    }
+                }
+            }
+
+            received_loop_number += received_rank_num;
+
+            if (real_core_idx < rank_size) {
+                if (real_core_idx == rank) {
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_TWO_IDX, FLAG_VALUE);
+                }
+                if (is_moe_averaged || cal_idx * max_move_num < out_num) {
+                    int32_t data_len = ((cal_idx + 1) * max_move_num >= out_num) ? (out_num - cal_idx * max_move_num) : max_move_num;
+                    __gm__ float32_t *src_address;
+                    __gm__ float32_t *dst_address = (__gm__ float32_t *)buff[real_core_idx] + flag_idx * scale_pingpang_size + max_move_num * rank;
+                    // same source offset in both layouts; last_ep_local already encodes the mode
+                    src_address = gm_quant_scale + 1LL * last_ep_local + sum_out;
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_TWO_IDX, FLAG_VALUE * (cal_idx + 1));
+
+                    SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                    MoveResultFromSrcToDst(src_address, dst_address, data_len, 0);
+                    WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                    WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+
+                    sum_out += data_len;
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_ADD_IDX, FLAG_VALUE);
+                }
+                CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                    FLAG_ADD_IDX, FLAG_VALUE * received_loop_number);
+
+                if (is_moe_averaged || cal_idx * max_move_num < in_num) {
+                    int32_t data_len = ((cal_idx + 1) * max_move_num >= in_num) ? (in_num - cal_idx * max_move_num) : max_move_num;
+                    __gm__ float32_t *src_address;
+                    __gm__ float32_t *dst_address;
+                    src_address = (__gm__ float32_t *)buff[rank] + flag_idx * scale_pingpang_size + max_move_num * real_core_idx;
+
+                    while (receive_expert_id < local_expert_nums && data_len > 0) {
+                        int32_t move_data_len;
+                        if (data_len >= receive_expert_token_nums) {
+                            move_data_len = receive_expert_token_nums;
+                        } else {
+                            move_data_len = data_len;
+                        }
+
+                        if (is_moe_averaged) {
+                            dst_address = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_dequant_param) +
+                                1LL * (m / local_expert_nums) * receive_expert_id + 1LL * (m / expert_nums) * real_core_idx + sum_in;
+                        } else {
+                            int32_t before_expert_sum = 0;
+                            for (int32_t i = 0; i < receive_expert_id; i++) {
+                                for (int32_t j = 0; j < EP; j++) {
+                                    before_expert_sum += global_tokens_per_expert_matrix[j * expert_nums + i + rank * local_expert_nums];
+                                }
+                            }
+                            int32_t before_rank_in_expert_sum = 0;
+                            for (int32_t i = 0; i < real_core_idx; i++) {
+                                before_rank_in_expert_sum += global_tokens_per_expert_matrix[i * expert_nums + rank * local_expert_nums + receive_expert_id];
+                            }
+                            dst_address = reinterpret_cast<__gm__ float32_t *>(workspace_info.gm_dequant_param) +
+                                1LL * before_expert_sum + 1LL * before_rank_in_expert_sum + sum_in;
+                        }
+
+                        SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                        SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                        MoveResultFromSrcToDst(src_address, dst_address, move_data_len, 0);
+                        WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                        WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+
+                        if (data_len >= receive_expert_token_nums) {
+                            receive_expert_id += 1;
+                            data_len -= receive_expert_token_nums;
+                            if (receive_expert_id >= local_expert_nums) {
+                                break;
+                            }
+                            if (is_moe_averaged) {
+                                receive_expert_token_nums = m / EP / local_expert_nums;
+                            } else {
+                                receive_expert_token_nums = global_tokens_per_expert_matrix[real_core_idx * expert_nums + receive_expert_id + rank * local_expert_nums];
+                            }
+                            sum_in = 0;
+                        } else {
+                            sum_in += data_len;
+                            receive_expert_token_nums -= data_len;
+                            data_len = 0;
+                        }
+                        src_address += move_data_len;
+                    }
+                }
+            }
+        }
+
+        max_ub_ping_pong_size = max_ub_ping_pong_size * 2; // restore the full UB ping-pong size
+
+        if (real_core_idx < rank_size) {
+            if (real_core_idx == rank) {
+                SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + FLAG_TWO_IDX, 0);
+            }
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_TWO_IDX, 0);
+        }
+        PipeBarrier();
+    }
+
+    template <typename CommType>
+    inline __attribute__((always_inline)) __aicore__ void MoveResultFromSrcToDst(__gm__ CommType *gm_src, __gm__ CommType *gm_dst,
+        int32_t len, bool is_align = true)
+    {
+        __ubuf__ CommType *output_UB_T[2] = {(__ubuf__ CommType *)(32), (__ubuf__ CommType *)(97440)};
+        int32_t ping_pong_move_count = (len + max_ub_ping_pong_size - 1) / max_ub_ping_pong_size;
+        for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) {
+            int32_t actual_move_size = max_ub_ping_pong_size;
+            if (move_idx == ping_pong_move_count - 1) {
+                actual_move_size = len - move_idx * max_ub_ping_pong_size;
+            }
+            auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1;
+            auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1];
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyGmToUbuf(ub_buff_st, gm_src, 1, actual_move_size * sizeof(CommType) / 32, 0, 0);
+            } else {
+                CopyGmToUbufAlignB16(ub_buff_st, gm_src, 1, actual_move_size * sizeof(CommType), 0, 0);
+            }
+            SetFlag(event_id);
+            WaitFlag(event_id);
+            if (is_align) {
+                CopyUbufToGm(gm_dst, ub_buff_st, 1, actual_move_size * sizeof(CommType) / 32, 0, 0);
+            } else {
+                CopyUbufToGmAlignB16(gm_dst, ub_buff_st, 1, actual_move_size * sizeof(CommType), 0, 0);
+            }
+            gm_src += max_ub_ping_pong_size;
+            gm_dst += max_ub_ping_pong_size;
+            SetFlag(event_id);
+        }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void EndFlagsAndBias()
+    {
+        ResetIpcFlags(4);
+        if (real_core_idx < rank_size) {
+            CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset + FLAG_ZERO_IDX, 0);
+        }
+        PipeBarrier();
+        if constexpr (HAVE_BIAS) {
+            add_bias_runner.Run();
+        }
+    }
+
+    inline __attribute__((always_inline)) __aicore__ void Run() {
+        preprocessor.Run(local_expert_nums);
+        int32_t comm_m = p_value * m0;
+        int32_t comm_count;
+        if (is_moe_averaged) {
+            comm_count = DivCeil(m / EP, comm_m);
+        } else {
+            int32_t max_comm_count = 0;
+            int32_t max_input_per_ep = 0;
+            int32_t max_output_per_ep = 0;
+            for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) {
+                int32_t tmp_sum = 0;
+                for (int32_t i = 0; i < local_expert_nums; i++) {
+                    tmp_sum += global_tokens_per_expert_matrix[rank * expert_nums + ep_idx * local_expert_nums + i];
+                }
+                max_output_per_ep = max(max_output_per_ep, tmp_sum);
+                tmp_sum = 0;
+                for (int32_t i = 0; i < local_expert_nums; i++) {
+                    tmp_sum += global_tokens_per_expert_matrix[ep_idx * expert_nums + rank * local_expert_nums + i];
+                }
+                max_input_per_ep = max(max_input_per_ep, tmp_sum);
+                max_comm_count = max(max_comm_count, max(max_output_per_ep, max_input_per_ep));
+            }
+            comm_count = DivCeil(max_comm_count, comm_m);
+        }
+
+        int32_t out_num = 0;                 // tokens sent to rank core_idx
+        int32_t before_rank_offset_src = 0;  // source offset of the tokens sent to rank core_idx
+        int32_t cur_local_expert_id = 0;     // local expert id currently being sent
+        int32_t cur_expert_len = 0;          // token count of the local expert currently being sent
+        int32_t expert_remain_data_len;
+        if (real_core_idx < rank_size) {
+            if (is_moe_averaged) {
+                before_rank_offset_src = (m / rank_size) * real_core_idx;
+                out_num = (m / rank_size);
+                cur_expert_len = m / rank_size / local_expert_nums;
+            } else {
+                for (int32_t i = 0; i < real_core_idx; i++) {
+                    for (int32_t j = 0; j < local_expert_nums; j++) {
+                        before_rank_offset_src += global_tokens_per_expert_matrix[rank * expert_nums + j + i * local_expert_nums];
+                    }
+                }
+                for (int32_t i = 0; i < local_expert_nums; i++) {
+                    out_num += global_tokens_per_expert_matrix[rank * expert_nums + i + real_core_idx * local_expert_nums];
+                }
+                cur_expert_len = global_tokens_per_expert_matrix[rank * expert_nums + real_core_idx * local_expert_nums];
+            }
+        }
+        expert_remain_data_len = cur_expert_len;
+
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            ScaleAllToAll();
+        }
+
+        int32_t cur_expert = real_core_idx * local_expert_nums;
+        int32_t received_loop_number = 0;
+        int32_t sum_out_this_core = 0; // tokens already sent to rank core_idx
+        int32_t sum_in_expert = 0;     // tokens of the current expert already sent
+
+        for (int32_t comm_idx = 0; comm_idx < comm_count + MAX_BLOCK_COUNT; comm_idx++) {
+            uint64_t flag_idx = comm_idx % MAX_BLOCK_COUNT;
+            int32_t received_rank_num = 0;
+            if (is_moe_averaged) {
+                received_rank_num = rank_size;
+            } else {
+                for (int32_t i = 0; i < EP; i++) {
+                    int32_t in_loop_per_ep = 0;
+                    for (int32_t j = 0; j < local_expert_nums; j++) {
+                        in_loop_per_ep += global_tokens_per_expert_matrix[i * expert_nums + j + rank * local_expert_nums];
+                    }
+                    if (comm_idx * comm_m < in_loop_per_ep) {
+                        received_rank_num += 1;
+                    }
+                }
+            }
+            received_loop_number += received_rank_num;
+
+            if (comm_idx > 1) {
+                WaitEvent(flag_idx);
+            }
+            SetAndWaitAivSync(flag_idx);
+
+            if (real_core_idx < rank_size && comm_idx < comm_count) {
+                if (real_core_idx == rank) {
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_ZERO_IDX, FLAG_VALUE);
+                }
+                if (is_moe_averaged || comm_idx * comm_m < out_num) {
+                    int32_t data_len;
+                    if ((comm_idx + 1) * comm_m >= out_num) {
+                        data_len = out_num - comm_idx * comm_m;
+                    } else {
+                        data_len = comm_m;
+                    }
+
+                    __gm__ T *src_address, *dst_address;
+                    src_address = gm_out + 1LL * before_rank_offset_src * k_align + 1LL * comm_idx * comm_m * k_align;
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_ZERO_IDX, FLAG_VALUE * (comm_idx + 1));
+
+                    // the data_len tokens may straddle experts, so loop over them
+                    int32_t remain_data_len = data_len;
+                    while (cur_local_expert_id < local_expert_nums && remain_data_len > 0) {
+                        int32_t move_data_len;
+                        if (remain_data_len >= cur_expert_len - sum_in_expert) {
+                            move_data_len = cur_expert_len - sum_in_expert;
+                        } else {
+                            move_data_len = remain_data_len;
+                        }
+
+                        if (move_data_len > 0) {
+                            move_data_len = 1LL * move_data_len * k_align;
+                            // key step: compute this transfer's address inside the peer rank's shared memory
+                            int32_t before_expert_offset = 0; // offset of this transfer's expert in the peer's shared memory
+                            int32_t before_rank_offset = 0;   // offset of this rank within the current expert in the peer's shared memory
+                            for (int32_t i = 0; i < rank_size; i++) {
+                                // number of tokens rank i sends to rank core_idx
+                                int32_t out_this_rank = 0;
+                                for (int32_t j = 0; j < local_expert_nums; j++) {
+                                    int32_t expert_token_num;
+                                    if (is_moe_averaged) {
+                                        expert_token_num = m / expert_nums;
+                                    } else {
+                                        expert_token_num = global_tokens_per_expert_matrix[i * expert_nums + real_core_idx * local_expert_nums + j];
+                                    }
+                                    out_this_rank += expert_token_num;
+                                }
+
+                                int32_t data_len_this_rank;
+                                if ((comm_idx + 1) * comm_m >= out_this_rank) {
+                                    data_len_this_rank = out_this_rank - comm_idx * comm_m;
+                                } else {
+                                    data_len_this_rank = comm_m;
+                                }
+
+                                // prefix sum of expert token counts
+                                int32_t sum = 0;
+                                for (int32_t j = 0; j < cur_local_expert_id; j++) {
+                                    int32_t expert_id = real_core_idx * local_expert_nums + j;
+                                    // tokens rank i sends to expert_id in this round,
+                                    // out of the total tokens rank i sends to that expert:
+                                    int32_t expert_token_num;
+                                    if (is_moe_averaged) {
+                                        expert_token_num = m / expert_nums;
+                                    } else {
+                                        expert_token_num = global_tokens_per_expert_matrix[i * expert_nums + expert_id];
+                                    }
+                                    if (comm_idx * comm_m < sum + expert_token_num && comm_idx * comm_m + data_len_this_rank > sum) {
+                                        int32_t tmp_len = min(comm_idx * comm_m + data_len_this_rank, sum + expert_token_num) -
+                                            max(comm_idx * comm_m, sum);
+                                        before_expert_offset += tmp_len;
+                                    }
+                                    sum += expert_token_num;
+                                }
+                                if (i < rank) {
+                                    int32_t expert_id = real_core_idx * local_expert_nums + cur_local_expert_id;
+                                    int32_t expert_token_num;
+                                    if (is_moe_averaged) {
+                                        expert_token_num = m / expert_nums;
+                                    } else {
+                                        expert_token_num = global_tokens_per_expert_matrix[i * expert_nums + expert_id];
+                                    }
+                                    if ((comm_idx * comm_m < sum + expert_token_num) && (comm_idx * comm_m + data_len_this_rank > sum)) {
+                                        int32_t tmp_len = min(comm_idx * comm_m + data_len_this_rank, sum + expert_token_num) -
+                                            max(comm_idx * comm_m, sum);
+                                        before_rank_offset += tmp_len;
+                                    }
+                                }
+                            }
+
+                            dst_address = buff[real_core_idx] + 1LL * flag_idx * gm_a_pingpong_size +
+                                1LL * before_expert_offset * k_align + 1LL * before_rank_offset * k_align;
+                            SetFlag(EVENT_ID0); // MTE2 waits for MTE3
+                            SetFlag(EVENT_ID1); // MTE2 waits for MTE3
+                            MoveResultFromSrcToDst(src_address, dst_address, move_data_len);
+                            WaitFlag(EVENT_ID0); // MTE2 waits for MTE3
+                            WaitFlag(EVENT_ID1); // MTE2 waits for MTE3
+                        }
+
+                        if (remain_data_len >= cur_expert_len - sum_in_expert) {
+                            cur_local_expert_id++;
+                            remain_data_len -= (cur_expert_len - sum_in_expert);
+                            if (is_moe_averaged) {
+                                cur_expert_len = m / expert_nums;
+                            } else if (cur_local_expert_id < local_expert_nums) {
+                                cur_expert_len = global_tokens_per_expert_matrix[rank * expert_nums + real_core_idx * local_expert_nums + cur_local_expert_id];
+                            }
+                            sum_in_expert = 0;
+                        } else {
+                            sum_in_expert += remain_data_len;
+                            remain_data_len = 0;
+                        }
+                        src_address += move_data_len;
+                    }
+
+                    SetBuffFlagByAdd(ctrl_flags_UB, (__gm__ int32_t *)buff[real_core_idx] + flag_offset +
+                        FLAG_ONE_IDX, FLAG_VALUE);
+                }
+                if (real_core_idx == rank) {
+                    CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset +
+                        FLAG_ONE_IDX, FLAG_VALUE * received_loop_number);
+                }
+            }
+
+            SetAndWaitAivSync(flag_idx);
+            SetAicSync(flag_idx);
+        }
+
+        if (dequant_granularity == QuantGranularity::PER_TOKEN) {
+            serial_pertoken_dequant_runner.Run();
+        }
+        EndFlagsAndBias();
+    }
+
+public:
+    using CocCommBase::SetAicSync;
+    using CocCommBase::SetAndWaitAivSync;
+
+    using CocCommBase::SetBuffFlag;
+    using CocCommBase::SetBuffFlagByAdd;
+    using CocCommBase::CheckBuffFlag;
+    using CocCommBase::ResetIpcFlags;
+    using CocCommBase::CrossRankSyncV1;
+    using CocCommBase::CrossRankSyncV2;
+
+    using CocCommBase::buff;
+    using CocCommBase::gm_out;
+    using CocCommBase::ctrl_flags_UB;
+    using CocCommBase::output_UB_T;
+    using CocCommBase::batch_size;
+    using CocCommBase::m;
+    using CocCommBase::k;
+    using CocCommBase::n;
+    using CocCommBase::m0;
+    using CocCommBase::k0;
+    using CocCommBase::n0;
+    using CocCommBase::m_loop;
+    using CocCommBase::n_loop;
+    using CocCommBase::k_loop;
+    using CocCommBase::core_loop;
+    using CocCommBase::real_core_idx;
+    using CocCommBase::core_num;
+    using CocCommBase::rank;
+    using CocCommBase::rank_size;
+    using CocCommBase::tiling_key;
+    using CocCommBase::swizzl_direct;
+    using CocCommBase::swizzl_count;
+    using CocCommBase::trans_a;
+    using CocCommBase::trans_b;
+    using CocCommBase::is_int8;
+    using CocCommBase::p_value;
+    using CocCommBase::aiv_idx;
+    using CocCommBase::other_rank;
+    using CocCommBase::max_ub_single_dma_size;
+    using CocCommBase::max_ub_ping_pong_size;
+    using CocCommBase::dequant_granularity;
+    using CocCommBase::dequant_group_size;
+    using CocCommBase::quant_granularity;
+    using CocCommBase::quant_group_size;
+    using CocCommBase::workspace_info;
+    using CocCommBase::withSerialMode;
+
+    using CocCommBase::num_local_tokens_per_expert;
+    using CocCommBase::num_global_tokens_per_local_expert;
+    using CocCommBase::global_tokens_per_expert_matrix;
+
+    using CocCommBase::local_expert_nums;
+    using CocCommBase::TP;
+    using CocCommBase::EP;
+    using CocCommBase::is_moe;
+    using CocCommBase::is_moe_averaged;
+    using CocCommBase::is_alltoallvc;
+    using CocCommBase::is_deterministic;
+    using CocCommBase::maxOutputSize;
+    using CocCommBase::weight_nz;
+
+    using CocCommBase::comm_npu_split;
+    using CocCommBase::comm_data_split;
+    using CocCommBase::comm_direct;
+    using CocCommBase::len_per_loop;
+    using CocCommBase::core_count;
+    using CocCommBase::flag_offset;
+
+    __gm__ int32_t *out_loop_per_ep;
+    __gm__ int32_t *in_loop_per_ep;
+    __gm__ int32_t *sum_num_local_tokens_per_expert;
+    __gm__ 
int32_t *sum_num_global_tokens_per_local_expert; + __gm__ int32_t *expert_comm_count_accum; + + __gm__ float32_t *gm_quant_scale; + + + + int32_t gm_a_pingpong_size; + int32_t m_align; + int32_t k_align; + int32_t n_align; + int32_t aligned_a; + int32_t aligned_b; + + int32_t expert_nums; + + Preprocessor preprocessor; + AllGatherMatmulBiasAdder add_bias_runner; + SerialPerTokenDequantRunner serial_pertoken_dequant_runner; + + bool need_dequant; +}; + + + +template +inline __aicore__ void CocAllToAllVAllGatherAiv(COC_ARGS_FUN(T)){ + AllToAllvAllGather alltoall_allgather_without_bias; + AllToAllvAllGather alltoall_allgather_with_bias; + AllToAllvAllGather alltoall_allgather_int8_without_bias; + AllToAllvAllGather alltoall_allgather_int8_with_bias; + SetAtomicNone(); + SetMaskNormImpl(); + SetSyncBaseAddr((uint64_t)ffts_addr); + SetVectorMask((uint64_t)-1, (uint64_t)-1); + + auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); + auto cocTilingData = ¶->cocTilingData; + int32_t tiling_key = cocTilingData->tilingKey; + int32_t write_to_other_rank = cocTilingData->write2OtherRank; + switch (tiling_key) { + case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : + case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : + alltoall_allgather_without_bias.SetArgs(COC_ARGS_CALL()); + alltoall_allgather_without_bias.Run(); + break; + case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : + case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : + alltoall_allgather_with_bias.SetArgs(COC_ARGS_CALL()); + alltoall_allgather_with_bias.Run(); + break; + case 0b000100 : case 0b100100 : case 0b010100 : case 0b110100 : + case 0b001100 : case 0b101100 : case 0b011100 : case 0b111100 : + alltoall_allgather_int8_without_bias.SetArgs(COC_ARGS_CALL_INT8()); + alltoall_allgather_int8_without_bias.Run(); + break; + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : + alltoall_allgather_int8_with_bias.SetArgs(COC_ARGS_CALL_INT8()); + alltoall_allgather_int8_with_bias.Run(); + break; + default : + break; + } + PipeBarrier(); +} + +#endif diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce b/comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce new file mode 100644 index 00000000..ab8f4469 --- /dev/null +++ b/comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce @@ -0,0 +1,44 @@ +#ifdef __CCE_KT_TEST__ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + + + +#include "coc_ppmatmul_switch.cce" +#include "coc_alltoallv_allgather.cce" +#include "coc_alltoall_allgather_hidden.cce" +#ifdef __DAV_C220_CUBE__ + + +#define COC_ALL_TO_ALL_ALL_GATHER_MATMUL_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllToAllVAllGatherMatmul_##type##_mix_aic(COC_ARGS_FUN(type)){ \ + return CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} +#define COC_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllToAllVAllGatherMatmulHidden_##type##_mix_aic(COC_ARGS_FUN(type)){ \ + return CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + + +#elif __DAV_C220_VEC__ +#define COC_ALL_TO_ALL_ALL_GATHER_MATMUL_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalAllToAllVAllGatherMatmul_##type##_mix_aiv(COC_ARGS_FUN(type)){ \ + return CocAllToAllVAllGatherAiv(COC_ARGS_CALL()); \ +} +#define COC_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void 
LcalAllToAllVAllGatherMatmulHidden_##type##_mix_aiv(COC_ARGS_FUN(type)){ \ + return CocAllToAllVAllGatherHiddenAiv(COC_ARGS_CALL()); \ +} + +#endif + + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_ALL_TO_ALL_ALL_GATHER_MATMUL_FUNC_AUTO_DEF); +COC_TYPE_FUNC(COC_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN_FUNC_AUTO_DEF); + +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce b/comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce new file mode 100644 index 00000000..4d1e93ac --- /dev/null +++ b/comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce @@ -0,0 +1,28 @@ +#ifdef __CCE_KT_TEST__ +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif + + +#include "coc_ppmatmul_switch.cce" +#include "coc_alltoall_reduce_scatter_hidden.cce" +#ifdef __DAV_C220_CUBE__ +#define COC_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulReduceScatterAllToAllVHidden_##type##_mix_aic(COC_ARGS_FUN(type)){ \ + return CocPpmatmulSwitchAic(COC_ARGS_CALL()); \ +} + +#elif __DAV_C220_VEC__ +#define COC_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN_FUNC_AUTO_DEF(type) \ +extern "C" __global__ __aicore__ void LcalMatmulReduceScatterAllToAllVHidden_##type##_mix_aiv(COC_ARGS_FUN(type)){ \ + return CocMatmulAllToAllVReduceScatterHiddenAiv(COC_ARGS_CALL()); \ +} +#endif + + +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // +#define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) + +COC_TYPE_FUNC(COC_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN_FUNC_AUTO_DEF); +#endif \ No newline at end of file diff --git a/comm/lcal/src/kernels/coc_matmulmoe.cce b/comm/lcal/src/kernels/coc_matmulmoe.cce new file mode 100644 index 00000000..5bf45181 --- /dev/null +++ b/comm/lcal/src/kernels/coc_matmulmoe.cce @@ -0,0 +1,1171 @@ +#include "coc_internal.cce" +#include "coc_ppmatmul.cce" +#ifdef __DAV_C220_CUBE__ +template +class PpMatmulMoe : public PpMatmul { + using T_ACCUM = typename GetAccumType::T; + static constexpr bool IS_INT8 = std::is_same::value; +public: + __aicore__ explicit PpMatmulMoe() {}; + inline __aicore__ void SetArgs(PP_MATMUL_AIC_ARGS_FUN(MmadDtype, OutDtype)) + { + PpMatmul::SetArgs(PP_MATMUL_AIC_ARGS_CALL()); + + // moe args + is_moe_averaged = 0; + if (global_tokens_per_expert_matrix != nullptr) { + this -> global_tokens_per_expert_matrix = global_tokens_per_expert_matrix; + } else { + is_moe_averaged = 1; + } + this->local_expert_nums = local_expert_nums; + expert_nums = local_expert_nums * EP; + this->EP = EP; + this->TP = TP; + this->maxOutputSize = maxOutputSize; + + } + + //GMM + inline __aicore__ void CalLoop(int64_t batch_idx, int64_t m_idx, int64_t n_idx, int32_t m_actual, int32_t n_actual, + __gm__ MmadDtype *gm_a_src_tmp, __gm__ MmadDtype *gm_b_src_tmp, int32_t k, int32_t k_all, int32_t expert_dequant_param_offset = 0) { + + + int32_t k_loop = DivCeil(k, k0); + int32_t k_align = Block512B::AlignUp(k); + int32_t k_all_align = Block512B::AlignUp(k_all); + if (k != k_all) { + k_align = k; + } + + int64_t offset_a, offset_b, offset_a_next, offset_b_next; + int32_t m_round, n_round; + if (IS_INT8) { + // directive Restrictions + if (TA) { + m_round = DivCeil(m_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; + } else { + m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } + if (TB) { + n_round = DivCeil(n_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } else { + n_round = 
DivCeil(n_actual, BLOCK_SIZE_32) * BLOCK_SIZE_32; + } + } else { + m_round = DivCeil(m_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + n_round = DivCeil(n_actual, BLOCK_SIZE_16) * BLOCK_SIZE_16; + } + + int32_t mn_max = m_round > n_round ? m_round : n_round; + int32_t k_part_len = L0AB_PINGPONG_BUFFER_LEN / mn_max / block_size * block_size; + if (TA) { + if (aligned_a == 1) { + offset_a = batch_idx * k * m_align + m_idx * m0; + } else { + offset_a = batch_idx * k * m + m_idx * m0; + } + } else { + if (aligned_a == 1) { + offset_a = batch_idx * m * k_align + m_idx * m0 * k_align; + } else { + offset_a = batch_idx * m * k + m_idx * m0 * k; + } + } + + if (TB) { + if (aligned_b == 1) { + offset_b = batch_idx * n * k_all_align + n_idx * n0 * k_all_align; + } else { + if (weight_nz) { + offset_b = n_idx * n0 * block_size; + } else { + offset_b = n_idx * n0 * k_all; + } + } + } else { + if (aligned_b == 1) { + offset_b = batch_idx * k * n_align + n_idx * n0; + } else { + if (weight_nz) { + offset_b = n_idx * n0 * k_align16; + } else { + offset_b = n_idx * n0; + } + } + } + + int64_t dequant_param_offset = n_idx * n0 + expert_dequant_param_offset; + + int32_t k_actual = (k_loop == 1) ? k : k0; + int32_t k_round = DivCeil(k_actual, block_size) * block_size; // int8 :32 fp16 :16 + + auto l1_buf_a = ping_flag ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + auto l1_buf_b = ping_flag ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto l0a_buf = ping_flag ? l0a_base : l0a_base + L0AB_PINGPONG_BUFFER_LEN; + auto l0b_buf = ping_flag ? l0b_base : l0b_base + L0AB_PINGPONG_BUFFER_LEN; + auto event_id = ping_flag ? EVENT_ID0 : EVENT_ID1; + + if (IS_INT8 && has_offset) { + PipeBarrier(); + IntrinsicCopyGmToL1Nd2Nz::move( + ((__cbuf__ int32_t *)bias_l1), + ((__gm__ int32_t *)gm_format_dequant_offset) + dequant_param_offset, + 0, // sid + 1, // ndNum + 1, // nValue + n_actual, // dValue + 0, // srcNdMatrixStride, unused + n, // srcDValue + 1, // dstNzC0Stride + 1, // dstNzNStride + 0 // dstNzMatrixStride, unused + ); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + CopyCubfToBt(((uint64_t)bias_bt), ((__cbuf__ int32_t *)bias_l1), + (uint16_t)0ULL, 1, (n_actual * 4 + 63) / 64, 0, 0); + SetFlag(EVENT_ID1); // bias ready, mte2 can begin move A/B or scalar + SetFlag(EVENT_ID1); // bias ready, mmad can begin + WaitFlag(EVENT_ID1); // A/B or scalar wait moving bias from L1 to BT + + } + + auto gm_src_a = gm_a_src_tmp + offset_a; + //auto gm_src_b = gm_b_src + offset_b; + auto gm_src_b = gm_b_src_tmp + offset_b; + WaitFlag(event_id); + // *** load matrix A to L1 + if (m_actual == 1 && !TA) { + CopyGmToCbuf( + l1_buf_a, + gm_src_a, + 0, // sid + 1, // nBurst + k_round / block_size, // lenBurst + 0, // srcGap + 0, // dstGap + PAD_NONE // padMode + ); + } else { + if (TA) { + auto src_len = m; + if (aligned_a == 1) { + src_len = m_align; + } + CopyGmToL1Nd2zN::move(l1_buf_a, gm_src_a, k_actual, m_actual, src_len, k_round); + } else { + auto src_len = k; + if (aligned_a == 1) { + src_len = k_align; + } + CopyGmToL1Nd2zN::move(l1_buf_a, gm_src_a, m_actual, k_actual, src_len, m_round); + } + } + SetFlag(event_id); + + // *** load matrix B to L1 + WaitFlag(event_id + 2); + if (TB) { + //auto src_len = k; + auto src_len = k_all; + if (aligned_b == 1) { + //src_len = k_align; + src_len = k_all_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(k_actual, block_size); + CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); + } else { + 
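// Editor's note: this ND-layout branch rearranges B from ND to the nZ
+                // (fractal) layout during the MTE2 copy. Judging only from the call
+                // sites in this file, the arguments appear to read as (dst, src,
+                // nd_rows, nd_cols, source leading dimension, rows rounded to the
+                // fractal size) -- a hedged reading, not a documented signature.
+                // With TB == true, B is stored n x k_all, so the call below moves
+                // n_actual rows of k_actual elements, stepping src_len elements
+                // between rows and padding up to n_round rows in L1; conceptually:
+                //     for (int r = 0; r < n_actual; ++r)
+                //         copy k_actual elems from gm_src_b + r * src_len into the
+                //         nZ tile of l1_buf_b (rows padded to n_round).
+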
CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, n_actual, k_actual, src_len, n_round); + } + } else { + auto src_len = n; + if (aligned_b == 1) { + src_len = n_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(n_actual, block_size); + CopyGmToCbuf(l1_buf_b, gm_src_b, 0, num_col, k_actual, k_align16 - k_actual, k_round - k_actual, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b, gm_src_b, k_actual, n_actual, src_len, k_round); + } + } + SetFlag(event_id + 2); + + int mte1_mad_ping_flag = 1; + + for (int64_t k_idx = 0; k_idx < k_loop; k_idx++) { + + int32_t k_actual = (k_idx == (k_loop - 1)) ? (k - k_idx * k0) : k0; + int32_t k_round = DivCeil(k_actual, block_size) * block_size; + int32_t k_part_loop = DivCeil(k_actual, k_part_len); + + __cbuf__ MmadDtype *l1_buf_a = ping_flag ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + __cbuf__ MmadDtype *l1_buf_b = ping_flag ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto event_id = ping_flag ? EVENT_ID0 : EVENT_ID1; + + if (k_idx < k_loop - 1) { + if (TA) { + if (aligned_a == 1) { + offset_a_next = batch_idx * k * m_align + (k_idx + 1) * k0 * m_align + m_idx * m0; + } else { + offset_a_next = batch_idx * k * m + (k_idx + 1) * k0 * m + m_idx * m0; + } + } else { + if (aligned_a == 1) { + offset_a_next = batch_idx * m * k_align + m_idx * m0 * k_align + (k_idx + 1) * k0; + } else { + offset_a_next = batch_idx * m * k + m_idx * m0 * k + (k_idx + 1) * k0; + } + } + if (TB) { + if (aligned_b == 1) { + //offset_b_next = batch_idx * n * k_align + n_idx * n0 * k_align + (k_idx + 1) * k0; + offset_b_next = batch_idx * n * k_align + n_idx * n0 * k_all_align + (k_idx + 1) * k0; + } else { + if (weight_nz) { + offset_b_next = batch_idx * n * k + (k_idx + 1) * k0 * n_align16 + n_idx * n0 * block_size; + } else { + offset_b_next = batch_idx * n * k + n_idx * n0 * k_all + (k_idx + 1) * k0; + } + //offset_b_next = batch_idx * n * k + n_idx * n0 * k + (k_idx + 1) * k0; + //offset_b_next = batch_idx * n * k + n_idx * n0 * k_all + (k_idx + 1) * k0; + } + } else { + if (aligned_b == 1) { + offset_b_next = batch_idx * k * n_align + (k_idx + 1) * k0 * n_align + n_idx * n0; + } else { + //offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * n + n_idx * n0; + if (weight_nz) { + offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * block_size + n_idx * n0 * k_align16; + } else { + offset_b_next = batch_idx * k * n + (k_idx + 1) * k0 * n + n_idx * n0; + } + } + } + + int32_t k_actual_next = ((k_idx + 1) == (k_loop - 1)) ? (k - (k_idx + 1) * k0) : k0; + int32_t k_round_next = DivCeil(k_actual_next, block_size) * block_size; + + __cbuf__ MmadDtype *l1_buf_a_next = (1 - ping_flag) ? l1_base_a : l1_base_a + L1_PINGPONG_BUFFER_LEN; + __cbuf__ MmadDtype *l1_buf_b_next = (1 - ping_flag) ? l1_base_b : l1_base_b + L1_PINGPONG_BUFFER_LEN; + auto event_id_next = (1 - ping_flag) ? 
EVENT_ID0 : EVENT_ID1; + + auto gm_src_a = gm_a_src_tmp + offset_a_next; + //auto gm_src_b = gm_b_src + offset_b_next; + auto gm_src_b = gm_b_src_tmp + offset_b_next; + WaitFlag(event_id_next); + // *** load matrix A to L1 + if (m_actual == 1 && !TA) { + CopyGmToCbuf( + l1_buf_a_next, + gm_src_a, + 0, // sid + 1, // nBurst + k_round_next / block_size, // lenBurst + 0, // srcGap + 0, // dstGap + PAD_NONE // padMode + ); + } else { + if (TA) { + auto src_len = m; + if (aligned_a == 1) { + src_len = m_align; + } + CopyGmToL1Nd2zN::move( + l1_buf_a_next, gm_src_a, k_actual_next, m_actual, src_len, k_round_next); + } else { + auto src_len = k; + if (aligned_a == 1) { + src_len = k_align; + } + CopyGmToL1Nd2zN::move( + l1_buf_a_next, gm_src_a, m_actual, k_actual_next, src_len, m_round); + } + } + SetFlag(event_id_next); + + // *** load matrix B to L1 + WaitFlag(event_id_next + 2); + if (TB) { + //auto src_len = k; + auto src_len = k_all; + if (aligned_b == 1) { + //src_len = k_align; + src_len = k_all_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(k_actual_next, block_size); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, n_actual, n_align16 - n_actual, n_round - n_actual, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, n_actual, k_actual_next, src_len, n_round); + } + // CopyGmToL1Nd2zN::move( + // l1_buf_b_next, gm_src_b, n_actual, k_actual_next, src_len, n_round); + } else { + auto src_len = n; + if (aligned_b == 1) { + src_len = n_align; + } + if (weight_nz) { + int32_t num_col = DivCeil(n_actual, block_size); + CopyGmToCbuf(l1_buf_b_next, gm_src_b, 0, num_col, k_actual_next, k_align16 - k_actual_next, k_round_next - k_actual_next, PAD_NONE); + } else { + CopyGmToL1Nd2zN::move(l1_buf_b_next, gm_src_b, k_actual_next, n_actual, src_len, k_round_next); + } + // CopyGmToL1Nd2zN::move( + // l1_buf_b_next, gm_src_b, k_actual_next, n_actual, src_len, k_round_next); + } + SetFlag(event_id_next + 2); + } + + for (int k_part_idx = 0; k_part_idx < k_part_loop; k_part_idx++) { + int32_t k0_round = (k_part_idx < k_part_loop - 1) ? + k_part_len : k_round - k_part_idx * k_part_len; + int32_t k0_actual = (k_part_idx < k_part_loop - 1) ? + k_part_len : k_actual - k_part_idx * k_part_len; + + auto mte1_mad_event_id = mte1_mad_ping_flag ? 
EVENT_ID0 : EVENT_ID1; + auto l0a_buf = l0a_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; + auto l0b_buf = l0b_base + (1 - mte1_mad_ping_flag) * L0AB_PINGPONG_BUFFER_LEN; + + // *** load matrix A from L1 to L0A + if (k_part_idx == 0) { + WaitFlag(event_id); + } + WaitFlag(mte1_mad_event_id); + if (m_actual == 1 && !TA) { + LoadCbufToCa( + l0a_buf, + l1_buf_a + k_part_idx * k_part_len, + 0, // baseIdx + DivCeil(k0_round, cube_matrix_size), // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t + ); + } else { + if (TA) { + if (IS_INT8) { + for (int i = 0; i < m_round / BLOCK_SIZE_32; i++) { + LoadCbufToCaTranspose( + l0a_buf + i * k0_round * BLOCK_SIZE_32, + l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_32 + + i * k_round * BLOCK_SIZE_32, + 0, // baseIdx + k0_round / BLOCK_SIZE_32, // repeat + 1, // srcStride + 0, // dstStride + 0, // addrmode + k0_round / BLOCK_SIZE_32 - 1 // dstFracStride + ); + } + } else { + for (int i = 0; i < m_round / BLOCK_SIZE_16; i++) { + LoadCbufToCa( + l0a_buf + i * k0_round * BLOCK_SIZE_16, + l1_buf_a + k_part_idx * k_part_len * BLOCK_SIZE_16 + + i * k_round * BLOCK_SIZE_16, + 0, // baseIdx + k0_round / BLOCK_SIZE_16, // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + true, // transpose + inc // addr_cal_mode_t + ); + } + } + } else { + for (int32_t i = 0; i < k0_round / block_size; i++) { + LoadCbufToCa( + l0a_buf + i * cube_matrix_size, + l1_buf_a + k_part_idx * k_part_len * m_round + + i * m_round * block_size, + 0, // baseIdx + m_round / BLOCK_SIZE_16, // repeat + 1, // srcStride + k0_round / block_size - 1, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t + ); + } + } + } + if (k_part_idx == k_part_loop - 1) { + SetFlag(event_id); + } + + // *** load matrix B from L1 to L0B + if (k_part_idx == 0) { + WaitFlag(event_id + 2); + } + if (TB) { + LoadCbufToCb( + l0b_buf, + l1_buf_b + k_part_idx * k_part_len * n_round, + 0, // baseIdx + k0_round * n_round / cube_matrix_size, // repeat + 1, // srcStride + 0, // dstStride + 0, // sid + false, // transpose + inc // addr_cal_mode_t + ); + } else { + if (IS_INT8) { + for (int32_t i = 0; i < k0_round / BLOCK_SIZE_32; i++) { + LoadCbufToCbTranspose( + l0b_buf + i * ((n_actual + 15) / 16 * 16) * BLOCK_SIZE_32, + l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_32) * BLOCK_SIZE_32, + 0, // baseIdx + n_round / BLOCK_SIZE_32, // repeat + k_round / BLOCK_SIZE_32, // srcStride + 1, // dstStride + 0, // addrmode + 0 // dstFracStride + ); + } + } else { + for (int32_t i = 0; i < k0_round / BLOCK_SIZE_16; i++) { + LoadCbufToCb( + l0b_buf + i * n_round * BLOCK_SIZE_16, + l1_buf_b + (k_part_idx * k_part_len + i * BLOCK_SIZE_16) * BLOCK_SIZE_16, + 0, // baseIdx + n_round / BLOCK_SIZE_16, // repeat + k_round / BLOCK_SIZE_16, // srcStride + 0, // dstStride + 0, // sid + true, // transpose + inc // addr_cal_mode_t + ); + } + } + } + if (k_part_idx == k_part_loop - 1) { + SetFlag(event_id + 2); + } + + SetFlag(mte1_mad_event_id); + WaitFlag(mte1_mad_event_id); + + bool init_c = (k_idx == 0 && k_part_idx == 0); + if (init_c) { + WaitFlag(EVENT_ID0); + } + + if (IS_INT8 && has_offset) { + if (init_c) { + WaitFlag(EVENT_ID1); // wait move bias fron L1 to BT + } + PipeBarrier(); + if (m != 1 && m_actual == 1 && TA) { + mad((__cc__ int32_t *)l0c_buf, + (__ca__ int8_t *)l0a_buf, + (__cb__ int8_t *)l0b_buf, + ((uint64_t)bias_bt), + 16, // m + k0_actual, // k + n_actual, // n + 0, // unitFlag + 0, // kDirectionAlign + init_c, // cmatrixSource 
add C from BT + 0 // cmatrixInitVal + ); + } else { + mad((__cc__ int32_t *)l0c_buf, + (__ca__ int8_t *)l0a_buf, + (__cb__ int8_t *)l0b_buf, + ((uint64_t)bias_bt), + m_actual, // m + k0_actual, // k + n_actual, // n + 0, // unitFlag + 0, // kDirectionAlign + init_c, // cmatrixSource add C from BT + 0 // cmatrixInitVal + ); + } + //has_offset = 0; + } else { + PipeBarrier(); + if (m != 1 && m_actual == 1 && TA) { + mad(l0c_buf, + l0a_buf, + l0b_buf, + 16, // m + k0_actual, // k + n_actual, // n + 0, // unitFlag + 0, // kDirectionAlign + 0, // cmatrixSource + init_c // cmatrixInitVal + ); + } else { + mad(l0c_buf, + l0a_buf, + l0b_buf, + m_actual, // m + k0_actual, // k + n_actual, // n + 0, // unitFlag + 0, // kDirectionAlign + 0, // cmatrixSource + init_c // cmatrixInitVal + ); + } + } + PipeBarrier(); + SetFlag(mte1_mad_event_id); + + mte1_mad_ping_flag = 1 - mte1_mad_ping_flag; + } + ping_flag = 1 - ping_flag; + } + + + if (IS_INT8 && std::is_same::value && (dequant_granularity == QuantGranularity::PER_CHANNEL || + dequant_granularity == QuantGranularity::PER_TOKEN)) { + //if (IS_INT8 && std::is_same::value && (dequant_granularity == QuantGranularity::PER_CHANNEL)) { + WaitFlag(EVENT_ID0); + PipeBarrier(); + CopyGmToCbuf( + scale_l1, + gm_dequant_scale + dequant_param_offset, + 0, + 1, + (n_actual * sizeof(int64_t) + 31) / 32, + 0, + 0, + PAD_NONE + ); + SetFlag(EVENT_ID0); + + WaitFlag(EVENT_ID0); + + copy_cbuf_to_fbuf( + scale_FB, + scale_l1, + 1, + (n_actual * sizeof(int64_t) + 127) / 128, + 0, + 0 + ); + PipeBarrier(); + } + } + + inline __aicore__ void MoveL0CToGM(__gm__ OutDtype *gm_dst, int64_t offset_c, int64_t offset_l0c, int32_t m_actual, int32_t n_actual, int32_t src_stride, int32_t dst_stride) { + #if (__CCE_AICORE__ == 220) + FixpipeParamsV220 FixpipeParams( + n_actual, // nSize = nSizeIn; + m_actual, // mSize = mSizeIn; + src_stride, // srcStride = srcStrideIn; + dst_stride, // dstStride = dstStrideIn; + false // reluEn = reluEnIn; + ); + #elif (defined(__DAV_C310__)) + FixpipeParamsC310 FixpipeParams( + n_actual, // nSize = nSizeIn; + m_actual, // mSize = mSizeIn; + src_stride, // srcStride = srcStrideIn; + dst_stride // dstStride = dstStrideIn; + ); + #endif + LocalTensor srcTensor = CreateLocalTensor + (reinterpret_cast(l0c_buf + offset_l0c), static_cast(TPosition::CO1)); + GlobalTensor dstTensor = CreateGlobalTensor(gm_dst + offset_c); + + if (IS_INT8) { + if constexpr (std::is_same::value) { + if (dequant_granularity == QuantGranularity::PER_CHANNEL || dequant_granularity == QuantGranularity::PER_TOKEN) { + SetFpc(scale_FB); + FixpipeParams.quantPre = VDEQF16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + //SetFlag(EVENT_ID0); + } else if (dequant_granularity == QuantGranularity::PER_TENSOR) { + FixpipeParams.quantPre = DEQF16; + FixpipeParams.deqScalar = gm_dequant_scale[0]; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } + } else if constexpr (std::is_same::value) { + GlobalTensor dstAccum = CreateGlobalTensor(gm_accum + offset_c); + Fixpipe(dstAccum, srcTensor, FixpipeParams); + } + } else { + if constexpr (std::is_same::value) { + FixpipeParams.quantPre = F322BF16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } else { + FixpipeParams.quantPre = F322F16; + Fixpipe(dstTensor, srcTensor, FixpipeParams); + } + } + } + + + + inline __aicore__ void RunAllToAllAllGatherMatmul(){ + InitFlags(); + int32_t k_actual; + if (aligned_a){ + k_actual = k_align; + } else { + k_actual = k; + } + + int64_t gm_a_pingpong_size = m0 * k_align * p_value * rank_size; + + 
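+ // Each communication round moves comm_m = p_value * m0 token rows per rank
+ // (e.g. p_value = 4 and m0 = 128 give 512 rows per round); comm_count below is
+ // sized by the largest per-EP send/receive token volume, so every rank executes
+ // the same number of rounds.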
int32_t comm_m = m0 * p_value; + int32_t comm_count; + if (is_moe_averaged) { + comm_count = DivCeil(m / EP, comm_m); + } else { + int32_t max_comm_count = 0; + int32_t max_input_per_ep = 0; + int32_t max_output_per_ep = 0; + for (int32_t ep_idx = 0; ep_idx < EP; ep_idx++) { + int32_t tmp_sum = 0; + for(int32_t i = 0; i < local_expert_nums; i++) { + tmp_sum += global_tokens_per_expert_matrix[rank * expert_nums + ep_idx * local_expert_nums + i]; + } + max_output_per_ep = max(max_output_per_ep, tmp_sum); + tmp_sum = 0; + for(int32_t i = 0; i < local_expert_nums; i++) { + tmp_sum += global_tokens_per_expert_matrix[ep_idx * expert_nums + rank * local_expert_nums + i]; + } + max_input_per_ep = max(max_input_per_ep, tmp_sum); + max_comm_count = max(max_comm_count, max(max_output_per_ep, max_input_per_ep)); + } + comm_count = DivCeil(max_comm_count, comm_m); + } + int32_t in_expert_offset[16 * 16] = {0}; // [i][j]: current write offset of rank j within expert i + int32_t data_len_in_expert_from_rank[16] = {0}; // number of tokens the current expert receives from rank[i] + int32_t before_expert_offset_dst[16] = {0}; + int32_t before_rank_offset[16 * 16] = {0}; + + for (int32_t local_expert_idx = 0; local_expert_idx < local_expert_nums; local_expert_idx++) { + int32_t expert_idx = rank * local_expert_nums + local_expert_idx; + if (is_moe_averaged) { + before_expert_offset_dst[local_expert_idx] = local_expert_idx * (m / local_expert_nums); + } else { + for(int32_t i = 0; i < local_expert_idx; i++) { + for(int32_t j = 0; j < rank_size; j++) { + before_expert_offset_dst[local_expert_idx] += global_tokens_per_expert_matrix[j * expert_nums + i + rank * local_expert_nums]; + } + } + } + + for (int i = 0; i < rank_size - 1; i++) { + int32_t tmp_len; + if (is_moe_averaged) { + tmp_len = m / local_expert_nums / rank_size; + } else { + tmp_len = global_tokens_per_expert_matrix[i * expert_nums + expert_idx]; + } + before_rank_offset[local_expert_idx * rank_size + i + 1] = before_rank_offset[local_expert_idx * rank_size + i] + tmp_len; + } + } + int32_t out_this_rank[16] = {0}; + for(int32_t i = 0; i < rank_size; i++) { + for (int32_t j = 0; j < local_expert_nums; j++) { + if (is_moe_averaged) { + out_this_rank[i] = m / EP; + } else { + out_this_rank[i] += global_tokens_per_expert_matrix[i * expert_nums + j + rank * local_expert_nums]; + } + } + } + + int32_t sum_loop = 0; + for(int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++){ + uint64_t flag_id = comm_idx % MAX_BLOCK_COUNT; + WaitEvent(flag_id); + + for (int32_t local_expert_idx = 0; local_expert_idx < local_expert_nums; local_expert_idx++) { + int32_t expert_idx = rank * local_expert_nums + local_expert_idx; + int32_t expert_num_this_com = 0; // number of tokens this expert receives in this communication round + int32_t before_expert_offset_src = 0; // offset of this expert's data in shared memory for this round + for(int32_t i = 0; i < rank_size; i++) { + int32_t data_len; + if ((comm_idx + 1) * comm_m >= out_this_rank[i]) { + data_len = out_this_rank[i] - comm_idx * comm_m; + } else { + data_len = comm_m; + } + + // prefix sum of per-expert token counts + int32_t sum = 0; + for(int32_t j = 0; j <= local_expert_idx; j++) { + int32_t tmp_expert_id = rank * local_expert_nums + j; + int32_t expert_token_num; + if (is_moe_averaged) { + expert_token_num = (m / expert_nums); + } else { + expert_token_num = global_tokens_per_expert_matrix[i * expert_nums + tmp_expert_id]; + } + + if (comm_idx * comm_m < sum + expert_token_num && comm_idx * comm_m + data_len > sum) { + int32_t tmp_len = min(comm_idx * comm_m + data_len, sum + expert_token_num) - + max(comm_idx * comm_m, sum); + if (j < 
local_expert_idx) { + before_expert_offset_src += tmp_len; + } else { + expert_num_this_com += tmp_len; + data_len_in_expert_from_rank[i] = tmp_len; + } + } else { + if (j == local_expert_idx) { + data_len_in_expert_from_rank[i] = 0; + } + } + sum += expert_token_num; + } + } + + int32_t m_loop_in_expert = DivCeil(expert_num_this_com, m0); + int32_t loop_in_expert = m_loop_in_expert * n_loop; + for(int32_t loop_idx = 0; loop_idx < loop_in_expert; loop_idx ++) { + if ((loop_idx + sum_loop) % core_num != core_idx) { + continue; + } + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, m_loop_in_expert, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t m_actual = (m_idx == m_loop_in_expert - 1) ? expert_num_this_com - m_idx * m0 : m0; + int32_t n_actual = (n_idx == n_loop - 1) ? n - n_idx * n0 : n0; + __gm__ MmadDtype *gm_peer_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) + + flag_id * gm_a_pingpong_size + + before_expert_offset_src * k_align; + + __gm__ MmadDtype *gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k * n_align; + if(TB){ + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align * n; + } + if (weight_nz) { + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align16 * n_align16; + } + CalLoop(0, m_idx, n_idx, m_actual, n_actual, gm_peer_mem_st, gm_b_src_tmp, k, k, n * local_expert_idx); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int64_t rank_offset = 0; + int32_t l0c_offset = 0; + for (int32_t src_rank_id = 0; src_rank_id < rank_size; src_rank_id ++) { + if (rank_offset + data_len_in_expert_from_rank[src_rank_id] > m_idx * m0 && + rank_offset < m_idx * m0 + m_actual) { + int32_t rank_m_actual = min(m_idx * m0 + m_actual, rank_offset + data_len_in_expert_from_rank[src_rank_id]) - + max(rank_offset, m_idx * m0); + int32_t dst_stride = n; + int32_t tmp_in_rank_offset = 0; + if (m_idx * m0 > rank_offset) { + tmp_in_rank_offset = m_idx * m0 - rank_offset; + } + int64_t offset_c = before_expert_offset_dst[local_expert_idx] * n + before_rank_offset[local_expert_idx * rank_size + src_rank_id] * n + + in_expert_offset[local_expert_idx * rank_size + src_rank_id] * n + + tmp_in_rank_offset * n + + n_idx * n0; + int32_t src_stride = (m_actual + 15) / 16 * 16; + int32_t real_rank_m_actual = rank_m_actual; + int64_t m_offset_c = before_expert_offset_dst[local_expert_idx] + before_rank_offset[local_expert_idx * rank_size + src_rank_id] + + in_expert_offset[local_expert_idx * rank_size + src_rank_id] + tmp_in_rank_offset; + if (maxOutputSize > 0) { + if (maxOutputSize <= m_offset_c) { + real_rank_m_actual = 0; + } else if (m_offset_c + real_rank_m_actual > maxOutputSize) { + real_rank_m_actual = maxOutputSize - m_offset_c; + } + } + if (real_rank_m_actual > 0) { + MoveL0CToGM(gm_c, offset_c, l0c_offset, real_rank_m_actual, n_actual, src_stride, dst_stride); + } + l0c_offset += rank_m_actual * 16; + } + rank_offset += data_len_in_expert_from_rank[src_rank_id]; + } + + if (IS_INT8) { + if constexpr (std::is_same::value) { + if (dequant_granularity == QuantGranularity::PER_CHANNEL || dequant_granularity == QuantGranularity::PER_TOKEN) { + SetFlag(EVENT_ID0); + } + } + } + SetFlag(EVENT_ID0); + if (IS_INT8 && has_offset) { + SetFlag(EVENT_ID1); + } + } + + for (int32_t i = 0; i < rank_size; i ++) { + in_expert_offset[local_expert_idx * rank_size + i] += data_len_in_expert_from_rank[i]; + } + sum_loop += loop_in_expert; + } + FFTSCrossCoreSync(2, flag_id); + } + Endflags(); + PipeBarrier(); + } + + + inline __aicore__ void RunAllToAllAllGatherMatmulHidden(){ + 
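+ // Hidden-dimension variant: communication is pipelined along k in slices of
+ // comm_k = k0 * p_value, and the partial matmul results of each slice are
+ // accumulated into the output with atomic add from the second slice onward
+ // (see SetAtomicAdd below).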
InitFlags(); + int32_t max_m; + int32_t sum_m[16] = {0}; + int32_t sum_m_loop = 0; + if(is_moe_averaged) { + sum_m_loop = DivCeil((m / expert_nums) * EP, m0) * local_expert_nums; + max_m = m; + } else { + if (maxOutputSize == -1) { + max_m = 0; + for(int32_t ep_idx = 0; ep_idx < EP; ep_idx++) { + int32_t sum_m_ep = 0; + for(int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id++) { + int32_t expert_id = local_expert_id + ep_idx * local_expert_nums; + for(int32_t i = 0; i < EP; i++) { + sum_m_ep += global_tokens_per_expert_matrix[i * expert_nums + expert_id]; + } + } + max_m = max(max_m, sum_m_ep); + } + } else { + max_m = maxOutputSize; + } + for(int32_t i = 0; i < local_expert_nums; i++){ + int32_t last_sum_m = (i == 0 ? 0 : sum_m[i - 1]); + for(int j = 0; j < EP; j++) { + sum_m[i] += global_tokens_per_expert_matrix[j * expert_nums + rank * local_expert_nums + i]; + // i.e. global_tokens_per_expert_matrix[j][rank * local_expert_nums + i] + } + if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize) { + sum_m[i] = maxOutputSize - last_sum_m; + } + sum_m_loop += DivCeil(sum_m[i], m0); + sum_m[i] += (i == 0 ? 0 : sum_m[i - 1]); + } + } + + int32_t comm_k = k0 * p_value; + int64_t gm_a_pingpong_size = comm_k * max_m; + int64_t gm_a_pingpong_num = buffer_size * 1024 * 1024 / sizeof(MmadDtype) / gm_a_pingpong_size; + if (gm_a_pingpong_num > 8) { + gm_a_pingpong_num = 8; + } + int32_t comm_count = DivCeil(k, comm_k); + int32_t sum_loop_num = sum_m_loop * n_loop; + int32_t sum_loop = 0; + for(int32_t comm_idx = 0; comm_idx < comm_count; comm_idx++){ + if (comm_idx == 1) { + PipeBarrier(); + SetAtomicAdd(); + PipeBarrier(); + } + int32_t k_len; + if(comm_idx == comm_count - 1) { + k_len = k - comm_idx * comm_k; + } else { + k_len = comm_k; + } + if (comm_idx == 1) { + PipeBarrier(); + SetAtomicAdd(); + PipeBarrier(); + FFTSCrossCoreSync(0, AIC_FINISH_MATMUL_FLAG_ID); + WaitEvent(AIC_FINISH_MATMUL_FLAG_ID); + } + + uint64_t flag_id = comm_idx % gm_a_pingpong_num; + WaitEvent(flag_id); + for(int32_t loop_idx = 0; loop_idx < sum_loop_num; loop_idx++) { + if((loop_idx + sum_loop) % core_num != core_idx) { + continue; + } + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, sum_m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + /* + 1. Determine which expert (m_idx, n_idx) belongs to. + 2. Then compute the coordinates within that expert. + */ + int32_t sum_loop_before = 0; + int32_t local_expert_idx = -1; + int32_t m_in_expert; + for(int32_t i = 0; i < local_expert_nums; i++) { + if(is_moe_averaged) { + m_in_expert = m / local_expert_nums; + } else { + m_in_expert = sum_m[i] - (i == 0 ? 0 : sum_m[i - 1]); + } + sum_loop_before += DivCeil(m_in_expert, m0); + if(sum_loop_before > m_idx) { + local_expert_idx = i; + break; + } + } + int32_t m_loop_in_expert = DivCeil(m_in_expert, m0); + sum_loop_before -= m_loop_in_expert; + int32_t m_idx_in_expert = m_idx - sum_loop_before; + int32_t m_actual = (m_idx_in_expert == m_loop_in_expert - 1 ? m_in_expert - m_idx_in_expert * m0 : m0); + int32_t n_actual = (n_idx == n_loop - 1) ? 
n - n_idx * n0 : n0; + int32_t sum_m_before; + if(is_moe_averaged) { + sum_m_before = local_expert_idx * (m / local_expert_nums); + } else { + sum_m_before = sum_m[local_expert_idx] - m_in_expert; + } + + __gm__ MmadDtype *gm_peer_mem_st = reinterpret_cast<__gm__ MmadDtype *>(gm_peer_mem) + + 1LL * flag_id * gm_a_pingpong_size + + 1LL * sum_m_before * k_len; + __gm__ MmadDtype *gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k * n_align + 1LL * comm_idx * comm_k * n_align; + //__gm__ MmadDtype *gm_b_src_tmp = gm_b_src; + if(TB){ + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align * n + 1LL * comm_idx * comm_k; + } + if (weight_nz) { + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align16 * n_align16 + 1LL * comm_idx * comm_k * block_size; + //gm_b_src_tmp = gm_b_src; + } + //CalLoop(0, m_idx_in_expert, n_idx, m_actual, n_actual, gm_peer_mem_st, gm_b_src_tmp); + CalLoop(0, m_idx_in_expert, n_idx, m_actual, n_actual, gm_peer_mem_st, gm_b_src_tmp, k_len, k_align, n * local_expert_idx); + //CalLoop(local_expert_idx, m_idx_in_expert, n_idx, m_actual, n_actual, gm_peer_mem_st, gm_b_src_tmp, k_len, k); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + int64_t offset_c = 1LL * sum_m_before * n + 1LL * m_idx_in_expert * m0 * n + 1LL * n_idx * n0; + MoveL0CToGM(gm_c, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, n); + } + sum_loop += sum_loop_num; + has_offset = 0; + FFTSCrossCoreSync(2, flag_id); + } + PipeBarrier(); + SetAtomicNone(); + PipeBarrier(); + + Endflags(); + PipeBarrier(); + } + + + inline __aicore__ void RunMatmulReduceScatterAllToAllHidden(){ + InitFlags(); + int32_t comm_n = p_value * n0; + int32_t cal_count = DivCeil(n, comm_n); + int32_t max_m; + int32_t sum_m[16] = {0}; + int32_t sum_m_loop = 0; + if(is_moe_averaged) { + sum_m_loop = DivCeil((m / expert_nums) * EP, m0) * local_expert_nums; + max_m = m; + } else { + if (maxOutputSize == -1) { + max_m = 0; + for(int32_t ep_idx = 0; ep_idx < EP; ep_idx ++) { + int32_t sum_m_ep = 0; + for(int32_t local_expert_id = 0; local_expert_id < local_expert_nums; local_expert_id ++) { + int32_t expert_id = local_expert_id + ep_idx * local_expert_nums; + for(int32_t i = 0; i < EP; i++) { + sum_m_ep += global_tokens_per_expert_matrix[i * expert_nums + expert_id]; + } + } + max_m = max(max_m, sum_m_ep); + } + } else { + max_m = maxOutputSize; + } + for(int32_t i = 0; i < local_expert_nums; i++){ + int32_t last_sum_m = (i == 0 ? 0 : sum_m[i - 1]); + for(int j = 0; j < EP; j++) { + sum_m[i] += global_tokens_per_expert_matrix[j * expert_nums + rank * local_expert_nums + i]; + //global_tokens_per_expert_matrix[j][rank * local_expert_nums + i] + } + if (maxOutputSize > 0 && sum_m[i] + last_sum_m > maxOutputSize) { + sum_m[i] = maxOutputSize - last_sum_m; + } + sum_m_loop += DivCeil(sum_m[i], m0); + sum_m[i] += (i == 0 ? 
0 : sum_m[i - 1]); + } + } + + int64_t gm_a_pingpong_size = comm_n * max_m; + int64_t gm_a_pingpong_num = buffer_size * 1024 * 1024 / 2 / gm_a_pingpong_size; + if (gm_a_pingpong_num > 8) { + gm_a_pingpong_num = 8; + } + int32_t sum_loop = 0; + for (int32_t cal_idx = 0; cal_idx < cal_count; cal_idx++) { + int32_t n_len; + if(cal_idx == cal_count - 1) { + n_len = n - cal_idx * comm_n; + } else { + n_len = comm_n; + } + n_loop = DivCeil(n_len,n0); + int32_t sum_loop_num = sum_m_loop * n_loop; + int32_t flag_id = cal_idx % gm_a_pingpong_num; + WaitEvent(flag_id); + for(int32_t loop_idx = 0; loop_idx < sum_loop_num; loop_idx ++) { + if((loop_idx + sum_loop) % core_num != core_idx) { + continue; + } + int64_t m_idx, n_idx; + GetBlockIdx(loop_idx, sum_m_loop, n_loop, swizzl_direct, swizzl_count, m_idx, n_idx); + int32_t sum_loop_before = 0; + int32_t local_expert_idx = -1; + int32_t m_in_expert; + for(int32_t i = 0; i < local_expert_nums; i++) { + if(is_moe_averaged) { + m_in_expert = m / local_expert_nums; + } else { + m_in_expert = sum_m[i] - (i == 0 ? 0 : sum_m[i - 1]); + } + sum_loop_before += DivCeil(m_in_expert, m0); + if(sum_loop_before > m_idx) { + local_expert_idx = i; + break; + } + } + int32_t m_loop_in_expert = DivCeil(m_in_expert, m0); + sum_loop_before -= m_loop_in_expert; + int32_t m_idx_in_expert = m_idx - sum_loop_before; + int32_t m_actual = ((m_idx_in_expert == m_loop_in_expert - 1) ? (m_in_expert - m_idx_in_expert * m0) : m0); + int32_t n_actual = ((n_idx == n_loop - 1) ? (n_len - n_idx * n0) : n0); + + int32_t sum_m_before = 0; + if(is_moe_averaged) { + sum_m_before = local_expert_idx * (m / local_expert_nums); + } else { + sum_m_before = sum_m[local_expert_idx] - m_in_expert; + } + __gm__ MmadDtype *gm_a_src_inner = gm_a_src + 1LL * sum_m_before * k_align; + __gm__ MmadDtype *gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k * n_align; + // __gm__ MmadDtype *gm_b_src_tmp = gm_b_src; + if(TB){ + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align * n; + } + if (weight_nz) { + gm_b_src_tmp = gm_b_src + 1LL * local_expert_idx * k_align16 * n_align16; + //gm_b_src_tmp = gm_b_src; + } + + int32_t real_n_idx = n_idx + cal_idx * comm_n / n0; + CalLoop(0, m_idx_in_expert, real_n_idx, m_actual, n_actual, gm_a_src_inner, gm_b_src_tmp, k, k, n * local_expert_idx); + //CalLoop(0, m_idx_in_expert, real_n_idx, m_actual, n_actual, gm_a_src_inner); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + + int32_t dst_stride = n_len; + __gm__ OutDtype *gm_out = reinterpret_cast<__gm__ OutDtype *>(gm_peer_mem); + int32_t offset_c = flag_id * gm_a_pingpong_size + 1LL * (sum_m_before + m_idx_in_expert * m0) * n_len + 1LL * n_idx * n0; + MoveL0CToGM(gm_out, offset_c, m_actual, n_actual, (m_actual + 15) / 16 * 16, dst_stride); + } + sum_loop += sum_loop_num; + FFTSCrossCoreSync(2, flag_id); + } + Endflags(); + PipeBarrier(); + } + + inline __aicore__ void Run() { + + if(RUN_TYPE == PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL_HIDDEN) { + RunAllToAllAllGatherMatmulHidden(); + } else if(RUN_TYPE == PPMATMUL_RUN_MATMUL_REDUCE_SCATTER_ALL_TO_ALL_HIDDEN){ + RunMatmulReduceScatterAllToAllHidden(); + } else if (RUN_TYPE == PPMATMUL_RUN_ALL_TO_ALL_ALL_GATHER_MATMUL) { + RunAllToAllAllGatherMatmul(); + } + } + using PpMatmul::gm_a_src; + using PpMatmul::gm_b_src; + using PpMatmul::gm_c; + using PpMatmul::gm_peer_mem; + using PpMatmul::gm_dequant_scale; + using PpMatmul::gm_format_dequant_offset; + using PpMatmul::gm_accum; + using PpMatmul::l1_base_a; + using PpMatmul::l1_base_b; + using PpMatmul::l0a_base; 
+ using PpMatmul::l0b_base; + using PpMatmul::l0c_buf; + using PpMatmul::scale_l1; + using PpMatmul::scale_FB; + using PpMatmul::bias_l1; + using PpMatmul::bias_bt; + using PpMatmul::has_offset; + using PpMatmul::core_num; + using PpMatmul::batch_size; + using PpMatmul::m; + using PpMatmul::k; + using PpMatmul::n; + using PpMatmul::m_align; + using PpMatmul::k_align; + using PpMatmul::n_align; + using PpMatmul::k_align16; + using PpMatmul::n_align16; + using PpMatmul::m0; + using PpMatmul::k0; + using PpMatmul::n0; + using PpMatmul::m_loop; + using PpMatmul::n_loop; + using PpMatmul::k_loop; + using PpMatmul::core_loop; + using PpMatmul::core_idx; + using PpMatmul::ping_flag; + using PpMatmul::block_size; + using PpMatmul::cube_matrix_size; + using PpMatmul::aligned_a; + using PpMatmul::aligned_b; + using PpMatmul::swizzl_count; + using PpMatmul::swizzl_direct; + using PpMatmul::L1_PINGPONG_BUFFER_LEN; + using PpMatmul::L0AB_PINGPONG_BUFFER_LEN; + using PpMatmul::rank; + using PpMatmul::rank_size; + using PpMatmul::p_value; + using PpMatmul::loop_num_per_comm; + using PpMatmul::InitFlags; + using PpMatmul::Endflags; + using PpMatmul::MoveL0CToGM; + using PpMatmul::dequant_granularity; + using PpMatmul::workspace_info; + using PpMatmul::withSerialMode; + using PpMatmul::weight_nz; + using PpMatmul::CalLoop; + using PpMatmul::buffer_size; + +private: + int32_t EP; + int32_t TP; + int32_t maxOutputSize; + __gm__ int32_t *num_local_tokens_per_expert; + __gm__ int32_t *num_global_tokens_per_local_expert; + __gm__ int32_t * global_tokens_per_expert_matrix; + + __gm__ int32_t* gm_out_loop_per_expert; + __gm__ int32_t* gm_in_loop_per_expert; + __gm__ int32_t* gm_out_loop_per_EP; + __gm__ int32_t* gm_in_loop_per_EP; + __gm__ int32_t* gm_sum_num_local_tokens_per_expert; + __gm__ int32_t* gm_sum_num_global_tokens_per_local_expert; + __gm__ int32_t* gm_num_local_tokens_per_expert; + __gm__ int32_t* gm_num_global_tokens_per_local_expert; + __gm__ int32_t *gm_in_expert_comm_count_accum; + __gm__ int32_t *gm_out_expert_comm_count_accum; + + int32_t expert_nums; + int32_t local_expert_nums; + int32_t is_moe_averaged; + int32_t is_alltoallvc; +}; +#endif \ No newline at end of file -- Gitee From a22744ecf799351eb22017a0c89c1d4c070be6a6 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 14:37:02 +0800 Subject: [PATCH 408/414] add --- comm/lcal/src/kernels/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/comm/lcal/src/kernels/CMakeLists.txt index a4669b02..598c35b3 100644 --- a/comm/lcal/src/kernels/CMakeLists.txt +++ b/comm/lcal/src/kernels/CMakeLists.txt @@ -8,8 +8,8 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# include(../ascendc.cmake) -set(OP_NAMES matmul_allreduce allgather_matmul_reduce_scatter) - +set(OP_NAMES pure_matmul matmul_allreduce matmul_reduce_scatter allgather_matmul allgather_matmul_reduce_scatter alltoallv_allgather_matmul matmul_reduce_scatter_alltoallv) + file(GLOB KERNEL_FILES *.cpp) set_source_files_properties(${KERNEL_FILES} PROPERTIES LANGUAGE CCE) file(GLOB KERNEL_FILES2 *.cce) @@ -47,6 +47,4 @@ add_custom_target(lcoc_op foreach(OP_NAME IN LISTS OP_NAMES) add_dependencies(${OP_NAME}_o lcoc_${OP_NAME}_aic_obj lcoc_${OP_NAME}_aiv_obj) add_dependencies(lcoc_op ${OP_NAME}_o) -endforeach() - - +endforeach() \ No newline at end of file -- Gitee From 0a2d1f486554b979a6738bba6dad5ec9283abcee Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 14:52:10 +0800 Subject: [PATCH 409/414] add case --- tests/apitest/opstest/csv/linear_parallel.csv | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index 443a1392..ae51e1c4 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -41,6 +41,15 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 40|llama_65bCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 41|llama_65bCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 42|llama_65bCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;nd|28,5,2752;8192,2752|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +43|NoErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +44|NoErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|28,2,1024;8,1024|1|float16|nd|14,2,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +45|IErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,59;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||I:ERROR_INVALID_TENSOR_DIM +46|SErrorDim0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM +47|NoErrorCase0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
+48|NoErrorCase1AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,4,1024;8,1024|1|float16|nd|4,4,8|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +49|IErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,33;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +50|SErrorDim0AllGatherLinear|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1|||||||S:ERROR_INVALID_TENSOR_DIM +51|NoErrorCase0AllGatherLinearV2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"keepIntermediate":true}|2|float16;float16|nd;nd|2,16;32,16|2|float16;float16|nd;nd|4,32;4,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 52|NoErrorCase0MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;1,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR 53|NoErrorCase1MatmulAllReduceDequantWithBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|8,2;4,2;1,4;1,4|1|bf16|nd|8,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 54|NoErrorCase2MatmulAllReduceDequantWithoutBias|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","quantType":1,"quantGroupSize":0}|4|float16;int8;float16;float16|nd;nd;nd;nd|2,2;4,2;0;1,4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;-2,2||||||Ascend910B|NO_ERROR @@ -48,13 +57,28 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 56|NoErrorCase4MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR 57|NoErrorCase5MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|bf16;int8;bf16;bf16;bf16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|bf16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR 58|NoErrorCase6MatmulAllReducePerGroupDequantWithoutBias|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"lcoc","type":0,"quantType":2,"quantGroupSize":2}|5|float16;int8;float16;float16;float16|nd;nd;nd;nd;nd|2,4;4,4;0;2,4;1,4|1|float16|nd|2,4|customize;customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1;1,-1||||||Ascend910B|NO_ERROR 
+59|NoErrorCase0PureMatmul|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +60|NoErrorCase1PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"quantGroupSize":2}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;0;2,4|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR +61|NoErrorCase2PureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|NO_ERROR +62|DimCheckFailPureMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0}|4|bf16;int8;bf16;bf16|nd;nd;nd;nd|2,4;4,4;1,4;1|1|bf16|nd|2,4|customize;customize;customize;customize|-1,1;-10,10;-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +63|PureMatmulW8A8Fp16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +64|PureMatmulW8A8Bf16PerTensor|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +65|PureMatmulW8A8Fp16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|float16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM +68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM 
69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -780|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE +75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 
82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 83|NoErrorCase1AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|2048,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR @@ -62,3 +86,20 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 85|NoErrorCase3AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|512,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 86|IErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":1}}|2|float16;float16|nd;nd|32,16;32,20|1|float16|nd|16,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 87|SErrorDim0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":2,"rsDim":4,"innerDimIsAg":0}}|2|float16;float16|nd;nd|1024,64;32,64|1|float16|nd|1024,32|customize;customize|-1,1;-1,1||||||Ascend910B|S:ERROR_INVALID_TENSOR_DIM +88|AllGatherMatmulInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +89|AllGatherMatmulInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|160,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +90|MatmulReducescatterInt8FP16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":1}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|float16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR +91|MatmulReducescatterInt8BF16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|80,80;80,80;80;80|1|bf16|nd|40,80|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 
+92|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +93|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +94|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +95|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +96|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +97|SErrorAlltoallvcAllGatherGroupMatmul|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":5,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +98|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|float16;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM 
+99|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|6|int8;int8;int32;int64;float;int32|nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32|1|float16|nd|32768,1024|random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +100|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":3,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,48;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +101|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|4|float16;int8;int32;int32|nd;nd;nd;nd|32768,1024;16,1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM +103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM +104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From cfa75e96a0ee1b9b7a6265d8efac352b942a3dc1 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 15:41:23 +0800 Subject: [PATCH 410/414] cleancode --- comm/lcal/src/lcoc.cpp | 3 +++ comm/lcal/src/lcoc_func.cpp | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/comm/lcal/src/lcoc.cpp b/comm/lcal/src/lcoc.cpp index 483333ea..7ac343d6 100644 --- a/comm/lcal/src/lcoc.cpp +++ b/comm/lcal/src/lcoc.cpp @@ -382,6 +382,9 @@ void Lcoc::GetTiling(CoCTiling &tiling) bool IsMatrixAligned(const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign) { + if (nElemAlign == 0) { + return false; + } return (transpose ? 
m : n) % nElemAlign == 0; } diff --git a/comm/lcal/src/lcoc_func.cpp b/comm/lcal/src/lcoc_func.cpp index 187fa934..ed5550fd 100644 --- a/comm/lcal/src/lcoc_func.cpp +++ b/comm/lcal/src/lcoc_func.cpp @@ -47,6 +47,9 @@ namespace Lcal { bool CheckParamAlign(const std::string &name, const int &value, const int &align) { + if (align == 0) { + return false; + } if (value % align != 0) { MKI_LOG(ERROR) << "The " << name << ":" << value << " must be aligned by " << align << "!"; return false; @@ -75,6 +78,9 @@ namespace Lcal { int64_t GetAlignedMatrixSize(const int64_t &batchSize, const int64_t &m, const int64_t &n, const bool &transpose, int nElemAlign) { + if (nElemAlign == 0) { + return 0; + } int64_t nRow = transpose ? n : m; int64_t nCol = transpose ? m : n; int64_t nColAlign = (nCol + nElemAlign - 1) / nElemAlign * nElemAlign; -- Gitee From 70a2149aef83721f404e2f181942ef9a9df38a5f Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 16:13:23 +0800 Subject: [PATCH 411/414] fix --- .../coc_allgather_matmul_reduce_scatter.cce | 3 +- .../kernels/coc_allgather_reducescatter.cce | 125 +++++--- comm/lcal/src/kernels/coc_allreduce.cce | 289 ++++++++++++++---- 3 files changed, 309 insertions(+), 108 deletions(-) diff --git a/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce b/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce index 265667ae..a630e6bd 100644 --- a/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce +++ b/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce @@ -24,13 +24,14 @@ extern "C" __global__ __aicore__ void LcalAllGatherMatmulReduceScatter_##type##_ } #elif __DAV_C220_VEC__ +// Vector (AIV) kernel entry for AllGatherMatmulReduceScatter #define COC_ALL_GATHER_MATMUL_REDUCESCATTER_FUNC_AUTO_DEF(type) \ extern "C" __global__ __aicore__ void LcalAllGatherMatmulReduceScatter_##type##_mix_aiv(COC_ARGS_FUN(type)) { \ return CocAllGatherMatmulReduceScatterAiv(COC_ARGS_CALL()); \ } #endif -#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) +#if defined(__DAV_C220_CUBE__) || defined(__DAV_C220_VEC__) // 910B supports bf16 #define COC_TYPE_FUNC(fun) fun(float16_t);fun(bfloat16_t) COC_TYPE_FUNC(COC_ALL_GATHER_MATMUL_REDUCESCATTER_FUNC_AUTO_DEF); diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce index 3655cc9a..84c1f803 100644 --- a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce +++ b/comm/lcal/src/kernels/coc_allgather_reducescatter.cce @@ -22,12 +22,13 @@ public: if constexpr (HAVE_BIAS) { add_bias_runner.SetArgs(PP_MATMUL_AIV_ADD_BIAS_ARGS_CALL()); } - + m_align = (m + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; k_align = (k + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; n_align = (n + CUBE_MATRIX_SIZE - 1) / CUBE_MATRIX_SIZE * CUBE_MATRIX_SIZE; AlignJudge(trans_a, trans_b, m, k, n, m_align, k_align, n_align, aligned_a, aligned_b); this->gm_a = aligned_a ? reinterpret_cast<__gm__ T *>(workspace_info.gm_a_align) : gm_a; + // determine this rank's ag index and rs index if (inner_dim_is_Ag) { this->rank_ag_idx = rank % ag_dim; this->rank_rs_idx = rank / ag_dim; @@ -40,12 +41,12 @@ public: this->other_rank_ag_idx = other_rank / rs_dim; this->other_rank_rs_idx = other_rank % rs_dim; } - twod_big_dim = ag_dim > rs_dim ? 
ag_dim: rs_dim; + twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim; gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim; - gm_c_pingpong_size = p_value * twod_big_dim *n_loop * m0 * n0; + gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); - m_per_bigdim = m *ag_dim / twod_big_dim; - comm_count = DivCeil(batch_size * m_loop_per_bigdim , p_value); + m_per_bigdim = m * ag_dim / twod_big_dim; + comm_count = DivCeil(batch_size * m_loop_per_bigdim, p_value); ag_part_dim = twod_big_dim / ag_dim; rs_part_dim = twod_big_dim / rs_dim; @@ -69,7 +70,7 @@ public: auto ub0 = output_UB_T[0]; auto ub1 = output_UB_T[1]; int32_t interm_offset = 0; - for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx) { + for (int32_t move_idx = 0; interm_offset < copy_size; ++move_idx){ uint32_t data_size = interm_offset + ag_max_ub_ping_pong_size < copy_size ? ag_max_ub_ping_pong_size : copy_size - interm_offset; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub = (move_idx & 1) ? ub0 : ub1; @@ -83,9 +84,9 @@ public: } } - FORCE_INLINE_AICORE + FORCE_INLINE_AICORE void MoveToOtherRankWithSkip(__gm__ T *gm_src, int32_t rank_offset, int32_t len, - int32_t rank_st, int32_t skip_num, int32_t group_num) + int32_t rank_st, int32_t skip_num, int32_t group_num) { int32_t ping_pong_move_count = (len + ag_max_ub_ping_pong_size - 1) / ag_max_ub_ping_pong_size; for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { @@ -119,18 +120,18 @@ public: } } - FORCE_INLINE_AICORE + FORCE_INLINE_AICORE void MoveWithSplit(__gm__ T *gm_src, int32_t rank_offset, int32_t len) { int32_t data_split = DivCeil(len, ag_len_per_loop); - int32_t data_block = ag_len_per_loop; + int32_t data_block = ag_len_per_loop; // data volume per split int32_t group_num = ag_dim / ag_comm_npu_split; - int32_t data_offset = -data_block; - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); + int32_t data_offset = -data_block; // start offset of the current split + SetFlag(EVENT_ID0); // MTE2 waits for MTE3 + SetFlag(EVENT_ID1); // MTE2 waits for MTE3 for (int32_t data_block_idx = 0; data_block_idx < data_split; ++data_block_idx) { - data_offset += data_block; - data_block = data_block_idx == data_split - 1 ? len - data_offset : data_block; + data_offset += data_block; // start offset of the current split + data_block = data_block_idx == data_split - 1 ? len - data_offset : data_block; // size of the current split int32_t num_per_core = DivCeil(data_block, ag_comm_data_split); int32_t data_src = data_offset + (core_idx / ag_comm_npu_split) * num_per_core; @@ -141,6 +142,7 @@ public: core_idx, ag_comm_npu_split, group_num); continue; } + // along the data-length direction: send all data to target rank 0 first, then to target rank 1, and so on int32_t dst_rank = core_idx % ag_dim; for (int32_t rank_group_idx = 0; rank_group_idx < group_num; ++rank_group_idx) { int32_t real_rank; @@ -155,8 +157,8 @@ public: dst_rank = (dst_rank + ag_comm_npu_split) % ag_dim; } } - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // MTE2 waits for MTE3 } FORCE_INLINE_AICORE int32_t GetRealCoreIdx(int32_t index, int32_t rank_per_core) @@ -177,17 +179,18 @@ public: int32_t real_core_offset = core_index % rs_comm_data_split * rs_len_per_loop; buff_offset = before_core_offset + real_core_offset; + m_in_core = (real_core_offset >= loop_total) ? 0 : - ((real_core_offset + rs_len_per_loop) > loop_total ? 
FORCE_INLINE_AICORE void FirstStepInOutWithSplit(int32_t rank_total, int32_t rank_buff_offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) { SetAtomicAdd(); PipeBarrier(); - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); // MTE2 waits for MTE3 + SetFlag(EVENT_ID1); // MTE2 waits for MTE3 int32_t rank_per_core = rs_dim / rs_comm_npu_split; int32_t m_per_core = rank_total / rs_comm_data_split; @@ -200,7 +203,7 @@ public: for (int32_t rank_idx = 0; rank_idx < rank_per_core; rank_idx++) { int32_t real_rank_idx_tmp = GetRealCoreIdx(rank_idx, rank_per_core); int32_t real_rank_idx; - if (inner_dim_is_Ag) { + if (inner_dim_is_Ag){ real_rank_idx = real_rank_idx_tmp * ag_dim + rank % ag_dim; } else { real_rank_idx = real_rank_idx_tmp + rank / rs_dim * rs_dim; @@ -208,28 +211,30 @@ public: if (real_rank_idx == rank) continue; - - FirstStepInOut(m_in_core, buff[real_rank_idx], rank_buff_offset, offset, comm_idx, flag_idx, out_part_offset); + + FirstStepInOut(m_in_core, buff[real_rank_idx], rank_buff_offset, offset, comm_idx, flag_idx, out_part_offset); } } - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); - SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // MTE2 waits for MTE3 + SetFlag(EVENT_ID0); // Scalar waits for MTE3 WaitFlag(EVENT_ID0); SetAtomicNone(); PipeBarrier(); } + FORCE_INLINE_AICORE void FirstStepInOut(int32_t mat_blocks_size, __gm__ T *input, int32_t gm_offset, int32_t offset, int32_t comm_idx, int32_t flag_idx, int64_t out_part_offset) { - int32_t ping_pong_move_count = DivCeil(mat_blocks_size, rs_max_ub_ping_pong_size); + int32_t ping_pong_move_count = DivCeil(mat_blocks_size, rs_max_ub_ping_pong_size); // max_ub_ping_pong_size is always a multiple of n0, but not necessarily of m0 * n0 for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { int32_t actual_move_size = rs_max_ub_ping_pong_size; if (move_idx == ping_pong_move_count - 1) { - actual_move_size = mat_blocks_size - move_idx * rs_max_ub_ping_pong_size; + actual_move_size = mat_blocks_size - move_idx * rs_max_ub_ping_pong_size; } auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; WaitFlag(event_id); + // The matrix read from other ranks is stored as consecutive small m0 * n0 tiles and must be reordered on write CopyGmToUbuf(ub_buff_st, input + gm_offset + offset + move_idx * rs_max_ub_ping_pong_size, 1, actual_move_size * sizeof(T) / 32, 0, 0); SetFlag(event_id); WaitFlag(event_id); @@ -237,17 +242,21 @@ public: auto ub_buff = ub_buff_st; int32_t left_m = actual_move_size / n0; while (left_m > 0) { + // Compute the m and n indices for the write to this rank int32_t loop_idx = (move_num_offset / (m0 * n0)); int32_t n_idx = loop_idx % n_loop; int64_t m_idx = comm_idx * p_value + loop_idx / n_loop; int32_t actual_m = (m_idx == (m_loop_per_bigdim - 1)) ? (m_per_bigdim - m_idx * m0) : m0; int32_t actual_n = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; - int32_t m_offset = (move_num_offset % (m0 * n0)) / n0; + int32_t m_offset = (move_num_offset % (m0 * n0)) / n0; // m position of the current segment's start within the current tile int32_t actual_move_m; - if (m_offset >= actual_m) { + if (m_offset >= actual_m) { // e.g. m0 = 128 while the last small tile has m = 120 actual_move_m = m0 < m_offset + left_m ? m0 - m_offset : left_m; + // m0 - m_offset is the small leftover tail of the current tile, which is skipped; } else { actual_move_m = actual_m < m_offset + left_m ? actual_m - m_offset : left_m; + // if left_m is large, this tile is copied to completion and the next tile follows on the next pass; + // if left_m is small, only the left_m portion is copied int64_t out_buff_offset = (m_idx * m0 + m_offset) * n + n_idx * n0; CopyUbufToGmUnknown(n % BLOCK_SIZE_16 == 0, gm_out + out_part_offset + out_buff_offset, ub_buff, actual_move_m, actual_n * sizeof(T), (n0 - actual_n) * sizeof(T) / 32, (n - actual_n) * sizeof(T));
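// A minimal sketch (editor's addition, not part of the patch) of the reordering performed
// above: the source is packed as consecutive m0 x n0 tiles with n_loop tiles per row band,
// the destination is a row-major [m, n] matrix. The helper mirrors the index arithmetic in
// FirstStepInOut; it is illustrative only.
static inline int64_t TilePackedToRowMajor(int64_t flat, int32_t m0, int32_t n0,
                                           int32_t n_loop, int64_t n)
{
    int64_t tile_elems = static_cast<int64_t>(m0) * n0;
    int64_t tile = flat / tile_elems;    // which m0 x n0 tile
    int64_t inside = flat % tile_elems;  // offset inside that tile
    int64_t m_idx = tile / n_loop;       // tile row band
    int64_t n_idx = tile % n_loop;       // tile column
    int64_t row = m_idx * m0 + inside / n0;
    int64_t col = n_idx * n0 + inside % n0;
    return row * n + col;                // row-major destination offset
}
// e.g. m0 = n0 = 2, n_loop = 2, n = 4: flat offset 5 lies in tile 1 at inside-offset 1,
// which lands at row 0, col 3, i.e. destination offset 3.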
@@ -260,6 +269,7 @@ } } + FORCE_INLINE_AICORE void EndFlagsAndBias() { ResetIpcFlags(2); @@ -274,15 +284,21 @@ } } + // p_value means different things in RS and AG: in RS, each core communicates once after p_value compute steps; in AG, one compute step runs after gathering p_value rows from each other rank + // In 2D TP, p_value keeps the AG meaning. FORCE_INLINE_AICORE void Run() { + // Padding preprocessor.Run(); + ResetIpcFlags(2); PipeBarrier(); - int32_t twod_big_dim = ag_dim > rs_dim ? ag_dim : rs_dim; + // twod_big_dim: the total number of blocks moved per round in the 2D case, the larger of the AG and RS dims int32_t twod_big_dim = ag_dim > rs_dim ? ag_dim: rs_dim; int64_t gm_a_pingpong_size = m0 * k_align * p_value * twod_big_dim; + // 2 * 4 * 8 * 128*256 int64_t gm_c_pingpong_size = p_value * twod_big_dim * n_loop * m0 * n0; - int64_t gm_c_pingpong_size = p_value * twod_big_dim *n_loop * m0 * n0; int32_t m_loop_per_bigdim = DivCeil(m_loop * ag_dim, twod_big_dim); - int64_t m_per_bigdim = m *ag_dim / twod_big_dim; + int64_t m_per_bigdim = m * ag_dim / twod_big_dim; int32_t comm_count = DivCeil(m_loop_per_bigdim, p_value); int32_t ag_m = p_value * m0; int32_t rs_p_value = p_value; @@ -290,19 +306,31 @@ for (int32_t comm_idx = 0; comm_idx < comm_count + MAX_BLOCK_COUNT; ++comm_idx) { uint64_t flag_idx = comm_idx % MAX_BLOCK_COUNT; int32_t commrs_idx = comm_idx - MAX_BLOCK_COUNT; - if (comm_idx == comm_count - 1){ + if (comm_idx == comm_count - 1) { // last allgather ag_m = m_per_bigdim - (comm_count - 1) * p_value * m0; } - if (commrs_idx == comm_count - 1){ + if (commrs_idx == comm_count - 1) { // last reducescatter rs_p_value = m_loop_per_bigdim - (comm_count - 1) * p_value; } + // wait aic if (commrs_idx >= 0) { WaitEvent(flag_idx); } + + // sync between AIVs SetAndWaitAivSync(flag_idx); + // Under AG+RS, AG communicates with only a subset of cores + // If this rank and the other rank have the same RS rank, they are in the same AG communication domain + // The first two iterations have no RS, only AG; the last two have no AG, only RS + + // First, write this rank's own flag + CrossRankSyncV1(FLAG_ZERO_IDX, comm_idx + 1); SetAndWaitAivSync(flag_idx); + // The AG part if (comm_idx < comm_count && aiv_idx == 0 && core_idx < ag_comm_npu_split * ag_comm_data_split) { + // Check whether the target rank's data is ready + // AG copies ag_part_dim times per rank for (int32_t ag_part_idx = 0; ag_part_idx < ag_part_dim; ag_part_idx++) { int64_t src_offset = comm_idx * p_value * m0 * k_align + ag_part_idx * m_per_bigdim * k_align; int32_t bigdim_idx = rank_ag_idx * ag_part_dim + ag_part_idx; @@ -310,6 +338,7 @@ MoveWithSplit(gm_a + src_offset, rank_offset, ag_m * k_align); } } + // The RS part if (comm_idx >= MAX_BLOCK_COUNT && aiv_idx == 0 && core_idx >= ag_core_count && core_idx < ag_core_count + rs_core_count) { for (int32_t rs_part_idx = 0; rs_part_idx < rs_part_dim; rs_part_idx++) { int32_t bigdim_idx = rank_rs_idx * rs_part_dim + rs_part_idx; @@ -321,22 +350,24 @@ SetAndWaitAivSync(flag_idx); CrossRankSyncV2(FLAG_ONE_IDX, comm_idx + 1); + // sync between AIVs SetAndWaitAivSync(flag_idx); + // signal the AIC sync SetAicSync(flag_idx); } + EndFlagsAndBias(); }
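// A sketch (editor's addition, not part of the patch) of the software pipeline driven by
// the loop above, assuming comm_count = 4 and MAX_BLOCK_COUNT = 2 (both values illustrative):
//
//   comm_idx : 0    1    2    3    4    5
//   AG step  : AG0  AG1  AG2  AG3  -    -
//   RS step  : -    -    RS0  RS1  RS2  RS3
//
// commrs_idx = comm_idx - MAX_BLOCK_COUNT, so the reduce-scatter of chunk i starts
// MAX_BLOCK_COUNT iterations after its all-gather, and the two phases overlap on the
// ping-pong buffer ring selected by flag_idx = comm_idx % MAX_BLOCK_COUNT.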
- public: using CocCommBase::SetAicSync; using CocCommBase::SetAndWaitAivSync; using CocCommBase::SetBuffFlag; using CocCommBase::SetBuffFlagByAdd; using CocCommBase::CheckBuffFlag; - using CocCommBase::ResetIpcFlags; using CocCommBase::CrossRankSyncV1; using CocCommBase::CrossRankSyncV2; + using CocCommBase::ResetIpcFlags; using CocCommBase::buff; using CocCommBase::gm_out; using CocCommBase::ctrl_flags_UB; @@ -356,8 +387,8 @@ public: using CocCommBase::rank; using CocCommBase::rank_size; using CocCommBase::tiling_key; - using CocCommBase::swizzl_count; using CocCommBase::swizzl_direct; + using CocCommBase::swizzl_direct; using CocCommBase::swizzl_count; using CocCommBase::trans_a; using CocCommBase::trans_b; using CocCommBase::is_int8; @@ -383,7 +414,13 @@ public: using CocCommBase::extra_len_per_loop; using CocCommBase::extra_ub_move_num; using CocCommBase::weight_nz; + using CocCommBase::local_expert_nums; + using CocCommBase::is_moe; + using CocCommBase::is_moe_averaged; + using CocCommBase::is_alltoallvc; using CocCommBase::is_deterministic; + using CocCommBase::EP; + using CocCommBase::TP; using CocCommBase::flag_offset; int32_t m_align; int64_t k_align; @@ -409,8 +446,11 @@ public: int32_t rs_max_ub_ping_pong_size; __gm__ T *gm_a; + // This rank's AG and RS rank indices + // The first 8 cores each handle the communication with one rank int32_t rank_ag_idx; int32_t rank_rs_idx; + // AG and RS rank indices of the peer rank this core communicates with int32_t other_rank_ag_idx; int32_t other_rank_rs_idx; Preprocessor preprocessor; @@ -423,11 +463,12 @@ public: int32_t m_per_bigdim; int32_t ag_part_dim; int32_t rs_part_dim; - + }; template inline __aicore__ void CocAllGatherMatmulReduceScatterAiv(COC_ARGS_FUN(T)) { + // write AllGatherReduceScatter allgatherreducescatter_write_without_bias; AllGatherReduceScatter allgatherreducescatter_write_with_bias; @@ -438,6 +479,7 @@ inline __aicore__ void CocAllGatherMatmulReduceScatterAiv(COC_ARGS_FUN(T)) { auto para = reinterpret_cast<__gm__ Lcal::CoCKernelParam *>(para_gm); auto cocTilingData = &para->cocTilingData; int32_t tiling_key = cocTilingData->tilingKey; + // swizzl = 0 transa = 0 transb = 0 splitk = 0 bias = 0 int8 = 0 switch (tiling_key) { case 0b000000 : case 0b100000 : case 0b010000 : case 0b110000 : case 0b001000 : case 0b101000 : case 0b011000 : case 0b111000 : @@ -449,9 +491,10 @@ allgatherreducescatter_write_with_bias.SetArgs(COC_ARGS_CALL()); allgatherreducescatter_write_with_bias.Run(); break; - default: + default : break; } + PipeBarrier(); } diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/comm/lcal/src/kernels/coc_allreduce.cce index c2243edb..c1f95df0 100644 --- a/comm/lcal/src/kernels/coc_allreduce.cce +++ b/comm/lcal/src/kernels/coc_allreduce.cce @@ -43,9 +43,9 @@ public: } if (dequant_granularity == QuantGranularity::PER_TOKEN) { fused_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(buff[rank]), - reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, - m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size); - serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, m0, n0); + reinterpret_cast<__gm__ float32_t *>(gm_quant_scale), m, n, + m0, n0, m_loop, n_loop, core_loop, swizzl_direct, swizzl_count, p_value, rank_size); + serial_pertoken_dequant_runner.SetArgs(reinterpret_cast<__gm__ T *>(gm_out), reinterpret_cast<__gm__ float32_t*>(gm_quant_scale), m, n, m0, n0); } total_core_idx = aiv_idx * core_num + core_idx; cal_count = DivCeil(core_loop, loop_num_per_comm); @@ -66,16 +66,16 @@ SetAtomicAdd(); PipeBarrier(); } - SetFlag(EVENT_ID0); - SetFlag(EVENT_ID1); + SetFlag(EVENT_ID0); // MTE2 waits for MTE3 + SetFlag(EVENT_ID1); // MTE2 waits for MTE3 } FORCE_INLINE_AICORE void EndFlagsAndBias() { - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // 
MTE2 waits for MTE3 if constexpr (HAVE_BIAS) { - SetFlag(EVENT_ID0); + SetFlag(EVENT_ID0); // Scalar waits for MTE3 WaitFlag(EVENT_ID0); SetAtomicNone(); PipeBarrier(); @@ -89,13 +89,14 @@ } FORCE_INLINE_AICORE void EndFirstStep(uint64_t flag_idx) { - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); // Scalar waits for MTE3 + WaitFlag(EVENT_ID0); SetAtomicNone(); - PipeBarrier(); + PipeBarrier(); SetAndWaitAivSync(flag_idx, is_91093 ? BLOCK_COUNT_4 : MAX_BLOCK_COUNT); } + // input is the base address of the per-rank portion of the peer memory FORCE_INLINE_AICORE void SecondStepParallel(int32_t data_size_remain, __gm__ T* input, int32_t gm_out_offset) { if (data_size_remain <= 0) { return; @@ -103,25 +104,25 @@ } InitFlags(); int32_t ping_pong_move_count = DivCeil(data_size_remain, max_ub_ping_pong_size); - for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { - int32_t actual_move_size = (move_idx == ping_pong_move_count -1) ? + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx){ + int32_t actual_move_size = (move_idx == ping_pong_move_count - 1) ? data_size_remain - move_idx * max_ub_ping_pong_size : max_ub_ping_pong_size; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; WaitFlag(event_id); CopyGmToUbuf(ub_buff_st, input + move_idx * max_ub_ping_pong_size, 1, actual_move_size * sizeof(T) / 32, 0, 0); - SetFlag(event_id); - WaitFlag(event_id); + SetFlag(event_id); + WaitFlag(event_id); int32_t move_num_offset = gm_out_offset + move_idx * max_ub_ping_pong_size; CopyUbToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset); - SetFlag(event_id); + SetFlag(event_id); } EndFlagsAndBias(); }
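// A condensed sketch (editor's addition, not part of the patch) of the UB ping-pong idiom
// used by the copy loops above: two UB buffers alternate so the GM-to-UB load (MTE2) of
// chunk i+1 overlaps the UB-to-GM store (MTE3) of chunk i. Pseudocode only; flag template
// arguments are omitted exactly as in the surrounding listing.
//
//   SetFlag(EVENT_ID0); SetFlag(EVENT_ID1);        // both buffers start out "free"
//   for (i = 0; i < chunks; ++i) {
//       ev = (i & 1) ? EVENT_ID0 : EVENT_ID1;      // buffer/event pair for this chunk
//       WaitFlag(ev);                              // MTE2 waits for MTE3: buffer free?
//       CopyGmToUbuf(ub[i & 1], src + i * sz, ...);
//       SetFlag(ev); WaitFlag(ev);                 // MTE3 waits for MTE2: load finished
//       CopyUbufToGm(dst + i * sz, ub[i & 1], ...);
//       SetFlag(ev);                               // mark the buffer free for two chunks later
//   }
//   WaitFlag(EVENT_ID0); WaitFlag(EVENT_ID1);      // drain both buffers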
- FORCE_INLINE_AICORE void SecondStepParallelWithSplit( int32_t data_size_remain, int32_t cal_idx, - int32_t flag_idx, int32_t data_loop_idx) { + FORCE_INLINE_AICORE void SecondStepParallelWithSplit(int32_t data_size_remain, int32_t cal_idx, + int32_t flag_idx, int32_t data_loop_idx) { if (data_size_remain <= 0) { return; } @@ -136,16 +137,16 @@ } int32_t other_rank_buff_offset = flag_idx * gm_c_pingpong_size + other_rank_offset; int32_t ping_pong_move_count = DivCeil(data_size_remain, max_ub_ping_pong_size); - for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx) { - int32_t actual_move_size = (move_idx == ping_pong_move_count -1) ? + for (int32_t move_idx = 0; move_idx < ping_pong_move_count; ++move_idx){ + int32_t actual_move_size = (move_idx == ping_pong_move_count - 1) ? data_size_remain - move_idx * max_ub_ping_pong_size : max_ub_ping_pong_size; auto event_id = (move_idx & 1) ? EVENT_ID0 : EVENT_ID1; auto ub_buff_st = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; WaitFlag(event_id); CopyGmToUbuf(ub_buff_st, buff[real_core_idx] + other_rank_buff_offset + move_idx * max_ub_ping_pong_size, 1, actual_move_size * sizeof(T) / 32, 0, 0); - SetFlag(event_id); - WaitFlag(event_id); + SetFlag(event_id); + WaitFlag(event_id); int64_t move_num_offset = other_rank_offset + move_idx * max_ub_ping_pong_size; CopyUbToGmTransLayout(ub_buff_st, actual_move_size, move_num_offset + cal_idx * gm_c_pingpong_size); SetFlag(event_id); @@ -155,14 +156,16 @@ } FORCE_INLINE_AICORE void FirstStepDivCore(int32_t data_len, int32_t offset) { - if (is_deterministic && rank_size >=4 && rank_size <= 8) { + // Deterministic mode with 4 to 8 ranks uses the tree reduction + if (is_deterministic && rank_size >= 4 && rank_size <= 8) { return FirstStepInPeerMemTree(data_len, offset); } + // Otherwise, use sequential accumulation return FirstStepInPeerMemSeq(data_len, offset); } FORCE_INLINE_AICORE void SecondStepSerial(int32_t data_size_remain, __gm__ T *input, - __gm__ T *output) + __gm__ T *output) { if (data_size_remain <= 0) { return; } @@ -175,48 +178,48 @@ auto ub = (move_idx & 1) ? output_UB_T[0] : output_UB_T[1]; WaitFlag(event_id); CopyGmToUbuf(ub, input + offset, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); - SetFlag(event_id); + SetFlag(event_id); // MTE3 waits for MTE2 WaitFlag(event_id); CopyUbufToGm(output + offset, ub, 1, max_ub_ping_pong_size * sizeof(T) / 32, 0, 0); - SetFlag(event_id); + SetFlag(event_id); // MTE2 waits for MTE3 data_size_remain -= max_ub_ping_pong_size; offset += max_ub_ping_pong_size; } - WaitFlag(EVENT_ID0); - WaitFlag(EVENT_ID1); + WaitFlag(EVENT_ID0); // MTE2 waits for MTE3 + WaitFlag(EVENT_ID1); // MTE2 waits for MTE3 if (data_size_remain >= 0) { CopyGmToUbuf(output_UB_T[0], input + offset, 1, (data_size_remain * sizeof(T) + 31) / 32, 0, 0); - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); // MTE3 waits for MTE2 + WaitFlag(EVENT_ID0); if (ALIGN) { - CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); + CopyUbufToGm(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T) / 32, 0, 0); } else { CopyUbufToGmAlignB16(output + offset, output_UB_T[0], 1, data_size_remain * sizeof(T), 0, 0); } } if constexpr (HAVE_BIAS) { - SetFlag(EVENT_ID0); - WaitFlag(EVENT_ID0); + SetFlag(EVENT_ID0); // Scalar waits for MTE3 + WaitFlag(EVENT_ID0); SetAtomicNone(); - PipeBarrier(); + PipeBarrier(); } } FORCE_INLINE_AICORE void ParallelWithSplit() { ResetIpcFlags(3); - PipeBarrier(); + PipeBarrier(); for (int32_t cal_idx = 0; cal_idx < cal_count; ++cal_idx) { uint64_t flag_idx = cal_idx % MAX_BLOCK_COUNT; - int32_t actual_loop_num = (cal_idx == cal_count -1) ? core_loop - cal_idx * loop_num_per_comm : - loop_num_per_comm; + int32_t actual_loop_num = (cal_idx == cal_count - 1) ? core_loop - cal_idx * loop_num_per_comm : + loop_num_per_comm; int32_t m_total = actual_loop_num * m0; - m_per_rank = DivCeil(m_total, rank_size); + m_per_rank = DivCeil(m_total, rank_size); // p_value * core_num / rank_size * m0 m_in_rank = (rank * m_per_rank >= m_total) ? 0 : - ((rank + 1) * m_per_rank > m_total ? m_total - rank * m_per_rank : m_per_rank); - + ((rank + 1) * m_per_rank > m_total ? 
m_total - rank * m_per_rank : m_per_rank; + // wait aic WaitEvent(flag_idx); if (need_dequant) { @@ -226,10 +229,13 @@ public: if (dequant_granularity == QuantGranularity::PER_TOKEN) { SetAndWaitAivSync(flag_idx); + //fused_pertoken_dequant_runner.Run(cal_idx); fused_pertoken_dequant_runner.RunDequantAllReduce(cal_idx); } + // sync between AIVs SetAndWaitAivSync(flag_idx); + // the matmul result on this rank is ready CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); StartBeforeFisrtStep(flag_idx); @@ -241,6 +247,7 @@ public: int32_t len_per_core = rank_total / comm_data_split; int32_t data_split_num = DivCeil(len_per_core, len_per_loop); + SetFlag(EVENT_ID0); SetFlag(EVENT_ID1); for (int loop_index = 0; loop_index < data_split_num; loop_index++) { @@ -279,18 +286,149 @@ SetAndWaitAivSync(flag_idx); CrossRankSyncV2(FLAG_TWO_IDX, cal_idx + 1); + + // sync between AIVs SetAndWaitAivSync(flag_idx); + + // signal the AIC sync SetAicSync(flag_idx); } ResetIpcFlags(3); if (aiv_idx == 0 && core_idx < rank_size) { CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); - } + } + } + + + FORCE_INLINE_AICORE void DataCopySioRs(int32_t cal_idx_sio, int32_t len_per_rank) { + int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; + int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; + int32_t sio_core_idx = total_core_idx - core_count; + int32_t core_offset = sio_core_idx * len_per_core; + int32_t sio_peer_rank = rank ^ 1; + // SIO: the even pair member moves slices 0 2 4 6, the odd one moves slices 1 3 5 7 + for(int32_t src_rank = rank % 2; src_rank < rank_size; src_rank += 2) { + int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; + FirstStepInPeerMem(len_per_core, buff[sio_peer_rank] + peer_offset, buff[rank] + peer_offset); + } + } + + FORCE_INLINE_AICORE void DataCopySioAg(int32_t cal_idx_sio, int32_t len_per_rank) { + int32_t flag_idx_sio = cal_idx_sio % BLOCK_COUNT_4; + int32_t len_per_core = len_per_rank / SIO_TOTAL_CORE_NUM; + int32_t sio_core_idx = total_core_idx - core_count; + int32_t core_offset = sio_core_idx * len_per_core; + int32_t sio_peer_rank = rank ^ 1; + // rank 1 pulls slices 0 2 4 6 from rank 0, rank 0 pulls slices 1 3 5 7 from rank 1 + for(int32_t src_rank = sio_peer_rank % 2; src_rank < rank_size; src_rank += 2) { + int32_t peer_offset = flag_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank; + int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + src_rank * len_per_rank + core_offset; + SecondStepParallel(len_per_core, buff[sio_peer_rank] + peer_offset + core_offset, dst_offset); + } + // copy this rank's own slice + int32_t local_offset = flag_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; + int32_t dst_offset = cal_idx_sio * gm_c_pingpong_size + rank * len_per_rank + core_offset; + SecondStepParallel(len_per_core, buff[rank] + local_offset, dst_offset); + }
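// A minimal sketch (editor's addition, not part of the patch) of the SIO pairing used by
// DataCopySioRs/DataCopySioAg above, assuming an 8-rank node where rank and rank ^ 1 share
// one SIO link; this helper is illustrative, not a kernel function:
static inline bool PullsSliceOverSio(int rank, int src_rank)
{
    // reduce-scatter step: the even pair member pulls slices 0/2/4/6, the odd one 1/3/5/7
    return (src_rank % 2) == (rank % 2);
}
// ParallelSio() below then runs a three-stage pipeline, sio-rs -> hccs -> sio-ag:
// iteration cal_idx issues the SIO reduce-scatter for chunk cal_idx, HCCS for chunk
// cal_idx - 1 (hccs_idx) and the SIO all-gather for chunk cal_idx - 2 (sio2_idx), which
// is why the loop runs to cal_count + 2.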
+ + FORCE_INLINE_AICORE void ParallelSio() { + ResetIpcFlags(3); + PipeBarrier(); + int32_t last_loop_num = core_loop - (cal_count - 1) * loop_num_per_comm; + int32_t core_group = GetCoreGroup(); + for (int32_t cal_idx = 0; cal_idx < cal_count + 2; ++cal_idx) { + int32_t hccs_idx = cal_idx - 1; // sio-rs -> hccs -> sio-ag + int32_t sio2_idx = cal_idx - 2; // sio-ag + int32_t flag_idx_sio1 = cal_idx % BLOCK_COUNT_4; + int32_t flag_idx_hccs = hccs_idx % BLOCK_COUNT_4; + int32_t flag_idx_sio2 = sio2_idx % BLOCK_COUNT_4; + int32_t loop_num_hccs = hccs_idx == cal_count - 1 ? last_loop_num : loop_num_per_comm; + // wait aic + if (cal_idx < cal_count){ + WaitEvent(flag_idx_sio1); + } + + if (need_dequant) { + fused_dequant_runner.RunDequantAllReduce(cal_idx); + } + + // sync between AIVs + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + + // the matmul result on this rank is ready + CrossRankSyncV1(FLAG_ZERO_IDX, cal_idx + 1); + + StartBeforeFisrtStep(flag_idx_sio1); + + if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { // step 2-1 hccs rs + int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; + int32_t rank_offset = rank * size_per_rank; + int32_t rank_buff_offset = flag_idx_hccs * gm_c_pingpong_size + rank_offset; + int32_t size_per_core = size_per_rank / (comm_data_split); + + int32_t data_split_num = DivCeil(size_per_core, len_per_loop); + + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); + for (int loop_index = 0; loop_index < data_split_num; loop_index++) { + int32_t before_core_offset = len_per_loop * comm_data_split * loop_index; + int32_t loop_total = size_per_rank - before_core_offset; + int32_t real_core_offset = core_idx % comm_data_split * len_per_loop; + + int32_t m_in_core = (real_core_offset >= loop_total) ? 0 : + ((real_core_offset + len_per_loop) > loop_total ? + loop_total - real_core_offset : len_per_loop); + + FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); + } + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); + } + if (core_group == 1 && cal_idx < cal_count) { // step 1 sio reducescatter + int32_t loop_num_sio1 = cal_idx == cal_count - 1 ? last_loop_num : loop_num_per_comm; + int32_t size_per_rank = loop_num_sio1 * m0 * n0 / rank_size; + DataCopySioRs(cal_idx, size_per_rank); + } + + EndFirstStep(flag_idx_sio1); + + CrossRankSyncV1(FLAG_ONE_IDX, cal_idx + 1); + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + if (core_group == 0 && cal_idx >= 1 && cal_idx < cal_count + 1) { // step2-2 hccs ag + int32_t size_per_rank = loop_num_hccs * m0 * n0 / rank_size; + int32_t pipe_offset = flag_idx_hccs * gm_c_pingpong_size + other_rank * size_per_rank; + int32_t dst_offset = hccs_idx * gm_c_pingpong_size + other_rank * size_per_rank; + if ((other_rank % 2) == (rank % 2) && other_rank != rank) { + FirstStepInPeerMemTransLayout(size_per_rank, buff[other_rank] + pipe_offset, buff[rank] + pipe_offset, dst_offset); + } + } + if (core_group == 1 && cal_idx >= 2) { // step3: sio-ag + int32_t loop_num_sio2 = sio2_idx == cal_count - 1 ? 
last_loop_num : loop_num_per_comm; + int32_t size_per_rank = loop_num_sio2 * m0 * n0 / rank_size; + DataCopySioAg(sio2_idx, size_per_rank); + } + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + CrossRankSyncV2(FLAG_TWO_IDX, cal_idx + 1); + + // sync between AIVs + SetAndWaitAivSync(flag_idx_sio1, BLOCK_COUNT_4); + + // signal the AIC sync + if (cal_idx >= 2) + SetAicSync(flag_idx_sio2); + } + ResetIpcFlags(3); + + if (aiv_idx == 0 && core_idx < rank_size) { + CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + FLAG_ZERO_IDX, 0); + } } + FORCE_INLINE_AICORE void Serial() { SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag); + // sync between AIC and AIV WaitEvent(AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID); FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID); @@ -301,10 +439,10 @@ } if (aiv_idx == 1 && core_idx < rank_size) { int32_t data_size = batch_size * m * n; - int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size -1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16; - if (other_rank == rank) { + int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size - 1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16; + if (other_rank == rank){ SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ZERO_IDX, tag); - } else { + }else { CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ZERO_IDX, tag); PipeBarrier(); int32_t rank_buff_offset = rank * data_size_per_rank; @@ -314,9 +452,9 @@ CheckBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[other_rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag * rank_size); PipeBarrier(); int32_t data_size_in_other_rank = data_size_per_rank; - if (other_rank * data_size_in_other_rank >= data_size) { - data_size_in_other_rank = 0; - } else if ((other_rank + 1) * data_size_in_other_rank > data_size) { + if (other_rank * data_size_in_other_rank >= data_size){ + data_size_in_other_rank = 0; + } else if ((other_rank + 1) * data_size_in_other_rank > data_size){ data_size_in_other_rank = data_size - other_rank * data_size_per_rank; } int32_t other_rank_buff_offset = other_rank * data_size_per_rank; @@ -326,6 +464,7 @@ FORCE_INLINE_AICORE void SerialWithSplit() { SetBuffFlag(ctrl_flags_UB, (__gm__ int32_t *)buff[rank] + flag_offset + MAX_FLAG_COUNT + FLAG_ONE_IDX, tag); + // sync between AIC and AIV WaitEvent(AIV_WAIT_AIC_FINISH_MATMUL_FLAG_ID); FFTSCrossCoreSync(0, AIV_FINISH_ALIGN_FLAG_ID); @@ -336,7 +475,7 @@ } int32_t data_size = batch_size * m * n; - int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size -1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16; + int32_t data_size_per_rank = (data_size + BLOCK_SIZE_16 * rank_size - 1) / (BLOCK_SIZE_16 * rank_size) * BLOCK_SIZE_16; int32_t use_core_count = comm_npu_split * comm_data_split; int32_t rank_buff_offset = rank * data_size_per_rank; @@ -352,14 +491,15 @@ SetFlag(EVENT_ID1); for (int loop_index = 0; loop_index < data_split_num; loop_index++) { if (aiv_idx == 0 && core_idx < comm_data_split * comm_npu_split) { - int32_t before_core_offset = len_per_loop * comm_data_split * loop_index; - int32_t loop_total = data_size_per_rank - before_core_offset; - int32_t real_core_offset = core_idx % comm_data_split * 
len_per_loop; - int32_t m_in_core = (real_core_offset >= loop_total) ? 0 : - ((real_core_offset + len_per_loop) > loop_total ? - loop_total - real_core_offset : len_per_loop); - FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); + int32_t m_in_core = (real_core_offset >= loop_total) ? 0 : + ((real_core_offset + len_per_loop) > loop_total ? + loop_total - real_core_offset : len_per_loop); + + FirstStepDivCore(m_in_core, rank_buff_offset + before_core_offset + real_core_offset); } } WaitFlag(EVENT_ID0); @@ -373,10 +513,10 @@ public: if (aiv_idx == 0 && core_idx < rank_size) { PipeBarrier(); int32_t data_size_in_other_rank = data_size_per_rank; - if (other_rank * data_size_in_other_rank >= data_size) { - data_size_in_other_rank = 0; - } else if ((other_rank + 1) * data_size_in_other_rank > data_size) { - data_size_in_other_rank = data_size -other_rank * data_size_per_rank; + if (other_rank * data_size_in_other_rank >= data_size){ + data_size_in_other_rank = 0; + } else if ((other_rank + 1) * data_size_in_other_rank > data_size){ + data_size_in_other_rank = data_size - other_rank * data_size_per_rank; } int32_t other_rank_buff_offset = other_rank * data_size_per_rank; SecondStepSerial(data_size_in_other_rank, buff[other_rank] + other_rank_buff_offset, gm_out + other_rank_buff_offset); @@ -385,8 +525,9 @@ public: FORCE_INLINE_AICORE void Run() { + // Padding preprocessor.Run(); - + if constexpr (HAVE_BIAS) { add_bias_runner.Run(); } @@ -401,6 +542,8 @@ public: ParallelWithSplit(); } + + PipeBarrier(); postprocessor.Run(); PipeBarrier(); @@ -408,6 +551,7 @@ public: if (withSerialMode && dequant_granularity == QuantGranularity::PER_TOKEN) { serial_pertoken_dequant_runner.Run(); } + } public: @@ -420,6 +564,7 @@ public: using CocCommBase::FirstStepInPeerMem; using CocCommBase::FirstStepInPeerMemSeq; using CocCommBase::FirstStepInPeerMemTree; + using CocCommBase::FirstStepInPeerMemTransLayout; using CocCommBase::CopyUbToGmTransLayout; using CocCommBase::ResetIpcFlags; using CocCommBase::CrossRankSyncV1; @@ -459,7 +604,7 @@ public: using CocCommBase::max_ub_ping_pong_size; using CocCommBase::withSerialMode; using CocCommBase::tag; - using CocCommBase::loop_num_per_comm; + using CocCommBase::loop_num_per_comm; // p_value * core_num using CocCommBase::gm_c_pingpong_size; using CocCommBase::dequant_granularity; using CocCommBase::dequant_group_size; @@ -472,6 +617,12 @@ public: using CocCommBase::core_count; using CocCommBase::weight_nz; using CocCommBase::is_deterministic; + using CocCommBase::local_expert_nums; + using CocCommBase::is_moe; + using CocCommBase::is_moe_averaged; + using CocCommBase::is_alltoallvc; + using CocCommBase::EP; + using CocCommBase::TP; using CocCommBase::flag_offset; int32_t cal_count; int32_t m_per_rank; @@ -482,9 +633,10 @@ public: MatmulAllReduceBiasAdder add_bias_runner; SerialDequantRunner serial_dequant_runner; FusedDequantRunner fused_dequant_runner; + //AllReduceFusedPerTokenDequantRunner fused_pertoken_dequant_runner; FusedPerTokenDequantRunner fused_pertoken_dequant_runner; SerialPerTokenDequantRunner serial_pertoken_dequant_runner; - bool need_dequant; + bool need_dequant; }; constexpr int32_t NO_BIAS_MASK1 = 0b000000 | 0b100000 | 0b010000 | 0b110000 | 0b001000 | 0b101000 | 0b011000 | @@ -495,6 +647,7 @@ constexpr int32_t BIAS_MASK1 = 0b000010 | 0b100010 | 0b010010 | 0b110010 | 0b001 template FORCE_INLINE_AICORE void RunAllReduceAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) { + // 16 align AllReduce 
allreduce_align_16_without_bias; AllReduce allreduce_align_16_with_bias; switch (tiling_key) { @@ -507,7 +660,7 @@ FORCE_INLINE_AICORE void RunAllReduceAlign16(int32_t tiling_key, COC_ARGS_FUN(T) break; case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : - case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : allreduce_align_16_with_bias.SetArgs(COC_ARGS_CALL()); allreduce_align_16_with_bias.Run(); @@ -519,6 +672,7 @@ FORCE_INLINE_AICORE void RunAllReduceAlign16(int32_t tiling_key, COC_ARGS_FUN(T) break; } } template FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN(T)) { + // 16 unalign AllReduce allreduce_unalign_16_without_bias; AllReduce allreduce_unalign_16_with_bias; switch (tiling_key) { @@ -531,7 +685,7 @@ FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN( break; case 0b000010 : case 0b100010 : case 0b010010 : case 0b110010 : case 0b001010 : case 0b101010 : case 0b011010 : case 0b111010 : - case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : + case 0b000110 : case 0b100110 : case 0b010110 : case 0b110110 : case 0b001110 : case 0b101110 : case 0b011110 : case 0b111110 : allreduce_unalign_16_with_bias.SetArgs(COC_ARGS_CALL()); allreduce_unalign_16_with_bias.Run(); @@ -541,11 +695,13 @@ FORCE_INLINE_AICORE void RunAllReduceUnAlign16(int32_t tiling_key, COC_ARGS_FUN( } } -template +template inline __aicore__ void CocMatmulAllReduceAiv(COC_ARGS_FUN(T)) { + // 16 align AllReduce allreduce_align_16_without_bias; AllReduce allreduce_align_16_with_bias; + // 16 unalign AllReduce allreduce_unalign_16_without_bias; AllReduce allreduce_unalign_16_with_bias; @@ -562,7 +718,8 @@ inline __aicore__ void CocMatmulAllReduceAiv(COC_ARGS_FUN(T)) int32_t tiling_key = cocTilingData->tilingKey; int32_t rank_size = cocTilingData->rankSize; int32_t withSerialMode = cocTilingData->withSerialMode; - if ((withSerialMode == 0 && n % BLOCK_SIZE_16 == 0) || (withSerialMode && (batch_size * m * n) % (rank_size * BLOCK_SIZE_16) == 0)) { + // swizzl = 0 transa = 0 transb = 0 splitk = 0 bias = 0 int8 = 0 + if ((withSerialMode == 0 && n % BLOCK_SIZE_16 == 0) || (withSerialMode && (batch_size * m * n) % (rank_size * BLOCK_SIZE_16) == 0)){ RunAllReduceAlign16(tiling_key, COC_ARGS_CALL()); } else { RunAllReduceUnAlign16(tiling_key, COC_ARGS_CALL()); -- Gitee
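// Editor's sketch between patches (not part of either patch): how the 6-bit tiling_key
// switched on above appears to decode, inferred from the case masks and the
// "swizzl/transa/transb/splitk/bias/int8" comment; treat the exact bit order as an
// assumption rather than a confirmed layout.
struct TilingKeyFields {
    bool swizzl;
    bool trans_a;
    bool trans_b;
    bool split_k;
    bool bias;
    bool int8;
};
static inline TilingKeyFields DecodeTilingKey(int32_t key)
{
    TilingKeyFields f;
    f.swizzl  = ((key >> 5) & 1) != 0;
    f.trans_a = ((key >> 4) & 1) != 0;
    f.trans_b = ((key >> 3) & 1) != 0;
    f.split_k = ((key >> 2) & 1) != 0;
    f.bias    = ((key >> 1) & 1) != 0;  // all 0bxxxx1x cases take the with-bias path
    f.int8    = (key & 1) != 0;
    return f;
}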
From 310fead954495927008ed3abf99d6e32ae95c188 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 16:29:39 +0800 Subject: [PATCH 412/414] fix --- comm/lcal/src/tiling/tiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comm/lcal/src/tiling/tiling.cpp b/comm/lcal/src/tiling/tiling.cpp index b53bf883..1d74aa6d 100644 --- a/comm/lcal/src/tiling/tiling.cpp +++ b/comm/lcal/src/tiling/tiling.cpp @@ -33,7 +33,7 @@ bool CoCTilingFunc::CheckTiling(const TaskParam &taskParam) void CoCTilingFunc::GetDefaultTiling(const TaskParam &taskParam) { - // no usage scenario for now + (void) taskParam; cocTilingData.ubMoveNum = VALID_UB_MOVE_NUM; cocTilingData.commNpuSplit = cocTilingData.rankSize; cocTilingData.commDataSplit = COMMDATASPLIT_ONE; -- Gitee From b708cd6bdfd6d832d7c431704ba576760c9c6a43 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 17:20:00 +0800 Subject: [PATCH 413/414] fix --- .../linear_parallel_lcoc_runner.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp index 0e8c4965..3d3ebdc8 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -181,10 +181,35 @@ Status LinearParallelLcocRunner::LaunchKernel(Lcal::CoCInputPkg inputPkg, Lcal:: ret = lcoc_->MatmulAllReduce(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; + case infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER: + ret = lcoc_->MatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + GetExecuteStream(runnerVariantPack.context)); + break; + case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR: + if (param_.keepIntermediate) { + ret = lcoc_->AllGatherMatmulV2(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + GetExecuteStream(runnerVariantPack.context)); + break; + } + ret = lcoc_->AllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + GetExecuteStream(runnerVariantPack.context)); + break; case infer::LinearParallelParam::ParallelType::ALL_GATHER_LINEAR_REDUCE_SCATTER: ret = lcoc_->AllGatherMatmulReduceScatter(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, GetExecuteStream(runnerVariantPack.context)); break; + case infer::LinearParallelParam::ParallelType::PURE_LINEAR: + ret = lcoc_->PureMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + GetExecuteStream(runnerVariantPack.context)); + break; + case infer::LinearParallelParam::ParallelType::ALLTOALLVC_ALL_GATHER_GMM: + ret = lcoc_->AllToAllVAllGatherMatmul(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + runnerVariantPack.context->GetExecuteStream()); + break; + case infer::LinearParallelParam::ParallelType::GMM_REDUCE_SCATTER_ALLTOALLVC: + ret = lcoc_->MatmulReduceScatterAllToAllVHidden(inputPkg, outputPkg, runnerVariantPack.workspaceBuffer, + runnerVariantPack.context->GetExecuteStream()); + break; default: ATB_LOG(ERROR) << GetLogPrefix() << "UnSupported type: " << param_.type; return ERROR_INVALID_PARAM; -- Gitee From 896d4b24d63d20d451690ec465c22a441f9b2ebf Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 2 Sep 2025 20:16:30 +0800 Subject: [PATCH 414/414] fix --- comm/lcal/src/lcal_internal.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 695b116b..5fd8b8b5 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -78,7 +78,7 @@ const int* FindNextOpStart(const int opStartMaigc, const int* cclBinEndPtr, cons MKI_LOG(ERROR) << "FindNextOpStart failed! 
cclBinPtr is nullptr"; return nullptr; } - while (cclBinPtr < cclBinEndPtr and *cclBinPtr != opStartMaigc) { + while (cclBinPtr < cclBinEndPtr && *cclBinPtr != opStartMaigc) { cclBinPtr++; } if (*cclBinPtr == opStartMaigc) { @@ -92,7 +92,7 @@ int RegistCCLOp2Kernel(const int* cclBinPtr, const int* nextPtr) vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, HCCL_DATA_TYPE_INT64 }; - std::vector registerCCLTypesOp2 = { + std::vector registerCCLTypesOp2 = { // register additional ops here once their implementations are complete LcalType::ALL_GATHER, LcalType::REDUCE_SCATTER, LcalType::ALL2ALL, }; int res = LCAL_SUCCESS; @@ -115,7 +115,7 @@ int RegistCCLOp1Kernel(const int* cclBinPtr, const int* nextPtr) vector registerTypes = { HCCL_DATA_TYPE_INT32, HCCL_DATA_TYPE_INT16, HCCL_DATA_TYPE_INT8, HCCL_DATA_TYPE_FP32, HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16, HCCL_DATA_TYPE_INT64 }; - std::vector registerCCLTypesOp1 = { + std::vector registerCCLTypesOp1 = { // register additional ops here once their implementations are complete LcalType::ALL_REDUCE, }; int res = LCAL_SUCCESS; @@ -140,7 +140,6 @@ int RegistCCLKernel(const int32_t opGroup) } constexpr int32_t smallGroupNum = 2; - for (int32_t opGroupIdx = 0; opGroupIdx < opGroup; ++opGroupIdx) { for (int32_t opIdx = 0; opIdx < smallGroupNum; ++opIdx) { cclBinPtr = nextPtr; @@ -157,12 +156,14 @@ return LCAL_ERROR_INTERNAL; } + // switch to the second small group within the big group cclBinPtr = nextPtr; nextPtr = FindNextOpStart(opStartMaigc, cclBinEndPtr, nextPtr); if (cclBinPtr == nullptr || cclBinPtr == cclBinEndPtr || nextPtr == nullptr) { return LCAL_ERROR_INTERNAL; } + // the second small group within the big group holds reducescatter, allgather, etc. ret = RegistCCLOp2Kernel(cclBinPtr, nextPtr); if (ret != LCAL_SUCCESS) { return LCAL_ERROR_INTERNAL; @@ -174,8 +175,13 @@ void RegistCoCKernel() { vector registerTypes = { HCCL_DATA_TYPE_FP16, HCCL_DATA_TYPE_BFP16 }; vector> registerCOCTypes = { + { LcalType::PURE_MATMUL}, { LcalType::MATMUL_ALL_REDUCE }, + { LcalType::MATMUL_REDUCE_SCATTER }, + { LcalType::ALL_GATHER_MATMUL, LcalType::ALL_GATHER_MATMUL_V2 }, { LcalType::ALL_GATHER_MATMUL_REDUCE_SCATTER}, + { LcalType::ALLTOALLV_ALLGATHER_MATMUL, LcalType::ALLTOALLVC_ALLGATHER_MATMUL_HIDDEN}, + { LcalType::MATMUL_REDUCESCATTER_ALLTOALLVC_HIDDEN}, }; auto cocCceBinStr = LCAL_CCE_BIN_STR + LCAL_1OP_BIN_SIZE / sizeof(int); -- Gitee
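// Closing sketch (editor's addition, not part of the patch series): the kernel-image
// layout that RegistCCLKernel in PATCH 414 walks, as implied by the scanning code above.
// Each image begins with the opStartMaigc marker word; every big group holds two small
// groups, op1 (ALL_REDUCE) followed by op2 (ALL_GATHER / REDUCE_SCATTER / ALL2ALL), with
// one image per registered data type:
//
//   [magic][op1 kernels ...][magic][op2 kernels ...]   <- big group 0
//   [magic][op1 kernels ...][magic][op2 kernels ...]   <- big group 1
//   ...
//
// FindNextOpStart advances to the next marker, so each Regist*Kernel call receives the
// [cclBinPtr, nextPtr) span of exactly one small group.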