From a6b1e5eff9fcb2bb365067876492881a4fca4c24 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:36:49 +0800 Subject: [PATCH 1/2] fix --- .../src/ascendc_kernels/allreduce_big_data.h | 18 ++++++++++++++++++ comm/lcal/src/ascendc_kernels/lccl_op.h | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index f8ce0276..5623ab45 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -30,6 +30,8 @@ public: DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { BuildScaleOffset(scale, scaleCount, offset); + this->input = input; + this->output = output; } if (blockIdx >= PING_PONG_SIZE * rankSize) { @@ -124,6 +126,22 @@ public: } DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); } + + FORCE_INLINE_AICORE void SupportBigScale() + { + if constexpr(!std::is_same_v) { + constexpr int32_t bigScaleFlagOffset = 2; + if (blockIdx == 0) { + inputGt.SetGlobalBuffer((__gm__ U*)input); + outputGt.SetGlobalBuffer((__gm__ T*)output); + CpGM2GMWithScale(len, inputGt, outputGt, COPYONLY); + sync.SetSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } else { + sync.WaitSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } + } + return; + } private: FORCE_INLINE_AICORE void Producer() { diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/comm/lcal/src/ascendc_kernels/lccl_op.h index bf54ce2b..115c2690 100644 --- a/comm/lcal/src/ascendc_kernels/lccl_op.h +++ b/comm/lcal/src/ascendc_kernels/lccl_op.h @@ -129,6 +129,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; \ constexpr int32_t rankSize910a3 = 16; \ + constexpr int32_t scaleCountMax = 12 * 1024 * 1024; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ @@ -142,8 +143,14 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ } else if (len * sizeof(type) <= quantSmallDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ - } else { \ + } else if (scaleCount * rankSize <= scaleCountMax) { \ CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ + } else { \ + AllReduceBigData opTmp(localRank, localRankSize, extraFlag); \ + opTmp.Init(KERNELS_ARGS_CALL()); \ + opTmp.SupportBigScale(); \ + input = output; \ + CLASS_OP_LAUNCH(AllReduceBigData, half); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ -- Gitee From 3d58d2ee09a831c6ea632d52dfcac34886fa5fef Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:54:52 +0800 Subject: [PATCH 2/2] fix --- comm/lcal/src/ascendc_kernels/allreduce_big_data.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h index 5623ab45..f8d7c9d5 100644 --- a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/comm/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -269,6 +269,8 @@ private: T offset = 0; bool isEnableScale = false; bool isVectorScale = false; + GM_ADDR input = nullptr; + GM_ADDR output = nullptr; }; #endif // LCCL_ALLREDUCE_BIG_DATA_H \ No newline at end of file -- Gitee