From 1e7bbe34a330ba9b560efb43b7ccc4241abe5675 Mon Sep 17 00:00:00 2001
From: qu_yueze
Date: Tue, 17 Jun 2025 15:09:14 +0800
Subject: [PATCH] fix tp-extend-ep in ckpt

---
 docs/features/cc_lora.md                              | 2 ++
 examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh | 1 -
 examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh  | 1 -
 examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh  | 1 -
 examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh  | 1 -
 examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh   | 1 -
 6 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/docs/features/cc_lora.md b/docs/features/cc_lora.md
index 0aa916107..fe5fc223d 100644
--- a/docs/features/cc_lora.md
+++ b/docs/features/cc_lora.md
@@ -40,6 +40,8 @@ For RC2 and later versions, in the LoRA fine-tuning scenario, the algorithm's compatibility with PP, VPP, the distributed optimizer, and other
 
 Set --lora-fusion to enable CCLoRA acceleration. When TP or SP is enabled, add --use-fused-mlp to enable the [Fused_MLP](./fused_mlp.md) algorithm, which further overlaps computation with communication in the LoRA MLP modules to improve performance.
 
+Note: CCLoRA conflicts with the --overlap-param-gather feature; the two cannot be used together.
+
 ## Results
 
 The following results were verified on Atlas 900 A2 PODc hardware with a 1x8 cluster.
diff --git a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
index b5c0e10a8..4b372739a 100644
--- a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
+++ b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
@@ -87,7 +87,6 @@ GPT_ARGS="
     --bf16 \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather
 "
 TUNE_ARGS="
     --finetune \
diff --git a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
index ce7932171..7afd8cd23 100644
--- a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
@@ -51,7 +51,6 @@ GPT_ARGS="
     --reuse-fp32-param \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --hidden-size 5120 \
     --ffn-hidden-size 13824 \
     --num-attention-heads 40 \
diff --git a/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
index dfe5272d4..3757bee3d 100644
--- a/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
@@ -51,7 +51,6 @@ GPT_ARGS="
     --reuse-fp32-param \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --sequence-parallel \
     --hidden-size 5120 \
     --ffn-hidden-size 27648 \
diff --git a/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
index 14dc1b0a9..8ac1c07ce 100644
--- a/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
@@ -93,7 +93,6 @@ GPT_ARGS="
     --bf16 \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
 "
 
 DATA_ARGS="
diff --git a/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
index e13868873..904098aa3 100644
--- a/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
@@ -47,7 +47,6 @@ GPT_ARGS="
     --use-mcore-models \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --tensor-model-parallel-size ${TP} \
     --pipeline-model-parallel-size ${PP} \
     --num-layers 28 \
--
Gitee
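
As a quick cross-check of the documented constraint, below is a minimal sketch of the flag combination these scripts end up with after the patch: CCLoRA enabled, the distributed optimizer kept, and --overlap-param-gather removed. The fragment is illustrative only, not taken from any one script; the model and parallelism flags are elided, and --use-fused-mlp is included on the assumption that TP or SP is enabled, as the doc requires.

```shell
# Sketch: relevant GPT_ARGS fragment after this patch (model/parallel flags elided).
GPT_ARGS="
    --lora-fusion \
    --use-fused-mlp \
    --sequence-parallel \
    --use-distributed-optimizer \
    --overlap-grad-reduce \
"
# --lora-fusion turns on CCLoRA; --use-fused-mlp adds Fused_MLP and is only
# meaningful when TP or SP is enabled (here via --sequence-parallel).
# --overlap-param-gather is deliberately absent: per docs/features/cc_lora.md
# it conflicts with CCLoRA, which is exactly what this patch removes.
```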