diff --git a/docs/features/cc_lora.md b/docs/features/cc_lora.md
index 0aa9161076d068209ac86c52b6e57163583ff2c3..fe5fc223dfae4d2cac5962001eafde1568f86930 100644
--- a/docs/features/cc_lora.md
+++ b/docs/features/cc_lora.md
@@ -40,6 +40,8 @@ In RC2 and later, for LoRA fine-tuning scenarios, the algorithm is compatible with PP, VPP, the distributed optimizer, and other features.
 Enable CCLoRA acceleration by setting --lora-fusion. When TP or SP is enabled, add --use-fused-mlp to turn on the [Fused_MLP](./fused_mlp.md) algorithm, which further overlaps computation and communication for the LoRA MLP module to improve performance.
 
+Note: CCLoRA conflicts with the --overlap-param-gather feature; the two cannot be used together.
+
 ## Results
 
 The following results were verified on Atlas 900 A2 PODc hardware with a cluster size of 1x8.
 
diff --git a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
index b5c0e10a841a40549116d060d21a57049a28332a..4b372739a18035f22d1cabb0b1715944e3866823 100644
--- a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
+++ b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_pack.sh
@@ -87,7 +87,6 @@ GPT_ARGS="
     --bf16 \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather
 "
 TUNE_ARGS="
     --finetune \
diff --git a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
index ce79321711eafe8191c40defa8136442ef822e7d..7afd8cd2339c940b555d76a542f2ab9e3e76cec2 100644
--- a/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_14b_4k_lora_ptd.sh
@@ -51,7 +51,6 @@ GPT_ARGS="
     --reuse-fp32-param \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --hidden-size 5120 \
     --ffn-hidden-size 13824 \
     --num-attention-heads 40 \
diff --git a/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
index dfe5272d49847156d854205a8de7db08a87096df..3757bee3dde98e15ff949b4660c19d37f00cea57 100644
--- a/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_32b_4k_lora_ptd.sh
@@ -51,7 +51,6 @@ GPT_ARGS="
     --reuse-fp32-param \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --sequence-parallel \
     --hidden-size 5120 \
     --ffn-hidden-size 27648 \
diff --git a/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
index 14dc1b0a94e3c75d305ea149d3cd9229dbdbafdc..8ac1c07ce98bc98820ff0cf9851790823868ea1d 100644
--- a/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_72b_4k_lora_ptd.sh
@@ -93,7 +93,6 @@ GPT_ARGS="
     --bf16 \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
 "
 
 DATA_ARGS="
diff --git a/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh b/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
index e13868873dfb4f869f0a80711f5164df2e0c92bb..904098aa3c9b0a92cb8571453d7fb35e077c45e6 100644
--- a/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
+++ b/examples/mcore/qwen25/tune_qwen25_7b_4k_lora_ptd.sh
@@ -47,7 +47,6 @@ GPT_ARGS="
     --use-mcore-models \
     --use-distributed-optimizer \
     --overlap-grad-reduce \
-    --overlap-param-gather \
     --tensor-model-parallel-size ${TP} \
     --pipeline-model-parallel-size ${PP} \
     --num-layers 28 \
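
For orientation, a minimal sketch of the argument grouping these hunks converge on. This is an illustrative excerpt, not a file from this patch; the flag placement inside GPT_ARGS is an assumption based on the example scripts above.

# Illustrative sketch only (hypothetical excerpt, not part of this patch):
# enabling CCLoRA via --lora-fusion under the new constraint.
GPT_ARGS="
    --lora-fusion \
    --use-distributed-optimizer \
    --overlap-grad-reduce \
"
# --overlap-param-gather is deliberately absent: it conflicts with CCLoRA,
# which is exactly what each hunk above removes from the example scripts.
# When TP or SP is enabled, --use-fused-mlp may additionally be set
# (see docs/features/fused_mlp.md).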