diff --git a/examples/mcore/gpt4/pretrain_gpt4_moe_drop.sh b/examples/mcore/gpt4/pretrain_gpt4_moe_drop.sh index 328e9e588bb5cefd77cdbd077056faaa283af808..d358ce513f28b4ddbd18ede4198ba26ad07f7ddd 100644 --- a/examples/mcore/gpt4/pretrain_gpt4_moe_drop.sh +++ b/examples/mcore/gpt4/pretrain_gpt4_moe_drop.sh @@ -43,7 +43,7 @@ MOE_ARGS=" --moe-permutation-async-comm \ --disable-bias-linear \ --moe-expert-capacity-factor 1.1 \ - --moe-token-dispatcher-type alltoall_seq \ + --moe-token-dispatcher-type alltoall \ --moe-pad-expert-input-to-capacity \ --moe-layer-freq -1 \ --first-k-dense-replace -1 \ diff --git a/examples/mcore/gpt4/pretrain_gpt4_moe_drop_A3_ptd.sh b/examples/mcore/gpt4/pretrain_gpt4_moe_drop_A3_ptd.sh index f30d954223c1541a62556e863d0daa49780dada8..ab326e0ba8ee0017f3f1d50d8a68b4348cc29497 100644 --- a/examples/mcore/gpt4/pretrain_gpt4_moe_drop_A3_ptd.sh +++ b/examples/mcore/gpt4/pretrain_gpt4_moe_drop_A3_ptd.sh @@ -44,7 +44,7 @@ MOE_ARGS=" --moe-permutation-async-comm \ --disable-bias-linear \ --moe-expert-capacity-factor 1.1 \ - --moe-token-dispatcher-type alltoall_seq \ + --moe-token-dispatcher-type alltoall \ --moe-pad-expert-input-to-capacity \ --moe-layer-freq -1 \ --first-k-dense-replace -1 \