From 4d4ce9dbe6a3ec473d54635dfd97408904a4b15c Mon Sep 17 00:00:00 2001
From: xiecheng
Date: Thu, 19 Jun 2025 10:51:05 +0800
Subject: [PATCH 1/2] add cp & variable_seq_length and add use warning

---
Review note: context parallelism is now allowed together with variable
sequence lengths. The guard must raise when pad_to_multiple_of is NOT a
multiple of (tp * cp), so the comparison is `!= 0`; the submitted `== 0`
rejected exactly the valid configurations. The blob ids on the index line
predate this correction and were not regenerated.

 mindspeed_llm/training/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index 61b88be8d..a141fc9eb 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1025,8 +1025,8 @@ def _validate_recompute_args(args):
 
 
 def _validate_instruction_finetune(args):
     if args.variable_seq_lengths:
-        if args.context_parallel_size > 1:
-            raise AssertionError('Context parallelism is forbidden when use variable seq lengths.')
+        if args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) != 0:
+            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp).')
         if args.num_experts is not None and args.moe_token_dispatcher_type == "allgather":
             raise AssertionError('moe_token_dispatcher_type "allgather" is forbidden when use variable seq lengths. you can choose "alltoall"')
-- 
Gitee


From 38e3d45b39c7137187c4ac09954f5947da50d33b Mon Sep 17 00:00:00 2001
From: xiecheng
Date: Thu, 19 Jun 2025 11:03:13 +0800
Subject: [PATCH 2/2] update warning

---
Review note: the divisibility requirement is only enforced when context
parallelism is actually enabled (cp > 1). The comparison stays `!= 0`, i.e.
the error fires when pad_to_multiple_of is not a multiple of (tp * cp). The
blob ids on the index line predate this correction and were not regenerated.

 mindspeed_llm/training/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index a141fc9eb..5c71b5ed8 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1025,8 +1025,8 @@ def _validate_recompute_args(args):
 
 
 def _validate_instruction_finetune(args):
     if args.variable_seq_lengths:
-        if args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) != 0:
-            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp).')
+        if args.context_parallel_size > 1 and args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) != 0:
+            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp) when use cp.')
         if args.num_experts is not None and args.moe_token_dispatcher_type == "allgather":
             raise AssertionError('moe_token_dispatcher_type "allgather" is forbidden when use variable seq lengths. you can choose "alltoall"')
-- 
Gitee