From 4d4ce9dbe6a3ec473d54635dfd97408904a4b15c Mon Sep 17 00:00:00 2001
From: xiecheng <xiecheng22@h-partners.com>
Date: Thu, 19 Jun 2025 10:51:05 +0800
Subject: [PATCH 1/2] Allow context parallelism with variable_seq_lengths and validate pad_to_multiple_of

---
 mindspeed_llm/training/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index 61b88be8d..a141fc9eb 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1025,8 +1025,8 @@ def _validate_recompute_args(args):
 
 def _validate_instruction_finetune(args):
     if args.variable_seq_lengths:
-        if args.context_parallel_size > 1:
-            raise AssertionError('Context parallelism is forbidden when use variable seq lengths.')
+        if args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) == 0:
+            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp).')
         if args.num_experts is not None and args.moe_token_dispatcher_type == "allgather":
             raise AssertionError('moe_token_dispatcher_type "allgather" is forbidden when use variable seq lengths. you can choose "alltoall"')
 
-- 
Gitee


From 38e3d45b39c7137187c4ac09954f5947da50d33b Mon Sep 17 00:00:00 2001
From: xiecheng <xiecheng22@h-partners.com>
Date: Thu, 19 Jun 2025 11:03:13 +0800
Subject: [PATCH 2/2] Only validate pad_to_multiple_of against (tp * cp) when cp > 1

---
 mindspeed_llm/training/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index a141fc9eb..5c71b5ed8 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1025,8 +1025,8 @@ def _validate_recompute_args(args):
 
 def _validate_instruction_finetune(args):
     if args.variable_seq_lengths:
-        if args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) == 0:
-            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp).')
+        if args.context_parallel_size > 1 and args.pad_to_multiple_of % (args.tensor_model_parallel_size * args.context_parallel_size) != 0:
+            raise AssertionError('pad_to_multiple_of must be divided by (tp * cp) when use cp.')
         if args.num_experts is not None and args.moe_token_dispatcher_type == "allgather":
             raise AssertionError('moe_token_dispatcher_type "allgather" is forbidden when use variable seq lengths. you can choose "alltoall"')
 
-- 
Gitee