From 9ab2edc3b39e81b08f21fa827d0754f5f4452cbc Mon Sep 17 00:00:00 2001 From: guozhihua Date: Tue, 29 Jul 2025 15:46:33 +0800 Subject: [PATCH] refactor moe in v2 --- .../core/transformer/moe/moe_layer.py | 16 +++ mindspeed_llm/features_manager/__init__.py | 95 ++++++++++--- mindspeed_llm/features_manager/common/data.py | 9 +- .../features_manager/common/embedding.py | 5 +- .../features_manager/common/rotary.py | 15 +- .../features_manager/common/training.py | 16 ++- .../features_manager/communication/coc.py | 12 +- .../features_manager/communication/gloo.py | 17 +-- .../high_availability/high_availability.py | 49 +++---- .../megatron_basic/megatron_basic.py | 72 ++++++---- .../megatron_basic/requirements_basic.py | 16 ++- .../megatron_basic/training_basic.py | 11 ++ .../features_manager/models/mamba.py | 42 +++--- .../moe/moe_allgather_overlap.py | 21 +++ .../moe/moe_alltoallseq_overlap.py | 51 +++++++ .../{common => moe}/moe_router.py | 98 ++++++------- .../features_manager/moe/shared_expert.py | 10 ++ .../features_manager/moe/tp_extend_ep.py | 13 ++ .../optimizer/fused_ema_adamw_feature.py | 13 +- .../tokenizer/build_tokenizer.py | 12 +- .../fusion_attention_feature.py | 2 - .../features_manager/transformer/mtp.py | 63 ++++----- .../multi_latent_attention/mla_feature.py | 51 +++++++ .../transformer/transformer_block.py | 18 +++ mindspeed_llm/training/arguments.py | 3 + .../mixtral_tp1_pp4_ep2_drop_dpp.json | 58 -------- .../mixtral_tp1_pp4_ep2_drop_dpp.sh | 129 ------------------ 27 files changed, 501 insertions(+), 416 deletions(-) create mode 100644 mindspeed_llm/features_manager/moe/moe_allgather_overlap.py create mode 100644 mindspeed_llm/features_manager/moe/moe_alltoallseq_overlap.py rename mindspeed_llm/features_manager/{common => moe}/moe_router.py (47%) create mode 100644 mindspeed_llm/features_manager/moe/shared_expert.py create mode 100644 mindspeed_llm/features_manager/moe/tp_extend_ep.py create mode 100644 mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py create mode 100644 mindspeed_llm/features_manager/transformer/transformer_block.py delete mode 100644 tests/st/baseline_results/mixtral_tp1_pp4_ep2_drop_dpp.json delete mode 100644 tests/st/shell_scripts/mixtral_tp1_pp4_ep2_drop_dpp.sh diff --git a/mindspeed_llm/core/transformer/moe/moe_layer.py b/mindspeed_llm/core/transformer/moe/moe_layer.py index b6be85491f..fd6e4dcf45 100644 --- a/mindspeed_llm/core/transformer/moe/moe_layer.py +++ b/mindspeed_llm/core/transformer/moe/moe_layer.py @@ -20,6 +20,22 @@ from mindspeed.core.transformer.moe.moe_layer_overlap_allgather import MoELayerO from mindspeed_llm.tasks.posttrain.lora.utils import is_enable_lora + +def parallel_transformer_layer_init_wrapper(fn): + @wraps(fn) + def wrapper(self, *args, **kwargs): + fn(self, *args, **kwargs) + from mindspeed.core.transformer.moe.moe_feature.overlap.moe_layer import AllGatherOverlapMoeLayer, AlltoAllSeqOverlapMoeLayer + if self.config.moe_alltoall_overlap_comm or self.config.moe_allgather_overlap_comm: + if isinstance(self.mlp, (AllGatherOverlapMoeLayer, AlltoAllSeqOverlapMoeLayer)): + self.mlp.experts.layer_number = self.layer_number + if self.config.n_shared_experts: + self.mlp.shared_experts.layer_number = self.layer_number + else: + self.mlp.layer_number = self.layer_number + return wrapper + + def moe_layer_init_wrapper(init_func): @wraps(init_func) def moe_layer_init(*args, **kwargs): diff --git a/mindspeed_llm/features_manager/__init__.py b/mindspeed_llm/features_manager/__init__.py index 
487cfc5276..b90ec815c5 100644 --- a/mindspeed_llm/features_manager/__init__.py +++ b/mindspeed_llm/features_manager/__init__.py @@ -6,15 +6,24 @@ from mindspeed.features_manager.tensor_parallel.unaligned_linear_feature import from mindspeed.features_manager.features_manager import MindSpeedFeaturesManager from mindspeed.features_manager.fusions.fused_bias_swiglu import FusedSwigluFeature from mindspeed.features_manager.fusions.fused_softmax import FusedSoftmaxFeature -from mindspeed.features_manager.fusions.fused_rope import FusedRoPEFeature from mindspeed.features_manager.optimizer.virtual_optimizer import VirtualOptimizerFeature from mindspeed.features_manager.optimizer.fused_ema_adamw_feature import FusedEmaAdamwFeature +from mindspeed.features_manager.fusions.grouped_matmul import GroupedMatmulFeature +from mindspeed.features_manager.moe.gmm import MoEGmmFeature +from mindspeed.features_manager.moe.fb_overlap import MoEFwdBwdOverlapFeature +from mindspeed.features_manager.memory.reuse_fp32_param import ReuseFP32Param +from mindspeed.features_manager.pipeline_parallel import NoopLayersFeature +from mindspeed.features_manager.recompute.norm_function import RecomputeNormFeature + from mindspeed_llm.features_manager.common.training import TrainingDefaultFeature from mindspeed_llm.features_manager.common.rotary import RotaryPositionEmbeddingFeature from mindspeed_llm.features_manager.common.embedding import LanguageModelEmbeddingFeature from mindspeed_llm.features_manager.common.data import DataFeature -from mindspeed_llm.features_manager.common.moe_router import MOERouter +from mindspeed_llm.features_manager.moe.moe_router import MoERouter +from mindspeed_llm.features_manager.moe.shared_expert import MoESharedExpertsFeature +from mindspeed_llm.features_manager.moe.moe_alltoallseq_overlap import MoEAlltoAllSeqOverLapFeature +from mindspeed_llm.features_manager.moe.moe_allgather_overlap import MoEAllGatherOverLapFeature from mindspeed_llm.features_manager.models.mamba import MambaModel from mindspeed_llm.features_manager.communication.coc import AscendCocFeature from mindspeed_llm.features_manager.communication.gloo import DisableGlooFeature @@ -24,11 +33,15 @@ from mindspeed_llm.features_manager.megatron_basic.megatron_basic import Megatro from mindspeed_llm.features_manager.megatron_basic.requirements_basic import RequirementsBasicFeature from mindspeed_llm.features_manager.megatron_basic.model_basic import ModelBasicFeature from mindspeed_llm.features_manager.megatron_basic.training_basic import TrainingBasicFeature +from mindspeed_llm.features_manager.transformer.transformer_block import TransformerBlockFeature +from mindspeed_llm.features_manager.transformer.multi_latent_attention.mla_feature import MLAFeature +from mindspeed_llm.features_manager.dataset.dataset import DatasetFeature +from mindspeed_llm.features_manager.moe.tp_extend_ep import MoETpExtendEpFeature from mindspeed_llm.features_manager.tokenizer.build_tokenizer import BuildTokenizerFeature -from mindspeed_llm.features_manager.dataset.dataset import DatasetFeature +from mindspeed_llm.features_manager.transformer.flash_attention.fusion_attention_feature import FusionAttentionFeature from mindspeed_llm.features_manager.finetune.finetune import FinetuneFeature from mindspeed_llm.features_manager.finetune.lora import LoraFeature -from mindspeed_llm.features_manager.transformer.flash_attention.fusion_attention_feature import FusionAttentionFeature + FEATURES_LIST = [ # MindSpeed Legacy Features @@ -42,7 +55,7 @@ FEATURES_LIST = 
[ RotaryPositionEmbeddingFeature(), LanguageModelEmbeddingFeature(), MambaModel(), - MOERouter(), + MoERouter(), AscendCocFeature(), HighAvailabilityFeature(), MultiTokenPredictionFeature(), @@ -62,10 +75,12 @@ def add_llm_features(features_list: List[MindSpeedFeature]): features_list.extend([ ModelBasicFeature(), TrainingBasicFeature(), - RotaryPositionEmbeddingFeature(), DatasetFeature(), FinetuneFeature(), - LoraFeature() + LoraFeature(), + HighAvailabilityFeature(), + MambaModel(), + LanguageModelEmbeddingFeature(), ]) @@ -73,27 +88,70 @@ def add_fusions_features(features_list: List[MindSpeedFeature]): features_list.extend([ FusedSwigluFeature(), FusedSoftmaxFeature(), - FusedRoPEFeature(), + RotaryPositionEmbeddingFeature(), + GroupedMatmulFeature(), ]) -def add_optimizer_features(features_list: List[MindSpeedFeature]): +def add_tensor_parallel_features(features_list: List[MindSpeedFeature]): features_list.extend([ - # Optimizer features: fused-ema-adamw - FusedEmaAdamwFeature(), - VirtualOptimizerFeature(), + AscendCocFeature(), ]) -def add_tokenizer_features(features_list: List[MindSpeedFeature]): +def add_pipeline_parallel_features(features_list: List[MindSpeedFeature]): features_list.extend([ - BuildTokenizerFeature() + NoopLayersFeature(), ]) def add_transformer_features(features_list: List[MindSpeedFeature]): features_list.extend([ FusionAttentionFeature(), + # LLM feature + MLAFeature(), + # LLM feature + MultiTokenPredictionFeature(), + # LLM feature + TransformerBlockFeature(), + ]) + + +def add_tokenizer_features(features_list: List[MindSpeedFeature]): + features_list.extend([ + BuildTokenizerFeature() + ]) + + +def add_reuse_param_features(features_list: List[MindSpeedFeature]): + features_list.extend([ + ReuseFP32Param() + ]) + + +def add_moe_features(features_list: List[MindSpeedFeature]): + features_list.extend([ + MoEGmmFeature(), + # LLM feature + MoERouter(), + MoETpExtendEpFeature(), + MoESharedExpertsFeature(), + MoEAllGatherOverLapFeature(), + MoEAlltoAllSeqOverLapFeature(), + MoEFwdBwdOverlapFeature() + ]) + + +def add_optimizer_features(features_list: List[MindSpeedFeature]): + features_list.extend([ + FusedEmaAdamwFeature(), + VirtualOptimizerFeature(), + ]) + + +def add_recompute_features(features_list: List[MindSpeedFeature]): + features_list.extend([ + RecomputeNormFeature(), ]) @@ -102,9 +160,14 @@ def create_features_list(): add_megatron_basic_features(features_list) add_llm_features(features_list) add_fusions_features(features_list) - add_optimizer_features(features_list) - add_tokenizer_features(features_list) + add_tensor_parallel_features(features_list) + add_pipeline_parallel_features(features_list) add_transformer_features(features_list) + add_tokenizer_features(features_list) + add_reuse_param_features(features_list) + add_moe_features(features_list) + add_optimizer_features(features_list) + add_recompute_features(features_list) return features_list diff --git a/mindspeed_llm/features_manager/common/data.py b/mindspeed_llm/features_manager/common/data.py index b39bf66fc9..f777ac798f 100644 --- a/mindspeed_llm/features_manager/common/data.py +++ b/mindspeed_llm/features_manager/common/data.py @@ -1,8 +1,6 @@ from mindspeed.features_manager.feature import MindSpeedFeature - - class DataFeature(MindSpeedFeature): def __init__(self): super(DataFeature, self).__init__(feature_name="data", optimization_level=0) @@ -10,7 +8,7 @@ class DataFeature(MindSpeedFeature): def register_args(self, parser): group = parser.add_argument_group(title=self.feature_name) 
group.add_argument('--enable-share-memory', action='store_true', default=False, - help='Enable shared memory for passing actual_seq_len when reset-position-ids is enabled.') + help='Enable shared memory for passing actual_seq_len when reset-position-ids is enabled.') def validate_args(self, args): @@ -22,6 +20,5 @@ class DataFeature(MindSpeedFeature): def register_patches(self, patch_manager, args): from ...training.utils import get_batch_on_this_tp_rank if not args.reset_attention_mask: - patch_manager.register_patch( - 'megatron.training.utils.get_batch_on_this_tp_rank', get_batch_on_this_tp_rank) - + patch_manager.register_patch('megatron.training.utils.get_batch_on_this_tp_rank', + get_batch_on_this_tp_rank) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/common/embedding.py b/mindspeed_llm/features_manager/common/embedding.py index 179c22201a..fbd90fb0b6 100644 --- a/mindspeed_llm/features_manager/common/embedding.py +++ b/mindspeed_llm/features_manager/common/embedding.py @@ -8,6 +8,5 @@ class LanguageModelEmbeddingFeature(MindSpeedFeature): def register_patches(self, patch_manager, args): from mindspeed.core.models.common.embeddings.language_model_embedding import language_model_embedding_forward_wrapper - patch_manager.register_patch( - 'megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.forward', - language_model_embedding_forward_wrapper) \ No newline at end of file + patch_manager.register_patch('megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.forward', + language_model_embedding_forward_wrapper) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/common/rotary.py b/mindspeed_llm/features_manager/common/rotary.py index 43c78d49de..b42ba35502 100644 --- a/mindspeed_llm/features_manager/common/rotary.py +++ b/mindspeed_llm/features_manager/common/rotary.py @@ -18,8 +18,20 @@ class RotaryPositionEmbeddingFeature(MindSpeedFeature): group.add_argument('--original-max-position-embeddings', type=float, help='Base context length used during pretraining ' '(critical for scaling calculations, e.g., 8192 for LLaMA3)') + # Arguments used for yarn + group.add_argument('--beta-fast', type=int, default=32, + help='Yarn rope: rope beta fast') + group.add_argument('--beta-slow', type=int, default=1, + help='Yarn rope: rope beta slow') + group.add_argument('--rope-scaling-mscale', type=float, default=1.0, + help='Yarn rope: rope mscale') + group.add_argument('--rope-scaling-mscale-all-dim', type=float, default=0.0, + help='Yarn rope: rope mscale all dim') + group.add_argument('--rope-scaling-original-max-position-embeddings', type=int, default=None, + help='Yarn rope: rope original max position embeddings') # Arguments used for long RoPE - group.add_argument('--longrope-freqs-type', type=str, default="mul", choices=["mul", "outer"], + group.add_argument('--longrope-freqs-type', type=str, default="mul", + choices=["mul", "outer"], help='Frequency adjustment strategy for LongRoPE: ' '"mul" - Frequency multiplication, ' '"outer" - Frequency outer product') @@ -41,6 +53,7 @@ class RotaryPositionEmbeddingFeature(MindSpeedFeature): # Only used for InternLM3 group.add_argument('--dynamic-factor', type=float, default=1.0, help='Dynamic scaling factor for adaptive rotary position embeddings') + # Only used for glm group.add_argument('--use-glm-rope', action='store_true', help='use custom partial rope in glm model.') diff --git a/mindspeed_llm/features_manager/common/training.py 
b/mindspeed_llm/features_manager/common/training.py index d993bc3095..23236ee87d 100644 --- a/mindspeed_llm/features_manager/common/training.py +++ b/mindspeed_llm/features_manager/common/training.py @@ -13,12 +13,16 @@ class TrainingDefaultFeature(MindSpeedFeature): if is_enable_qlora(args): from mindspeed_llm.tasks.posttrain.lora.qlora import get_model - patch_manager.register_patch('megatron.training.training.get_model', get_model) + patch_manager.register_patch('megatron.training.training.get_model', + get_model) else: from mindspeed_llm.training.training import get_model_wrapper - patch_manager.register_patch('megatron.training.training.get_model', get_model_wrapper) + patch_manager.register_patch('megatron.training.training.get_model', + get_model_wrapper) - patch_manager.register_patch('megatron.training.training.build_pretraining_data_loader', - build_pretraining_data_loader) - patch_manager.register_patch('megatron.training.training.train', train) - patch_manager.register_patch('megatron.training.training.load_checkpoint', load_checkpoint_wrapper) \ No newline at end of file + patch_manager.register_patch('megatron.training.training.build_pretraining_data_loader', + build_pretraining_data_loader) + patch_manager.register_patch('megatron.training.training.train', + train) + patch_manager.register_patch('megatron.training.training.load_checkpoint', + load_checkpoint_wrapper) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/communication/coc.py b/mindspeed_llm/features_manager/communication/coc.py index ebbd19dc47..420be50b7b 100644 --- a/mindspeed_llm/features_manager/communication/coc.py +++ b/mindspeed_llm/features_manager/communication/coc.py @@ -10,16 +10,16 @@ class AscendCocFeature(MindSpeedFeature): def register_args(self, parser: ArgumentParser): group = parser.add_argument_group(title=self.feature_name) group.add_argument("--use-ascend-coc", action='store_true', - help="Use ascend coc") + help="Use ascend coc") group.add_argument('--coc-mode', type=int, default=-1, - help='coc-mode: 0=original, 1=rewrite, 2=coc default') + help='coc-mode: 0=original, 1=rewrite, 2=coc default') group.add_argument('--coc-parallel-num', type=int, default=1, - help='coc parallel num') + help='coc parallel num') group.add_argument('--coc-fused-kernel', action='store_true', - help='use coc fused kernel') + help='use coc fused kernel') def register_patches(self, patch_managesr, args): if args.use_ascend_coc: from mindspeed.initialize import coc_registration_wrapper - patch_managesr.register_patch('megatron.training.initialize.initialize_megatron', - coc_registration_wrapper) + patch_managesr.register_patch('megatron.training.initialize.initialize_megatron', + coc_registration_wrapper) diff --git a/mindspeed_llm/features_manager/communication/gloo.py b/mindspeed_llm/features_manager/communication/gloo.py index 5e87692d6b..1c96940d75 100644 --- a/mindspeed_llm/features_manager/communication/gloo.py +++ b/mindspeed_llm/features_manager/communication/gloo.py @@ -10,9 +10,9 @@ class DisableGlooFeature(MindSpeedFeature): group = parser.add_argument_group(title=self.feature_name) group.add_argument('--disable-gloo-group', action='store_true', - help='Replace the communication method of the DP group in the distributed optimizer from gloo to hccl.') + help='Replace the communication method of the DP group in the distributed optimizer from gloo to hccl.') group.add_argument('--hccl-slice-size', type=int, default=10 * 1024 * 1024, - help='data slice size on each dp rank in distributed 
optimizer') + help='data slice size on each dp rank in distributed optimizer') def register_patches(self, patch_manager, args): if args.disable_gloo_group: @@ -24,14 +24,15 @@ class DisableGlooFeature(MindSpeedFeature): from mindspeed.utils import check_param_hashes_across_dp_replicas_hccl patch_manager.register_patch('megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.get_parameter_state_dp_zero', - get_parameter_state_dp_zero_hccl) + get_parameter_state_dp_zero_hccl) patch_manager.register_patch('megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.load_parameter_state_from_dp_zero', - load_parameter_state_from_dp_zero_hccl) + load_parameter_state_from_dp_zero_hccl) patch_manager.register_patch('megatron.core.utils.check_param_hashes_across_dp_replicas', - check_param_hashes_across_dp_replicas_hccl) + check_param_hashes_across_dp_replicas_hccl) patch_manager.register_patch('megatron.core.parallel_state.get_data_parallel_group_gloo', - get_data_parallel_group_gloo_replace) + get_data_parallel_group_gloo_replace) patch_manager.register_patch('megatron.core.parallel_state.get_data_modulo_expert_parallel_group_gloo', - get_data_modulo_expert_parallel_group_gloo_replace) - patch_manager.register_patch('torch.distributed.new_group', new_group_wrapper) \ No newline at end of file + get_data_modulo_expert_parallel_group_gloo_replace) + patch_manager.register_patch('torch.distributed.new_group', + new_group_wrapper) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/high_availability/high_availability.py b/mindspeed_llm/features_manager/high_availability/high_availability.py index 7871351a32..25da2b1881 100644 --- a/mindspeed_llm/features_manager/high_availability/high_availability.py +++ b/mindspeed_llm/features_manager/high_availability/high_availability.py @@ -10,13 +10,13 @@ class HighAvailabilityFeature(MindSpeedFeature): def register_args(self, parser: ArgumentParser): group = parser.add_argument_group(title=self.feature_name) group.add_argument('--enable-high-availability', action='store_true', - help='switch of the high availability feature') + help='switch of the high availability feature') group.add_argument('--enable-hbmfault-repair', action='store_true', - help='high availability feature, enable hbmfault repair') + help='high availability feature, enable hbmfault repair') group.add_argument('--enable-worker-reboot', action='store_true', - help='high availability feature, enable worker reboot') + help='high availability feature, enable worker reboot') group.add_argument('--distributed-optimizer-no-replica', action='store_true', - help='high availability feature, repair from ckpt and disable replica optimizer') + help='high availability feature, repair from ckpt and disable replica optimizer') def validate_args(self, args): if args.enable_high_availability: @@ -45,45 +45,45 @@ class HighAvailabilityFeature(MindSpeedFeature): if args.enable_high_availability: patch_manager.register_patch('megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.__init__', - distributed_data_parallel_init_wrapper) + distributed_data_parallel_init_wrapper) patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', - start_grad_sync_wrapper) + start_grad_sync_wrapper) patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.__init__', - param_and_grad_bucket_group_init_wrapper) + param_and_grad_bucket_group_init_wrapper) 
patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_param_sync', - start_param_sync_wrapper) + start_param_sync_wrapper) patch_manager.register_patch('megatron.training.training.get_megatron_optimizer', - get_megatron_optimizer_wrapper) + get_megatron_optimizer_wrapper) patch_manager.register_patch('megatron.training.initialize._initialize_distributed', - initialize_distributed_wrapper) + initialize_distributed_wrapper) patch_manager.register_patch('megatron.core.optimizer.clip_grads.get_grad_norm_fp32', - get_grad_norm_fp32_wrapper) + get_grad_norm_fp32_wrapper) patch_manager.register_patch('megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.__init__', - distributed_optimizer_init_wrapper) + distributed_optimizer_init_wrapper) patch_manager.register_patch('megatron.training.training.setup_model_and_optimizer', - setup_model_and_optimizer_wrapper) + setup_model_and_optimizer_wrapper) patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.get_forward_backward_func', - high_availability_get_forward_backward_func_wrapper) + high_availability_get_forward_backward_func_wrapper) if args.reuse_fp32_param: from mindspeed.optimizer.optimizer import mixed_precision_optimizer_step, reuse_fp32_param_init_wrapper, \ optimizer_config_init_wrapper patch_manager.register_patch('megatron.core.optimizer.optimizer.MixedPrecisionOptimizer.step', - mixed_precision_optimizer_step) + mixed_precision_optimizer_step) patch_manager.register_patch('megatron.core.optimizer.optimizer.Float16OptimizerWithFloat16Params.__init__', - reuse_fp32_param_init_wrapper) + reuse_fp32_param_init_wrapper) patch_manager.register_patch('megatron.core.optimizer.optimizer_config.OptimizerConfig.__init__', - optimizer_config_init_wrapper) + optimizer_config_init_wrapper) patch_manager.register_patch('megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.__init__', - distributed_optimizer_init_for_reuse_fp32_wrapper) - patch_manager.register_patch( - 'mindio_ttp.adaptor.TTPReplicaOptimizer.get_parameter_state_dp_zero_for_ttp', - get_parameter_state_dp_zero_with_high_availability_wrapper) + distributed_optimizer_init_for_reuse_fp32_wrapper) + patch_manager.register_patch('mindio_ttp.adaptor.TTPReplicaOptimizer.get_parameter_state_dp_zero_for_ttp', + get_parameter_state_dp_zero_with_high_availability_wrapper) if args.enable_worker_reboot: from .initialize_patch import build_train_valid_test_data_iterators_wrapper from mindspeed_llm.features_manager.high_availability.communication_patch import new_group_wrapper patch_manager.register_patch('megatron.training.training.build_train_valid_test_data_iterators', - build_train_valid_test_data_iterators_wrapper) - patch_manager.register_patch('torch.distributed.distributed_c10d.new_group', new_group_wrapper) + build_train_valid_test_data_iterators_wrapper) + patch_manager.register_patch('torch.distributed.distributed_c10d.new_group', + new_group_wrapper) class HighAvailabilityCommFeature(MindSpeedFeature): @@ -93,4 +93,5 @@ class HighAvailabilityCommFeature(MindSpeedFeature): def pre_patch(self, patch_manager, args): from mindspeed_llm.features_manager.high_availability.communication_patch import communication_wrapper for communication in ['barrier', 'all_reduce', '_all_gather_base', 'broadcast', 'all_gather_into_tensor']: - patch_manager.register_patch('torch.distributed.distributed_c10d.' 
+ communication, communication_wrapper) \ No newline at end of file + patch_manager.register_patch('torch.distributed.distributed_c10d.' + communication, + communication_wrapper) diff --git a/mindspeed_llm/features_manager/megatron_basic/megatron_basic.py b/mindspeed_llm/features_manager/megatron_basic/megatron_basic.py index 69f37cab9b..105d680444 100644 --- a/mindspeed_llm/features_manager/megatron_basic/megatron_basic.py +++ b/mindspeed_llm/features_manager/megatron_basic/megatron_basic.py @@ -4,7 +4,7 @@ from mindspeed.features_manager.megatron_basic.megatron_basic import MegatronBas class MegatronBasicFeature(MindSpeedMegatronBasicFeature): - + def register_patches(self, patch_manager, args): super().register_patches(patch_manager, args) @@ -29,54 +29,76 @@ class MegatronBasicFeature(MindSpeedMegatronBasicFeature): from mindspeed_llm.training.utils import print_args_wrapper from mindspeed_llm.training.arguments import validate_args_v2_decorator, parse_args_decorator from mindspeed_llm.core.transformer.transformer_config import transformer_config_post_init_wrapper - pm.register_patch('megatron.training.arguments.parse_args', parse_args_decorator) - pm.register_patch('megatron.training.arguments.validate_args', validate_args_v2_decorator) - pm.register_patch('megatron.training.arguments._print_args', print_args_wrapper) - pm.register_patch('megatron.training.yaml_arguments.validate_yaml', validate_args_v2_decorator) - pm.register_patch('megatron.training.yaml_arguments._print_args', print_args_wrapper) - pm.register_patch("megatron.core.transformer.transformer_config.TransformerConfig.__post_init__", transformer_config_post_init_wrapper) + pm.register_patch('megatron.training.arguments.parse_args', + parse_args_decorator) + pm.register_patch('megatron.training.arguments.validate_args', + validate_args_v2_decorator) + pm.register_patch('megatron.training.arguments._print_args', + print_args_wrapper) + pm.register_patch('megatron.training.yaml_arguments.validate_yaml', + validate_args_v2_decorator) + pm.register_patch('megatron.training.yaml_arguments._print_args', + print_args_wrapper) + pm.register_patch("megatron.core.transformer.transformer_config.TransformerConfig.__post_init__", + transformer_config_post_init_wrapper) # initialization patches from mindspeed.core.megatron_basic.megatron_basic import _set_cuda_rng_state, _compile_dependencies, get_device_wrapper - pm.register_patch('megatron.core.tensor_parallel.random._set_cuda_rng_state', _set_cuda_rng_state) - pm.register_patch('megatron.training.initialize._compile_dependencies', _compile_dependencies) - pm.register_patch('megatron.training.dist_signal_handler.get_device', get_device_wrapper) + pm.register_patch('megatron.core.tensor_parallel.random._set_cuda_rng_state', + _set_cuda_rng_state) + pm.register_patch('megatron.training.initialize._compile_dependencies', + _compile_dependencies) + pm.register_patch('megatron.training.dist_signal_handler.get_device', + get_device_wrapper) # norm patches from mindspeed.core.megatron_basic.megatron_basic import PTNorm - pm.register_patch('megatron.core.models.gpt.gpt_layer_specs.LNImpl', PTNorm) - pm.register_patch('megatron.core.transformer.torch_norm.WrappedTorchNorm', PTNorm) - pm.register_patch('megatron.core.transformer.transformer_block.LayerNormImpl', PTNorm) - pm.register_patch('megatron.core.extensions.transformer_engine.TENorm', PTNorm) + pm.register_patch('megatron.core.models.gpt.gpt_layer_specs.LNImpl', + PTNorm) + 
pm.register_patch('megatron.core.transformer.torch_norm.WrappedTorchNorm', + PTNorm) + pm.register_patch('megatron.core.transformer.transformer_block.LayerNormImpl', + PTNorm) + pm.register_patch('megatron.core.extensions.transformer_engine.TENorm', + PTNorm) # coalescing_manager patches from mindspeed.core.distributed.param_and_grad_buffer import start_param_sync, finish_param_sync, start_grad_sync, finish_grad_sync - pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_param_sync', start_param_sync) - pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_param_sync', finish_param_sync) - pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', start_grad_sync) - pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', finish_grad_sync) + pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_param_sync', + start_param_sync) + pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_param_sync', + finish_param_sync) + pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.start_grad_sync', + start_grad_sync) + pm.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup.finish_grad_sync', + finish_grad_sync) # Without TE, use ColumnParallelLinear and RowParallelLinear replace TEColumnParallelLinear for Megatron share expert. if hasattr(args, 'transformer_impl') and args.transformer_impl == 'local': from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear pm.register_patch('megatron.core.extensions.transformer_engine.TEColumnParallelLinear', - ColumnParallelLinear, create_dummy=True) - pm.register_patch('megatron.core.extensions.transformer_engine.TERowParallelLinear', RowParallelLinear, - create_dummy=True) + ColumnParallelLinear, create_dummy=True) + pm.register_patch('megatron.core.extensions.transformer_engine.TERowParallelLinear', + RowParallelLinear, + create_dummy=True) # Currently, it is not supported to Cast shard fp32 main params to fp8 model params from mindspeed.core.fp8_utils import quantize_param_shard - pm.register_patch('megatron.core.fp8_utils.quantize_param_shard', quantize_param_shard) + pm.register_patch('megatron.core.fp8_utils.quantize_param_shard', + quantize_param_shard) # fix get_megatron_optimizer for core_r0.12.0 from mindspeed.core.megatron_basic.get_megatron_optimizer import get_megatron_optimizer - pm.register_patch('megatron.core.optimizer.get_megatron_optimizer', get_megatron_optimizer) + pm.register_patch('megatron.core.optimizer.get_megatron_optimizer', + get_megatron_optimizer) from mindspeed.training import get_device_arch_version - pm.register_patch('megatron.training.utils.get_device_arch_version', get_device_arch_version) + pm.register_patch('megatron.training.utils.get_device_arch_version', + get_device_arch_version) # fix count_zeros in ChainedOptimizer for core_r0.12.1. 
from mindspeed.core.megatron_basic.count_zero_fix import step - pm.register_patch('megatron.core.optimizer.optimizer.ChainedOptimizer.step', step) + pm.register_patch('megatron.core.optimizer.optimizer.ChainedOptimizer.step', + step) from mindspeed_llm.core.transformer.transformer_block import get_layer_offset_wrapper from mindspeed_llm.core import TransformerLayer diff --git a/mindspeed_llm/features_manager/megatron_basic/requirements_basic.py b/mindspeed_llm/features_manager/megatron_basic/requirements_basic.py index 7e84fd590c..99733b17bc 100644 --- a/mindspeed_llm/features_manager/megatron_basic/requirements_basic.py +++ b/mindspeed_llm/features_manager/megatron_basic/requirements_basic.py @@ -12,9 +12,9 @@ class RequirementsBasicFeature(MindspeedRequirementsBasicFeature): super().register_args(parser) group = parser.add_argument_group(title=self.feature_name) group.add_argument('--o2-optimizer', action='store_true', - help='use bf16 exponential moving average to greatly save up memory.') + help='use bf16 exponential moving average to greatly save up memory.') group.add_argument('--o2-gradient', action='store_true', - help='use bf16 gradient accumulation to greatly save up memory.') + help='use bf16 gradient accumulation to greatly save up memory.') def register_patches(self, patch_manager, args): super().register_patches(patch_manager, args) @@ -24,12 +24,16 @@ class RequirementsBasicFeature(MindspeedRequirementsBasicFeature): if args.o2_optimizer: # O2 optimizer from mindspeed_llm.tasks.models.common.adamw import O2AdamW - pm.register_patch('apex.optimizers.FusedAdam', O2AdamW, create_dummy=True) + pm.register_patch('apex.optimizers.FusedAdam', + O2AdamW, create_dummy=True) else: if args.optimizer_selection == 'fused_torch_adamw': - pm.register_patch('apex.optimizers.FusedAdam', FusedTorchAdamW, create_dummy=True) + pm.register_patch('apex.optimizers.FusedAdam', + FusedTorchAdamW, create_dummy=True) elif args.optimizer_selection == 'fused_adamw': - pm.register_patch('apex.optimizers.FusedAdam', AdamW, create_dummy=True) - pm.register_patch('apex.optimizers.FusedSGD', torch.optim.SGD, create_dummy=True) + pm.register_patch('apex.optimizers.FusedAdam', + AdamW, create_dummy=True) + pm.register_patch('apex.optimizers.FusedSGD', + torch.optim.SGD, create_dummy=True) diff --git a/mindspeed_llm/features_manager/megatron_basic/training_basic.py b/mindspeed_llm/features_manager/megatron_basic/training_basic.py index a85075f693..a766698266 100644 --- a/mindspeed_llm/features_manager/megatron_basic/training_basic.py +++ b/mindspeed_llm/features_manager/megatron_basic/training_basic.py @@ -39,6 +39,8 @@ class TrainingBasicFeature(MindSpeedFeature): help='Setting jit compile mode to True') group.add_argument('--attention-mask-type', type=str, default='causal', choices=['causal', 'general'], help='context parallel attention mask type') + group.add_argument('--use-ascend-mc2', action='store_true', default=False, + help='use ascend mc2') def register_patches(self, patch_manager, args): from mindspeed_llm.training.training import train @@ -62,3 +64,12 @@ class TrainingBasicFeature(MindSpeedFeature): train) patch_manager.register_patch('megatron.training.training.load_checkpoint', load_checkpoint_wrapper) + + + patch_manager.register_patch('megatron.training.training.get_model', + get_model_wrapper) + + patch_manager.register_patch('megatron.training.training.train', + train) + patch_manager.register_patch('megatron.training.training.load_checkpoint', + load_checkpoint_wrapper) diff --git 
a/mindspeed_llm/features_manager/models/mamba.py b/mindspeed_llm/features_manager/models/mamba.py index ccc61a4b17..0835148438 100644 --- a/mindspeed_llm/features_manager/models/mamba.py +++ b/mindspeed_llm/features_manager/models/mamba.py @@ -9,31 +9,29 @@ class MambaModel(MindSpeedFeature): def register_args(self, parser: ArgumentParser): group = parser.add_argument_group(title=self.feature_name) - group.add_argument('--mamba-d-ssm', type=int, default=None, help='If not None, only apply SSM on this many dimensions, the rest uses gated MLP') - group.add_argument('--mamba-chunk-size', type=int, default=256, help='Split the chunk size of tensor in mamba') - group.add_argument('--mamba-d-conv', type=int, default=4, help='conv channel dim for mamba') - group.add_argument('--mamba-expand', type=int, default=1, help='expand scale for mamba') + group.add_argument('--mamba-d-ssm', type=int, default=None, + help='If not None, only apply SSM on this many dimensions, the rest uses gated MLP') + group.add_argument('--mamba-chunk-size', type=int, default=256, + help='Split the chunk size of tensor in mamba') + group.add_argument('--mamba-d-conv', type=int, default=4, + help='conv channel dim for mamba') + group.add_argument('--mamba-expand', type=int, default=1, + help='expand scale for mamba') def register_patches(self, patch_manager, args): from mindspeed_llm.core.ssm.mamba_mixer import mamba_mixer_init_wrapper, mamba_mixer_forward, Mamba2RMSNorm from mindspeed_llm.core.ssm.mamba_block import mamba_block_forward - patch_manager.register_patch( - 'mamba_ssm.ops.triton.layernorm_gated.RMSNorm', - Mamba2RMSNorm, create_dummy=True) - patch_manager.register_patch( - 'mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined', - create_dummy=True) - patch_manager.register_patch( - 'mamba_ssm.ops.triton.ssd_combined.mamba_split_conv1d_scan_combined', - create_dummy=True) + patch_manager.register_patch('mamba_ssm.ops.triton.layernorm_gated.RMSNorm', + Mamba2RMSNorm, create_dummy=True) + patch_manager.register_patch('mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined', + create_dummy=True) + patch_manager.register_patch('mamba_ssm.ops.triton.ssd_combined.mamba_split_conv1d_scan_combined', + create_dummy=True) - patch_manager.register_patch( - 'megatron.core.ssm.mamba_mixer.MambaMixer.__init__', - mamba_mixer_init_wrapper) - patch_manager.register_patch( - 'megatron.core.ssm.mamba_mixer.MambaMixer.forward', - mamba_mixer_forward) - patch_manager.register_patch( - 'megatron.core.ssm.mamba_block.MambaStack.forward', - mamba_block_forward) \ No newline at end of file + patch_manager.register_patch('megatron.core.ssm.mamba_mixer.MambaMixer.__init__', + mamba_mixer_init_wrapper) + patch_manager.register_patch('megatron.core.ssm.mamba_mixer.MambaMixer.forward', + mamba_mixer_forward) + patch_manager.register_patch('megatron.core.ssm.mamba_block.MambaStack.forward', + mamba_block_forward) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/moe/moe_allgather_overlap.py b/mindspeed_llm/features_manager/moe/moe_allgather_overlap.py new file mode 100644 index 0000000000..0e38cf62ed --- /dev/null +++ b/mindspeed_llm/features_manager/moe/moe_allgather_overlap.py @@ -0,0 +1,21 @@ +from mindspeed.features_manager.moe.moe_allgather_overlap import MoEAllGatherOverLapFeature as MindSpeedMoEAllGatherOverLapFeature + + +class MoEAllGatherOverLapFeature(MindSpeedMoEAllGatherOverLapFeature): + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.moe.moe_feature.adaptor 
import MindSpeedAllGatherOverlapMoeLayerAdaptor + from mindspeed.core.transformer.moe.moe_feature.overlap.moe_common import mlp_init, core_mlp_forward_wrapper + from mindspeed_llm.core.transformer.moe.moe_layer import parallel_transformer_layer_init_wrapper + + patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward', + core_mlp_forward_wrapper) + + if getattr(args, 'moe_token_dispatcher_type', None) == "allgather": + if args.moe_allgather_overlap_comm: + patch_manager.register_patch('megatron.core.transformer.moe.moe_layer.MoELayer', + MindSpeedAllGatherOverlapMoeLayerAdaptor) + patch_manager.register_patch('megatron.core.transformer.mlp.MLP.__init__', + mlp_init) + patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.__init__', + parallel_transformer_layer_init_wrapper) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/moe/moe_alltoallseq_overlap.py b/mindspeed_llm/features_manager/moe/moe_alltoallseq_overlap.py new file mode 100644 index 0000000000..895720ec38 --- /dev/null +++ b/mindspeed_llm/features_manager/moe/moe_alltoallseq_overlap.py @@ -0,0 +1,51 @@ +from mindspeed.features_manager.moe.moe_alltoallseq_overlap import MoEAlltoAllSeqOverLapFeature as MindSpeedMoEAlltoAllSeqOverLapFeature + + +class MoEAlltoAllSeqOverLapFeature(MindSpeedMoEAlltoAllSeqOverLapFeature): + + def validate_args(self, args): + self.incompatible_check(args, 'use_ascend_mc2') + if args.moe_alltoall_overlap_comm and not args.moe_token_dispatcher_type == 'alltoall_seq': + raise AssertionError('`--moe-alltoall-overlap-comm` is only supported with `--moe-token-dispatcher-type alltoall_seq`.') + if args.moe_alltoall_overlap_comm: + if not args.moe_permutation_async_comm: + raise AssertionError('`--moe-alltoall-overlap-comm` and `--moe-allgather-overlap-comm` are only supported with `--moe-permutation-async-comm`.') + if not args.moe_grouped_gemm: + raise AssertionError('`--moe-alltoall-overlap-comm` and `--moe-allgather-overlap-comm` are only supported with `--moe-grouped-gemm`.') + # Shared experts conversion & check. + if args.n_shared_experts is not None and args.moe_shared_expert_intermediate_size is None: + args.moe_shared_expert_intermediate_size = args.n_shared_experts * args.ffn_hidden_size + print(f'Using shared experts. Convert n_shared_experts to moe_shared_expert_intermediate_size, the moe_shared_expert_intermediate_size is {args.moe_shared_expert_intermediate_size}.') + elif args.n_shared_experts is None and args.moe_shared_expert_intermediate_size is not None: + args.n_shared_experts = args.moe_shared_expert_intermediate_size // args.ffn_hidden_size + print(f'Using shared experts. Convert moe_shared_expert_intermediate_size to n_shared_experts, the n_shared_experts is {args.n_shared_experts}.') + # Zero-memory check.
+ if args.moe_zero_memory_num_layers is not None: + num_layers_per_pipeline_stage = args.num_layers // args.pipeline_model_parallel_size + if args.moe_zero_memory_num_layers < 0 or args.moe_zero_memory_num_layers > num_layers_per_pipeline_stage: + raise AssertionError('`--moe-zero-memory-num-layers` must be between 0 and the number of layers per pipeline stage') + if args.moe_zero_memory == "disable": + raise AssertionError('`--moe-zero-memory` must be enabled when using `--moe-zero-memory-num-layers`') + if args.moe_zero_memory != "disable" and args.moe_allgather_overlap_comm: + raise AssertionError('`--moe-zero-memory` does not support `--moe-allgather-overlap-comm` for now.') + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.moe.moe_feature.adaptor import MindSpeedAlltoAllSeqOverlapMoeLayerAdaptor + from mindspeed.core.transformer.moe.moe_feature.overlap.moe_common import mlp_init, core_mlp_forward_wrapper + from mindspeed.core.transformer.moe.moe_feature.overlap.experts import Zero_Memory_SharedExpertMlp_forward + from mindspeed_llm.core.transformer.moe.moe_layer import parallel_transformer_layer_init_wrapper + + patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward', + core_mlp_forward_wrapper) + + if getattr(args, 'moe_token_dispatcher_type', None) == "alltoall_seq": + if args.moe_alltoall_overlap_comm: + patch_manager.register_patch('megatron.core.transformer.mlp.MLP.__init__', + mlp_init) + patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.__init__', + parallel_transformer_layer_init_wrapper) + patch_manager.register_patch('megatron.core.transformer.moe.moe_layer.MoELayer', + MindSpeedAlltoAllSeqOverlapMoeLayerAdaptor) + if args.moe_zero_memory != 'disable': + patch_manager.register_patch('megatron.core.transformer.moe.shared_experts.SharedExpertMLP.forward', + Zero_Memory_SharedExpertMlp_forward) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/common/moe_router.py b/mindspeed_llm/features_manager/moe/moe_router.py similarity index 47% rename from mindspeed_llm/features_manager/common/moe_router.py rename to mindspeed_llm/features_manager/moe/moe_router.py index 0c40fb7eb2..e7cc971879 100644 --- a/mindspeed_llm/features_manager/common/moe_router.py +++ b/mindspeed_llm/features_manager/moe/moe_router.py @@ -1,69 +1,58 @@ from mindspeed.features_manager.feature import MindSpeedFeature -class MOERouter(MindSpeedFeature): +class MoERouter(MindSpeedFeature): def __init__(self): - super(MOERouter, self).__init__(feature_name="moe_router", optimization_level=0) + super(MoERouter, self).__init__(feature_name="moe_router", optimization_level=0) + def register_args(self, parser): group = parser.add_argument_group(title=self.feature_name) - group.add_argument('--moe-router-load-balancing-type', type=str, - choices=["aux_loss", "group_limited_greedy", "softmax_topk", "pai_megatron_aux_loss", - "sparsemixer_topk", "noaux_tc", "seq_aux_loss", "sinkhorn", "none"], - default='aux_loss', - help='Determines the load balancing strategy for the router. "aux_loss" corresponds ' - 'to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds ' - 'to the balancing algorithm used in S-BASE, "softmax_topk" implies no load balancing and ' - 'softmax before topk , "None" implies no load balancing, and "group_limited_greedy" corresponds ' - 'to the Device-Limited Routing method in DeepSeekV2.
and "pai_megatron_aux_loss" corresponds ' - ' to the load balancing loss used in pai-megatron loss, "noaux_tc" corresponds to no aux loss ' - 'load balancing method in DeepSeekV3' - 'The default is "aux_loss".') - - group.add_argument('--moe-z-loss-coeff', type=float, default=None, - help='Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended.') - group.add_argument('--moe-expert-capacity-factor', type=float, default=None, - help='The capacity factor for each expert, None means no token will be dropped.') + self.add_parser_argument_choices_value(parser, "--moe-router-load-balancing-type", 'group_limited_greedy') + self.add_parser_argument_choices_value(parser, "--moe-router-load-balancing-type", 'softmax_topk') + self.add_parser_argument_choices_value(parser, "--moe-router-load-balancing-type", 'pai_megatron_aux_loss') + self.add_parser_argument_choices_value(parser, "--moe-router-load-balancing-type", 'sparsemixer_topk') + self.add_parser_argument_choices_value(parser, "--moe-router-load-balancing-type", 'noaux_tc') group.add_argument('--topk-group', type=int, default=None, - help='Choose topK group experts in group_limited_greedy_topK method') - group.add_argument('--routed-scaling-factor', type=float, default=None, help='The routed scaling factor') - group.add_argument('--norm-topk-prob', action='store_true', default=False, help='Normalize the topk weight') - group.add_argument('--moe-router-score-function', type=str, - choices=['softmax', 'sigmoid'], - default='softmax', - help='Score function for MoE TopK routing. Can be "softmax" or "sigmoid".') - group.add_argument('--moe-router-enable-expert-bias', action='store_true', - help='TopK routing with dynamic expert bias in the aux-loss-free load balancing strategy. ' - 'The routing decision is based on the sum of the routing scores and the expert bias. ') + help='Choose topK group experts in group_limited_greedy_topK method') + group.add_argument('--routed-scaling-factor', type=float, default=None, + help='The routed scaling factor') + group.add_argument('--norm-topk-prob', action='store_true', default=False, + help='Normalize the topk weight') group.add_argument('--n-group', type=int, default=None, - help='Number of groups for routed experts.' - 'Tips: in deepseek3, set n-group equal to EP to limit each token to experts on a subset of devices,' - 'set n-group equal to number of nodes in EP group to limit each token to experts on a subset of nodes.') - group.add_argument('--seq-aux', action='store_true', default=False, help='Compute aux loss in seq_aux') + help='Number of groups for routed experts.' 
+ 'Tips: in deepseek3, set n-group equal to EP to limit each token to experts on a subset of devices,' + 'set n-group equal to number of nodes in EP group to limit each token to experts on a subset of nodes.') + group.add_argument('--seq-aux', action='store_true', default=False, + help='Compute aux loss in seq_aux') group.add_argument('--moe-device-level-aux-loss-coeff', type=float, default=0., - help='set the coeff for devicie-level balance loss in deepseek moe') + help='set the coeff for device-level balance loss in deepseek moe') group.add_argument('--moe-comm-aux-loss-coeff', type=float, default=0., - help='set the coeff for communication balance loss in deepseek moe') + help='set the coeff for communication balance loss in deepseek moe') group.add_argument('--router-gating-in-fp32', action='store_true', default=False, - help='Compute router gating in float32.') - group.add_argument('--moe-router-bias-update-rate', type=float, default=1e-3, - help='Expert bias update rate in the aux-loss-free load balancing strategy. ' - 'The expert bias is updated based on the number of assigned tokens to each expert in a ' - 'global batch, where the bias is increased for the experts with less assigned tokens and ' - 'decreased for the experts with more assigned tokens. ' - 'The default value 1e-3 is same as that used in DeepSeekV3.') + help='Compute router gating in float32.') group.add_argument("--moe-revert-type-after-topk", action='store_true', - help="revert the type of logits after the topk has been computed") - group.add_argument("--fix-router", action='store_true', help="fix router for load balancing.") + help="revert the type of logits after the topk has been computed") + group.add_argument("--fix-router", action='store_true', + help="fix router for load balancing.") + def pre_validate_args(self, args): + self.origin_spec = None + self.origin_spec = args.spec + args.spec = None def validate_args(self, args): self._validate_moe_args(args) self._validate_group_limited_greedy(args) self._validate_aux_loss_free(args) + def post_validate_args(self, args): + if self.origin_spec: + args.spec = self.origin_spec + def _validate_moe_args(self, args): from mindspeed_llm.training.utils import print_rank0_by_args + if args.moe_expert_capacity_factor is not None: if args.moe_token_dispatcher_type == "allgather": raise ValueError(f'moe_expert_capacity_factor not works with allgather token dispatcher') @@ -103,13 +92,14 @@ class MOERouter(MindSpeedFeature): def register_patches(self, patch_manager, args): from ...core import (topk_router_forward, topk_router_routing, z_loss_func, topk_router_init_wrapper, topk_router_gating_func) - patch_manager.register_patch( - 'megatron.core.transformer.moe.router.TopKRouter.__init__', topk_router_init_wrapper) - patch_manager.register_patch( - 'megatron.core.transformer.moe.router.TopKRouter.routing', topk_router_routing) - patch_manager.register_patch( - 'megatron.core.transformer.moe.router.TopKRouter.forward', topk_router_forward) - patch_manager.register_patch( - 'megatron.core.transformer.moe.router.TopKRouter.gating', topk_router_gating_func) - patch_manager.register_patch( - 'megatron.core.transformer.moe.router.z_loss_func', z_loss_func) + + patch_manager.register_patch('megatron.core.transformer.moe.router.TopKRouter.__init__', + topk_router_init_wrapper) + patch_manager.register_patch('megatron.core.transformer.moe.router.TopKRouter.routing', + topk_router_routing) + patch_manager.register_patch('megatron.core.transformer.moe.router.TopKRouter.forward', +
topk_router_forward) + patch_manager.register_patch('megatron.core.transformer.moe.router.TopKRouter.gating', + topk_router_gating_func) + patch_manager.register_patch('megatron.core.transformer.moe.router.z_loss_func', + z_loss_func) diff --git a/mindspeed_llm/features_manager/moe/shared_expert.py b/mindspeed_llm/features_manager/moe/shared_expert.py new file mode 100644 index 0000000000..2daf38e45b --- /dev/null +++ b/mindspeed_llm/features_manager/moe/shared_expert.py @@ -0,0 +1,10 @@ +from argparse import ArgumentParser + +from mindspeed.features_manager.moe.shared_expert import MoESharedExpertsFeature as MindSpeedMoESharedExpertsFeature + + +class MoESharedExpertsFeature(MindSpeedMoESharedExpertsFeature): + def pre_validate_args(self, args): + # use megatron shared_experts replace + if args.n_shared_experts and args.moe_shared_expert_intermediate_size is None: + args.moe_shared_expert_intermediate_size = args.n_shared_experts * args.moe_ffn_hidden_size \ No newline at end of file diff --git a/mindspeed_llm/features_manager/moe/tp_extend_ep.py b/mindspeed_llm/features_manager/moe/tp_extend_ep.py new file mode 100644 index 0000000000..70860c5e62 --- /dev/null +++ b/mindspeed_llm/features_manager/moe/tp_extend_ep.py @@ -0,0 +1,13 @@ +from mindspeed.features_manager.moe.tp_extend_ep import MoETpExtendEpFeature as MindSpeedMoETpExtendEpFeature + + +class MoETpExtendEpFeature(MindSpeedMoETpExtendEpFeature): + + def register_patches(self, patch_manager, args): + from mindspeed.core.transformer.moe.moe_feature.adaptor import MindSpeedAlltoAllSEQTptoEpMoELayer + + if hasattr(args, 'moe_token_dispatcher_type') and args.moe_token_dispatcher_type == 'alltoall_seq': + if args.moe_tp_extend_ep: + if not args.moe_alltoall_overlap_comm: + patch_manager.register_patch('megatron.core.transformer.moe.moe_layer.MoELayer', + MindSpeedAlltoAllSEQTptoEpMoELayer) diff --git a/mindspeed_llm/features_manager/optimizer/fused_ema_adamw_feature.py b/mindspeed_llm/features_manager/optimizer/fused_ema_adamw_feature.py index 95db10537d..91019b8d57 100644 --- a/mindspeed_llm/features_manager/optimizer/fused_ema_adamw_feature.py +++ b/mindspeed_llm/features_manager/optimizer/fused_ema_adamw_feature.py @@ -10,10 +10,11 @@ class FusedEmaAdamwFeature(MindSpeedFusedEmaAdamwFeature): group = parser.add_argument_group(title=self.feature_name) group.add_argument('--ema-decay', type=float, default=0.9999, - help='Set ema_decay of fused_ema_adamw optimizer.') - group.add_argument('--optimizer-selection', type=str, default='fused_adamw', choices=['fused_adamw', 'fused_torch_adamw', 'fused_ema_adamw'], - help='Select from the former fused AdamW optimizer and Torch fused AdamW optimizer') + help='Set ema_decay of fused_ema_adamw optimizer.') + group.add_argument('--optimizer-selection', type=str, default='fused_adamw', + choices=['fused_adamw', 'fused_torch_adamw', 'fused_ema_adamw'], + help='Select from the former fused AdamW optimizer and Torch fused AdamW optimizer') group.add_argument('--optimization-level', type=int, choices=[0, 1, 2], default=2, - help='0: The minimum patch set for megatron to adapt to NPU,' - '1: Affinity optimization (fusion operator, etc.), ' - '2: Advanced acceleration algorithm') \ No newline at end of file + help='0: The minimum patch set for megatron to adapt to NPU,' + '1: Affinity optimization (fusion operator, etc.), ' + '2: Advanced acceleration algorithm') diff --git a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py index 
330250f658..658c5c201b 100644 --- a/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py +++ b/mindspeed_llm/features_manager/tokenizer/build_tokenizer.py @@ -14,17 +14,13 @@ TEMPLATES_DIR = str( class BuildTokenizerFeature(MindSpeedBuildTokenizerFeature): def register_args(self, parser: ArgumentParser): - self.add_parser_argument_choices_value( - parser, - "--tokenizer-type", - 'PretrainedFromHF' - ) + self.add_parser_argument_choices_value(parser, "--tokenizer-type", 'PretrainedFromHF') group = parser.add_argument_group(title=self.feature_name) group.add_argument("--tokenizer-name-or-path", type=str, default=None, - help="Name or path of the huggingface tokenizer.") + help="Name or path of the huggingface tokenizer.") group.add_argument("--tokenizer-not-use-fast", action='store_false', - help="HuggingFace tokenizer not use the fast version.") + help="HuggingFace tokenizer not use the fast version.") group.add_argument('--padded-vocab-size', type=int, default=None, help='set padded vocab size') group.add_argument('--prompt-type', type=str, default=None, @@ -34,3 +30,5 @@ class BuildTokenizerFeature(MindSpeedBuildTokenizerFeature): help='Which template to use for constructing prompts in training/inference.' 'e.g., "qwen"') group.add_argument('--prompt-type-path', type=str, default=TEMPLATES_DIR, help='Path to the json file of templates.') + group.add_argument('--tokenizer-padding-side', type=str, default='right', + help="tokenizer padding side") diff --git a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py index e8cd107e96..ce33e3b9dc 100644 --- a/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py +++ b/mindspeed_llm/features_manager/transformer/flash_attention/fusion_attention_feature.py @@ -13,8 +13,6 @@ class FusionAttentionFeature(MindSpeedFusionAttentionFeature): help='input shape order used by Flash attention') group.add_argument('--sliding-window', type=int, default=None, help='Window size when use sliding window attention.') - group.add_argument('--mla-fa-divide-qk', action='store_true', default=False, - help='Flash attn support mla with seperate q and k.') group.add_argument('--pre-tockens', type=int, default=65536, help='pre-tockens is used by Flash attention') group.add_argument('--next-tockens', type=int, default=0, diff --git a/mindspeed_llm/features_manager/transformer/mtp.py b/mindspeed_llm/features_manager/transformer/mtp.py index 700502d31b..e52105a46f 100644 --- a/mindspeed_llm/features_manager/transformer/mtp.py +++ b/mindspeed_llm/features_manager/transformer/mtp.py @@ -10,11 +10,11 @@ class MultiTokenPredictionFeature(MindSpeedFeature): group = parser.add_argument_group(title=self.feature_name) group.add_argument('--recompute-mtp-norm', action='store_true', default=False, - help='Multi-Token prediction recompute norm') + help='Multi-Token prediction recompute norm') group.add_argument('--recompute-mtp-layer', action='store_true', default=False, - help='Multi-Token prediction recompute layer') + help='Multi-Token prediction recompute layer') group.add_argument('--mtp-mem-efficient-logits', action='store_true', default=False, - help='Optimize ce_loss memory when use mtp block.') + help='Optimize ce_loss memory when use mtp block.') def register_patches(self, patch_manager, args): import megatron @@ -25,35 +25,28 @@ class MultiTokenPredictionFeature(MindSpeedFeature): # dualpipe do not need to init 
embedding weight from mindspeed_llm.core.models.common.embeddings.language_model_embedding import language_model_embedding_init_func - patch_manager.register_patch( - 'megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.__init__', - language_model_embedding_init_func) + patch_manager.register_patch('megatron.core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding.__init__', + language_model_embedding_init_func) # mtp compatibility megatron.core.transformer.multi_token_prediction.LNImpl = PTNorm - patch_manager.register_patch( - 'megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker', - mtp_reduce_loss_in_tracker) - patch_manager.register_patch( - 'megatron.core.transformer.multi_token_prediction.get_mtp_num_layers_to_build', - get_mtp_num_layers_to_build) - patch_manager.register_patch( - 'megatron.core.models.common.language_module.language_module.LanguageModule' - '.setup_embeddings_and_output_layer', - setup_embeddings_and_output_layer) + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MTPLossLoggingHelper.reduce_loss_in_tracker', + mtp_reduce_loss_in_tracker) + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.get_mtp_num_layers_to_build', + get_mtp_num_layers_to_build) + patch_manager.register_patch('megatron.core.models.common.language_module.language_module.LanguageModule' + '.setup_embeddings_and_output_layer', + setup_embeddings_and_output_layer) # change masked_target for better performance if args.mtp_mem_efficient_logits: from mindspeed_llm.core.tensor_parallel.cross_entropy import calculate_logits_max, calculate_predicted_logits - patch_manager.register_patch( - 'megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_logits_max', - calculate_logits_max) - patch_manager.register_patch( - 'megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits', - calculate_predicted_logits) + patch_manager.register_patch('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_logits_max', + calculate_logits_max) + patch_manager.register_patch('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits', + calculate_predicted_logits) else: from mindspeed.core.tensor_parallel.cross_entropy import calculate_predicted_logits - patch_manager.register_patch( - 'megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits', - calculate_predicted_logits) + patch_manager.register_patch('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits', + calculate_predicted_logits) # patch for mtp from mindspeed_llm.core.transformer.multi_token_prediction import ( mtp_layer_init_wrapper, @@ -61,15 +54,11 @@ class MultiTokenPredictionFeature(MindSpeedFeature): mtp_block_build_layers_wrapper, mtp_block_forward, ) - patch_manager.register_patch( - 'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.__init__', - mtp_layer_init_wrapper) - patch_manager.register_patch( - 'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.forward', - mtp_layer_forward) - patch_manager.register_patch( - 'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock._build_layers', - mtp_block_build_layers_wrapper) - patch_manager.register_patch( - 
'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.forward', - mtp_block_forward) \ No newline at end of file + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.__init__', + mtp_layer_init_wrapper) + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.forward', + mtp_layer_forward) + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock._build_layers', + mtp_block_build_layers_wrapper) + patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.forward', + mtp_block_forward) \ No newline at end of file diff --git a/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py new file mode 100644 index 0000000000..f69614f98e --- /dev/null +++ b/mindspeed_llm/features_manager/transformer/multi_latent_attention/mla_feature.py @@ -0,0 +1,51 @@ +from argparse import ArgumentParser, Namespace +from mindspeed.features_manager.feature import MindSpeedFeature + + +class MLAFeature(MindSpeedFeature): + + def __init__(self): + super().__init__('multi-latent-attention', optimization_level=2) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title='multi latent attention') + + group.add_argument('--padded-base-length', type=int, default=128, + help='Pad Q, K and V of multi-latent-attention to an integer multiple of this value.') + group.add_argument('--mla-fa-without-pad', action='store_true', default=False, + help='Do not pad v_head_dim to q_head_dim in MLA.') + group.add_argument('--mla-mm-split', action='store_true', default=False, + help='Split the 2 up-projection matmuls into 4 in MLA.') + group.add_argument("--mla-zero-memory", action='store_true', default=False, + help="Save activation memory in multi-latent-attention.") + group.add_argument("--mla-up-proj-tp-overlap", action='store_true', default=False, + help='Overlap the up-projection TP communication in MLA.') + group.add_argument("--recompute-mla-up-proj", action='store_true', default=False, + help='Recompute the up projection in MLA.') + group.add_argument('--mla-swap-core-attn-out', action='store_true', default=False, + help='Swap core_attn_out only in MLA.') + group.add_argument('--mla-fa-divide-qk', action='store_true', default=False, + help='Flash attention supports MLA with separate q and k.') + + def validate_args(self, args: Namespace): + if args.multi_head_latent_attention: + if args.kv_lora_rank is None: + raise AssertionError( + 'The parameter kv-lora-rank should be ' + 'set when using multi_head_latent_attention.' + ) + elif args.v_head_dim is None: + raise AssertionError( + 'The parameter v-head-dim should be ' + 'set when using multi_head_latent_attention.' + ) + elif args.qk_pos_emb_head_dim is None: + raise AssertionError( + 'The parameter qk-pos-emb-head-dim should be ' + 'set when using multi_head_latent_attention.' + ) + elif args.qk_head_dim is None: + raise AssertionError( + 'The parameter qk-head-dim should be ' + 'set when using multi_head_latent_attention.' 
+ ) diff --git a/mindspeed_llm/features_manager/transformer/transformer_block.py b/mindspeed_llm/features_manager/transformer/transformer_block.py new file mode 100644 index 0000000000..33cb9b809e --- /dev/null +++ b/mindspeed_llm/features_manager/transformer/transformer_block.py @@ -0,0 +1,18 @@ +from argparse import ArgumentParser +from mindspeed.features_manager.feature import MindSpeedFeature + + +class TransformerBlockFeature(MindSpeedFeature): + def __init__(self): + super(TransformerBlockFeature, self).__init__(feature_name="transformer-block", optimization_level=0) + + def register_args(self, parser: ArgumentParser): + group = parser.add_argument_group(title=self.feature_name) + group.add_argument('--first-k-dense-replace', type=int, default=None, + help='Set the first k layers as dense layers.') + + + def register_patches(self, patch_manager, args): + from mindspeed_llm.core.transformer.transformer_block import _transformer_block_build_layers + patch_manager.register_patch('megatron.core.transformer.transformer_block.TransformerBlock._build_layers', + _transformer_block_build_layers) \ No newline at end of file diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py index d30c2b87fb..4c165441c3 100644 --- a/mindspeed_llm/training/arguments.py +++ b/mindspeed_llm/training/arguments.py @@ -1324,6 +1324,8 @@ def _add_dummy_args_v2(args): args.enable_high_availability = False args.use_fused_mlp = False args.disable_gloo_group = False + args.recompute_activation_function = False + args.input_jitter = False def _validate_noop_layer(args): @@ -1530,6 +1532,7 @@ def validate_args_v2_decorator(megatron_validate_args): def wrapper(args, defaults=None): if defaults is None: defaults = {} + # run feature pre-validation and copy some args. 
MindSpeedFeaturesManager.pre_validate_features_args(args) diff --git a/tests/st/baseline_results/mixtral_tp1_pp4_ep2_drop_dpp.json b/tests/st/baseline_results/mixtral_tp1_pp4_ep2_drop_dpp.json deleted file mode 100644 index 4de1d3f1c4..0000000000 --- a/tests/st/baseline_results/mixtral_tp1_pp4_ep2_drop_dpp.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "lm loss": [ - 14.16477, - 14.13033, - 13.30181, - 11.25219, - 8.612155, - 8.868297, - 7.460348, - 7.226073, - 7.012052, - 6.687274, - 6.314083, - 6.058187, - 5.813196, - 5.88946, - 5.976941 - ], - "throughput": [ - 0.4, - 41.2, - 42.9, - 43.7, - 43.7, - 43.7, - 43.6, - 43.5, - 42.9, - 43.3, - 43.1, - 43.2, - 43.3, - 43.5, - 43.5 - ], - "memo info": [ - { - "rank": 0, - "allocated memory": 14080.53466796875, - "max allocated memory": 14528.53515625 - }, - { - "rank": 2, - "allocated memory": 25221.056640625, - "max allocated memory": 25669.05712890625 - }, - { - "rank": 4, - "allocated memory": 25221.056640625, - "max allocated memory": 25669.05712890625 - }, - { - "rank": 6, - "allocated memory": 14144.57763671875, - "max allocated memory": 14592.57763671875 - } - ] -} \ No newline at end of file diff --git a/tests/st/shell_scripts/mixtral_tp1_pp4_ep2_drop_dpp.sh b/tests/st/shell_scripts/mixtral_tp1_pp4_ep2_drop_dpp.sh deleted file mode 100644 index b8a97ccd8d..0000000000 --- a/tests/st/shell_scripts/mixtral_tp1_pp4_ep2_drop_dpp.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -NPUS_PER_NODE=8 -MASTER_ADDR=localhost -MASTER_PORT=6012 -NNODES=1 -NODE_RANK=0 -WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) - -basepath=$(cd `dirname $0`; cd ../../../; pwd) - -DISTRIBUTED_ARGS=" - --nproc_per_node $NPUS_PER_NODE \ - --nnodes $NNODES \ - --node_rank $NODE_RANK \ - --master_addr $MASTER_ADDR \ - --master_port $MASTER_PORT -" - -echo "NODE_RANK ${NODE_RANK}" - -DATA_PATH="/data/pretrain_dataset/alpaca_text_document" -TOKENIZER_MODEL="/data/mixtral-8-7b-hf/Mixtral-8x7B/tokenizer.model" -CKPT_LOAD_DIR="/data/Mixtral-8x7B-tp1pp4ep2/" - -TP=1 -PP=4 -EP=2 -NUM_LAYERS=6 -TRAIN_ITER=15 - - -MOE_ARGS=( - --num-experts 8 - --expert-model-parallel-size ${EP} - --moe-router-topk 2 - --moe-router-load-balancing-type aux_loss - --moe-aux-loss-coeff 0.01 - --moe-permutation-async-comm - --moe-expert-capacity-factor 0.5 - --moe-pad-expert-input-to-capacity - --moe-token-drop-policy probs - --moe-token-dispatcher-type alltoall_seq - --moe-layer-freq -1 - --first-k-dense-replace -1 -) - -ACCELERATE_ARGS=( - --tensor-model-parallel-size ${TP} - --pipeline-model-parallel-size ${PP} - --num-layer-list 1,2,2,1 - --sequence-parallel - --use-distributed-optimizer - --recompute-activation-function -) - -GPT_ARGS=( - --use-mcore-models - --disable-bias-linear - --seq-length 4096 - --max-position-embeddings 32768 - --num-layers ${NUM_LAYERS} - --hidden-size 4096 - --ffn-hidden-size 14336 - --num-attention-heads 32 - --init-method-std 0.01 - --attention-dropout 0.0 - --hidden-dropout 0.0 - --normalization RMSNorm - --position-embedding-type rope - --swiglu - --untie-embeddings-and-output-weights - --group-query-attention - --num-query-groups 8 - --no-position-embedding - --vocab-size 32000 - --rotary-base 1000000 - - --no-masked-softmax-fusion - --use-fused-rotary-pos-emb - --use-flash-attn - --use-fused-swiglu - --use-fused-rmsnorm - --no-check-for-nan-in-loss-and-grad - --overlap-grad-reduce - - --tokenizer-type Llama2Tokenizer - --tokenizer-model ${TOKENIZER_MODEL} - --micro-batch-size 1 - --global-batch-size 2 - --lr 1e-5 - --train-iters 
${TRAIN_ITER} - --lr-decay-iters 1280 - --lr-decay-style cosine - --min-lr 1.0e-6 - --weight-decay 0.1 - --lr-warmup-iters 2 - --clip-grad 1.0 - --bf16 -) - -DATA_ARGS=( - --data-path $DATA_PATH - --split 100,0,0 -) - -OUTPUT_ARGS=( - --log-interval 1 - --save-interval ${TRAIN_ITER} - --eval-interval ${TRAIN_ITER} - --eval-iters 0 - --no-load-optim - --no-load-rng - --no-save-optim - --no-save-rng - --load ${CKPT_LOAD_DIR} - --finetune - --log-throughput -) - -torchrun ${DISTRIBUTED_ARGS[@]} $basepath/pretrain_gpt.py \ - ${ACCELERATE_ARGS[@]} \ - ${MOE_ARGS[@]} \ - ${GPT_ARGS[@]} \ - ${DATA_ARGS[@]} \ - ${OUTPUT_ARGS[@]} \ - --distributed-backend nccl \ - | tee ${log_dir} -- Gitee
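
For context on the pattern this refactor moves toward: each feature class declares CLI arguments, validates them, and registers monkey-patches keyed by dotted Megatron paths, and a features manager drives that flow. Below is a minimal, self-contained Python sketch of the idea; PatchManagerSketch, FeatureSketch and FirstKDenseSketch are illustrative stand-ins (assumed names, not the real MindSpeed or Megatron classes), and only the dotted patch-target string is copied from the diff above.

# Illustrative sketch only -- stand-in classes, not the real MindSpeed API.
from argparse import ArgumentParser, Namespace


class PatchManagerSketch:
    """Collects dotted-path -> replacement mappings, as register_patches does."""

    def __init__(self):
        self.patches = {}

    def register_patch(self, dotted_path: str, replacement):
        self.patches[dotted_path] = replacement


class FeatureSketch:
    """Minimal stand-in for a feature: a name plus hooks for args and patches."""

    def __init__(self, feature_name: str, optimization_level: int = 0):
        self.feature_name = feature_name
        self.optimization_level = optimization_level

    def register_args(self, parser: ArgumentParser):
        pass

    def validate_args(self, args: Namespace):
        pass

    def register_patches(self, patch_manager, args: Namespace):
        pass


class FirstKDenseSketch(FeatureSketch):
    """Mirrors the shape of TransformerBlockFeature above (simplified)."""

    def __init__(self):
        super().__init__("transformer-block", optimization_level=0)

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--first-k-dense-replace', type=int, default=None,
                           help='Set the first k layers as dense layers.')

    def register_patches(self, patch_manager, args: Namespace):
        def patched_build_layers(*layer_args, **layer_kwargs):
            ...  # a real feature would supply the replacement implementation

        # Dotted target string taken from the diff; the replacement is a stub.
        patch_manager.register_patch(
            'megatron.core.transformer.transformer_block.TransformerBlock._build_layers',
            patched_build_layers)


if __name__ == '__main__':
    # Manager-side flow: collect args, validate them, then apply patches.
    parser = ArgumentParser()
    feature = FirstKDenseSketch()
    feature.register_args(parser)
    args = parser.parse_args(['--first-k-dense-replace', '1'])
    feature.validate_args(args)
    patches = PatchManagerSketch()
    feature.register_patches(patches, args)
    print(sorted(patches.patches))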