From eb9298739791f3ad5fab7e935d3ac13d783e48f6 Mon Sep 17 00:00:00 2001
From: zhanggaolu
Date: Mon, 25 Aug 2025 20:39:10 +0800
Subject: [PATCH] Add support for the mindspeed expert load-balancing (expert placement) feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Register ExpertsPlacementFeature with the MoE feature list. When
args.enable_expert_placement is set, attach the param-to-MLP-module map to the
optimizer (or to every sub-optimizer of a chained optimizer) after setup, run
expert weight and optimizer state placement at each training iteration, and
optionally print the per-expert load when args.print_expert_load is set.
---
 mindspeed_llm/features_manager/__init__.py |  2 ++
 mindspeed_llm/training/training.py         | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mindspeed_llm/features_manager/__init__.py b/mindspeed_llm/features_manager/__init__.py
index 0fba65ab5..01419ce68 100644
--- a/mindspeed_llm/features_manager/__init__.py
+++ b/mindspeed_llm/features_manager/__init__.py
@@ -13,6 +13,7 @@ from mindspeed.features_manager import (
     MoEAllGatherOverLapFeature,
     MoEFwdBwdOverlapFeature,
     MoEGmmFeature,
+    ExpertsPlacementFeature,
     MoEZeroMemoryFeature,
     OptimizeSendRecvCommFeature,
     SwapOptimizerFeature,
@@ -210,6 +211,7 @@ def add_moe_features(features_list: List[MindSpeedFeature]):
         MoEFwdBwdOverlapFeature(),
         MoEAlltoAllOverLapFeature(),
         MoEZeroMemoryFeature(),
+        ExpertsPlacementFeature(),
     ])
 
 
diff --git a/mindspeed_llm/training/training.py b/mindspeed_llm/training/training.py
index f2f55a599..ec7ab205f 100644
--- a/mindspeed_llm/training/training.py
+++ b/mindspeed_llm/training/training.py
@@ -60,6 +60,11 @@ from megatron.core.distributed import DistributedDataParallel as DDP
 from megatron.core.distributed import finalize_model_grads
 from mindspeed_llm.training.initialize import set_jit_fusion_options
 from mindspeed_llm.tasks.posttrain.lora.utils import is_enable_lora
+from mindspeed.core.transformer.moe.expert_placement.planner import print_expert_load
+from mindspeed.core.transformer.moe.expert_placement.executor import (
+    build_param_params_module_mlp_map,
+    expert_weight_and_optimizer_state_placement
+)
 
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
@@ -292,6 +297,14 @@ def build_train_args(*input_args):
         model_provider_func = model_provider
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
         model_provider_func, model_type)
+    # Build the parameter -> MoE MLP module map used by expert placement.
+    if args.enable_expert_placement:
+        params_module_mlp_map = build_param_params_module_mlp_map(model)
+        if hasattr(optimizer, "chained_optimizers"):
+            for optimizer_sub in optimizer.chained_optimizers:
+                optimizer_sub.params_module_mlp_map = params_module_mlp_map
+        else:
+            optimizer.params_module_mlp_map = params_module_mlp_map
     timers('model-and-optimizer-setup').stop()
     print_datetime('after model, optimizer, and learning rate '
                    'scheduler are built')
@@ -604,6 +617,11 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
 
         update_num_microbatches(args.consumed_train_samples, consistency_check=True)
         args.curr_iteration = iteration
+        if args.enable_expert_placement:
+            expert_weight_and_optimizer_state_placement(args, model, optimizer)
+            if args.print_expert_load:
+                print_expert_load(args, model, iteration)
+
         loss_dict, skipped_iter, should_checkpoint, should_exit, exit_code, grad_norm, num_zeros_in_grad = \
             train_step(forward_step_func,
                        train_data_iterator,
-- 
Gitee
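
Illustration (kept after the "-- " trailer, which git am treats as the end of
the patch): a minimal sketch of the optimizer attachment pattern used in
build_train_args() above. It assumes only what the diff shows, namely that a
chained optimizer exposes a chained_optimizers list of sub-optimizers and that
the expert-placement executor later reads params_module_mlp_map from each
optimizer it touches. The helper name attach_params_module_mlp_map is
hypothetical and not part of the patch.

    def attach_params_module_mlp_map(optimizer, params_module_mlp_map):
        # A chained optimizer wraps several sub-optimizers (for example dense
        # and expert-parallel parameter groups); each one needs its own
        # reference to the param -> MLP-module map so that optimizer state can
        # be relocated together with the expert weights during placement.
        sub_optimizers = getattr(optimizer, "chained_optimizers", None)
        if sub_optimizers is not None:
            for sub_optimizer in sub_optimizers:
                sub_optimizer.params_module_mlp_map = params_module_mlp_map
        else:
            # Plain (non-chained) optimizer: attach the map directly.
            optimizer.params_module_mlp_map = params_module_mlp_map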