From 3188ad2bb6af4746f123a58ae5c6b029363ba510 Mon Sep 17 00:00:00 2001
From: xinyuan
Date: Wed, 18 Jun 2025 03:36:13 +0000
Subject: [PATCH] !2834 [mindspore][bugfix] fix_forward_step_patch

Merge pull request !2834 from xinyuan/fix_forward_step_patch
---
 .../core/pipeline_parallel/schedules.py      |  2 +-
 mindspeed_llm/mindspore/mindspore_adaptor.py | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/mindspeed_llm/core/pipeline_parallel/schedules.py b/mindspeed_llm/core/pipeline_parallel/schedules.py
index 42db59a18..cbc710c5b 100644
--- a/mindspeed_llm/core/pipeline_parallel/schedules.py
+++ b/mindspeed_llm/core/pipeline_parallel/schedules.py
@@ -107,4 +107,4 @@ def forward_step_wrapper(fn):
             MTPLossAutoScaler.set_loss_scale(loss_scale / num_microbatches)
         return output, num_tokens
 
-    return wrapper
+    return wrapper
\ No newline at end of file
diff --git a/mindspeed_llm/mindspore/mindspore_adaptor.py b/mindspeed_llm/mindspore/mindspore_adaptor.py
index a140ee171..03ff0b99e 100644
--- a/mindspeed_llm/mindspore/mindspore_adaptor.py
+++ b/mindspeed_llm/mindspore/mindspore_adaptor.py
@@ -185,7 +185,7 @@ class MindSporeAdaptation(MegatronAdaptationABC):
 
         #mindspeed
-        
+
         from mindspeed.mindspore.core.pipeline_parallel.dualpipev.dualpipev_schedules import backward_step_with_model_graph, set_shared_embedding_from_dual_chunk, forward_step_with_model_graph, get_shared_embedding_from_dual_chunk, forward_backward_pipelining_with_cutinhalf
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.dualpipev.dualpipev_schedules.backward_step_with_model_graph', backward_step_with_model_graph)
@@ -208,7 +208,7 @@ class MindSporeAdaptation(MegatronAdaptationABC):
                                          transformer_block_forward)
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.transformer_block.transformer_block_forward_backward_overlaping',
                                          transformer_block_forward_backward_overlaping)
-        
+
         from mindspeed.mindspore.core.pipeline_parallel.fb_overlap.adaptor import _make_param_hook
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.adaptor._make_param_hook', _make_param_hook)
@@ -263,7 +263,7 @@ class MindSporeAdaptation(MegatronAdaptationABC):
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.overlap_funcs.fwdbwd.transformer_layer_forward_dense_backward_dense_overlaping',
                                          transformer_layer_forward_dense_backward_dense_overlaping)
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.overlap_funcs.fwdbwd.transformer_layer_forward_moe_backward_moe_overlaping',
-                                          transformer_layer_forward_moe_backward_moe_overlaping)
+                                         transformer_layer_forward_moe_backward_moe_overlaping)
 
         from mindspeed.mindspore.core.pipeline_parallel.fb_overlap.modules.weight_grad_store import overlap_matmul
@@ -274,12 +274,12 @@ class MindSporeAdaptation(MegatronAdaptationABC):
 
         from mindspeed.mindspore.core.transformer.moe.comm_utils import async_all_to_all
         MindSporeAdaptation.register('mindspeed.core.transformer.moe.comm_utils.async_all_to_all', async_all_to_all)
-        
+
         from mindspeed.mindspore.core.pipeline_parallel.fb_overlap.modules.token_dispatcher import alltoall_token_perm1, overlap_stream
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.modules.token_dispatcher.alltoall_token_perm1',
-                                          alltoall_token_perm1)
+                                         alltoall_token_perm1)
         MindSporeAdaptation.register('mindspeed.core.pipeline_parallel.fb_overlap.modules.token_dispatcher.overlap_stream',
-                                          overlap_stream)
+                                         overlap_stream)
@@ -344,6 +344,10 @@ class MindSporeAdaptation(MegatronAdaptationABC):
         from mindspeed_llm.mindspore.core.transformer.dot_product_attention import flash_attention_forward
         MindSporeAdaptation.register('mindspeed_llm.core.transformer.dot_product_attention.flash_attention_forward', flash_attention_forward)
 
+        from mindspeed.mindspore.core.pipeline_parallel.schedules import deallocate_output_tensor_
+        MindSporeAdaptation.register('megatron.core.pipeline_parallel.schedules.deallocate_output_tensor',
+                                         deallocate_output_tensor_)
+
     @staticmethod
     def reparse_args():
         """
-- 
Gitee
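
A note on the mechanism this patch relies on: each MindSporeAdaptation.register(path, fn) call rebinds the function at a dotted module path to a MindSpore-specific replacement, so the final hunk swaps Megatron's deallocate_output_tensor for the MindSpore variant deallocate_output_tensor_. The register implementation itself is not part of this patch; the following is only a minimal sketch of how such dotted-path patching could work, with the helper name and signature assumed for illustration:

    # Hypothetical sketch; the real MindSporeAdaptation.register may differ.
    import importlib

    def register(orig_func_path, new_func):
        """Rebind the attribute named by a dotted path to new_func."""
        module_path, _, attr_name = orig_func_path.rpartition('.')
        module = importlib.import_module(module_path)  # e.g. megatron.core.pipeline_parallel.schedules
        setattr(module, attr_name, new_func)           # later module-level lookups resolve to new_func

Rebinding only affects lookups through the module namespace: code that has already bound the original function via a from-import keeps its old reference, which is why adaptor frameworks like this one apply registrations before the patched modules are imported anywhere else.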