diff --git a/mindspeed_llm/mindspore/mindspore_adaptor.py b/mindspeed_llm/mindspore/mindspore_adaptor.py index 6586c2c3abf280254d9ef3e0d30d3c5d88bd3e46..3d039c790d77e3be504c91553a78f00a3d95465a 100644 --- a/mindspeed_llm/mindspore/mindspore_adaptor.py +++ b/mindspeed_llm/mindspore/mindspore_adaptor.py @@ -14,7 +14,7 @@ class MindSporeAdaptation(MegatronAdaptationABC): from mindspeed.mindspore.core.data_parallel.distributed_data_parallel import distributed_data_parallel_init_with_cp from mindspeed.mindspore.core.transformer.moe.experts import groupedmlp_init_wrapper, groupedmlp_forward - MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel', GPTModel) + MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel', GPTModel, force_patch=True) MegatronAdaptation.register('megatron.core.distributed.distributed_data_parallel.DistributedDataParallel.__init__', distributed_data_parallel_init_with_cp, force_patch=True) MegatronAdaptation.register('megatron.core.transformer.moe.moe_layer.MoELayer.__init__',