From a1fc98461d59575e293efd899279e5b5c835612f Mon Sep 17 00:00:00 2001 From: yuhui Date: Wed, 12 Mar 2025 17:33:18 +0800 Subject: [PATCH] =?UTF-8?q?deepseek3=E6=9D=83=E9=87=8D=E8=BD=AC=E6=8D=A2?= =?UTF-8?q?=E5=8F=82=E6=95=B0=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mcore/deepseek3/README.md | 2 +- .../ckpt_convert_deepseek3_hf2mcore.sh | 10 ++++++---- .../ckpt_convert_deepseek3_mcore2hf.sh | 17 +++++++++-------- .../mcore/deepseek3/convert_ckpt_deepseek3.py | 7 ++++--- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/examples/mcore/deepseek3/README.md b/examples/mcore/deepseek3/README.md index b8dfe411d..d7d3014b7 100644 --- a/examples/mcore/deepseek3/README.md +++ b/examples/mcore/deepseek3/README.md @@ -41,7 +41,7 @@ 【--num-nextn-predict-layers】 -MTP层的层数。如不需要MTP层,可设置为0。最大可设置为1。默认值为1。 +MTP层的层数。如不需要MTP层,可设置为0。最大可设置为1。默认值为0。 MTP层权重默认存储在最后一个pp stage。 【--num-layers】 diff --git a/examples/mcore/deepseek3/ckpt_convert_deepseek3_hf2mcore.sh b/examples/mcore/deepseek3/ckpt_convert_deepseek3_hf2mcore.sh index cdc402bad..2acf4da7b 100644 --- a/examples/mcore/deepseek3/ckpt_convert_deepseek3_hf2mcore.sh +++ b/examples/mcore/deepseek3/ckpt_convert_deepseek3_hf2mcore.sh @@ -3,11 +3,13 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh python examples/mcore/deepseek3/convert_ckpt_deepseek3.py \ --moe-grouped-gemm \ + --target-tensor-parallel-size 2 \ --target-pipeline-parallel-size 8 \ - --target-expert-parallel-size 8 \ + --target-expert-parallel-size 32 \ --load-dir ./model_from_hf/deepseek3-bf16-hf \ --save-dir ./model_weights/deepseek3-mcore \ - --num-layers 61 \ + --num-layers 64 \ --num-nextn-predict-layers 1 \ - --num-layer-list 7,7,8,8,8,8,8,7 - # --num-layer-list, --noop-layers, --num-layers-per-virtual-pipeline-stage等参数根据任务需要进行配置 + --num-layers-per-virtual-pipeline-stage 2 \ + --noop-layers 47,62,63 + # --num-layer-list, --moe-tp-extend-ep 等参数根据任务需要进行配置 diff --git 
a/examples/mcore/deepseek3/ckpt_convert_deepseek3_mcore2hf.sh b/examples/mcore/deepseek3/ckpt_convert_deepseek3_mcore2hf.sh index 7b7d881c1..07df6d4e0 100644 --- a/examples/mcore/deepseek3/ckpt_convert_deepseek3_mcore2hf.sh +++ b/examples/mcore/deepseek3/ckpt_convert_deepseek3_mcore2hf.sh @@ -2,13 +2,14 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh python examples/mcore/deepseek3/convert_ckpt_deepseek3_mcore2hf.py \ - --source-tensor-parallel-size 1 \ - --source-pipeline-parallel-size 4 \ - --source-expert-parallel-size 8 \ + --moe-grouped-gemm \ + --source-tensor-parallel-size 2 \ + --source-pipeline-parallel-size 8 \ + --source-expert-parallel-size 32 \ --load-dir ./model_weights/deepseek3-mcore \ --save-dir ./model_from_hf/deepseek3-hf \ - --num-layers 61 \ - --first-k-dense-replace 3 \ - --num-layer-list 16,15,15,15 \ - --num-nextn-predict-layers 1 + --num-layers 64 \ + --num-nextn-predict-layers 1 \ + --num-layers-per-virtual-pipeline-stage 2 \ + --noop-layers 47,62,63 + # --num-layer-list, --moe-tp-extend-ep 等参数根据任务需要进行配置 diff --git a/examples/mcore/deepseek3/convert_ckpt_deepseek3.py b/examples/mcore/deepseek3/convert_ckpt_deepseek3.py index 7c90f47ca..ecea082fa 100644 --- a/examples/mcore/deepseek3/convert_ckpt_deepseek3.py +++ b/examples/mcore/deepseek3/convert_ckpt_deepseek3.py @@ -299,8 +299,9 @@ class CkptConvert(object): self.qlora_nf4_quant(mg_model, ep_rank, tp_rank, f"mtp_layers.{mtp_layer_idx}.eh_proj.weight", eh_proj_lst[tp_rank].clone()) - if not self.share_mtp_embedding_and_output_weight or self.vpp_stage is not None: - mg_model[ep_rank][tp_rank][f"mtp_layers.{mtp_layer_idx}.embedding.word_embeddings.weight"] = emb_lst[tp_rank].clone() + if not self.share_mtp_embedding_and_output_weight or self.pp_size > 1: + mg_model[ep_rank][tp_rank][f"mtp_layers.{mtp_layer_idx}.embedding.word_embeddings.weight"] = \ + emb_lst[tp_rank].clone() def 
set_mtp_postprocess(self, hf_layer_idx, mtp_layer_idx, weights_dict, mg_model): """MTP layer postprocess""" @@ -702,7 +703,7 @@ def get_args(): parser.add_argument('--moe-grouped-gemm', action='store_true', help='Usr moe grouped gemm.') parser.add_argument("--noop-layers", type=str, default=None, help='Specity the noop layers.') - parser.add_argument('--num-nextn-predict-layers', type=int, default=1, help='Multi-Token prediction layer num') + parser.add_argument('--num-nextn-predict-layers', type=int, default=0, help='Multi-Token prediction layer num') parser.add_argument('--num-layer-list', type=str, help='a list of number of layers, seperated by comma; e.g., 4,4,4,4') parser.add_argument('--num-layers', type=int, default=61, -- Gitee