From c7c792cf82d0fd9e7f8013ad0db91a8a5fa3a783 Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Sun, 25 May 2025 17:00:50 +0800 Subject: [PATCH 1/7] docs fix bug --- README.md | 17 ++- SECURITYNOTE.md | 5 +- docs/pytorch/features/yarn.md | 2 +- docs/pytorch/models/models_evaluation.md | 107 +++++++++--------- docs/pytorch/models/ssm_model.md | 13 +-- docs/pytorch/solutions/checkpoint_convert.md | 6 +- .../finetune/instruction_finetune.md | 2 + docs/pytorch/solutions/inference/chat.md | 5 +- docs/pytorch/solutions/inference/inference.md | 2 +- .../preference-alignment/offline_dpo.md | 2 +- docs/quick_start.md | 32 +++--- 11 files changed, 103 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 75e28d945..e5b5c494e 100644 --- a/README.md +++ b/README.md @@ -512,12 +512,19 @@ MindSpeed LLM包含分布式预训练、分布式微调、分布式偏好对齐 QLoRA微调 - / - / - / - / - / + CCLoRA + ❌ + ❌ + ❌ + 【NAIE】 + + Fused_MLP + ❌ + ❌ + ❌ + 【NAIE】 + 长序列微调 长序列CP diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index da7ad08f1..b0a199bde 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -68,12 +68,11 @@ MindSpeed-LLM 暂时未发布wheel包,无正式对外公开接口,所有功 ## 通信安全加固 -[通信安全加固说明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md#%E9%80%9A%E4%BF%A1%E5%AE%89%E5%85%A8%E5%8A%A0%E5%9B%BA -) +[通信安全加固说明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md#%E9%80%9A%E4%BF%A1%E5%AE%89%E5%85%A8%E5%8A%A0%E5%9B%BA) ## 通信矩阵 -### [通信矩阵说明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md#%E9%80%9A%E4%BF%A1%E7%9F%A9%E9%98%B5%E4%BF%A1%E6%81%AF) +[通信矩阵说明](https://gitee.com/ascend/pytorch/blob/master/SECURITYNOTE.md#%E9%80%9A%E4%BF%A1%E7%9F%A9%E9%98%B5%E4%BF%A1%E6%81%AF) ### 特殊场景 | 场景 | 使用方法 | 端口 | 可能的风险 | diff --git a/docs/pytorch/features/yarn.md b/docs/pytorch/features/yarn.md index 9d1818781..c923031b3 100644 --- a/docs/pytorch/features/yarn.md +++ b/docs/pytorch/features/yarn.md @@ -41,7 +41,7 @@ yarn通过ntk-by-part调整位置编码,提升序列扩增后的精度。 | DeepSeek-V2-Lite-16B | MMLU | 57.4% | [58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | | DeepSeek-Math-7B |MMLU-STEM| 56.5% | [56.5%](https://github.com/deepseek-ai/DeepSeek-Math) | | DeepSeek-V2-236B | MMLU | 78.1% | [78.5%](https://huggingface.co/deepseek-ai/DeepSeek-V2) | -| DeepSeek-V2.5 | MMLU | 79.3% | [80.6%](https://github.com/deepseek-ai/DeepSeek-V3) | +| DeepSeek-V2.5 | MMLU | 79.3% | [80.6%](https://huggingface.co/deepseek-ai/DeepSeek-V2.5) | diff --git a/docs/pytorch/models/models_evaluation.md b/docs/pytorch/models/models_evaluation.md index 87f0691e8..841e26067 100644 --- a/docs/pytorch/models/models_evaluation.md +++ b/docs/pytorch/models/models_evaluation.md @@ -1,60 +1,63 @@ MindSpeed-LLM 支持大模型在公开基准数据集上进行准确率评估,当前支持的 Benchmark 如下: -| Benchmark | 下载链接 | 验证集 | MindSpeed-LLM | OpenCompass | -|-----------|---------------------------------------------------------------------------|------|-----------------------------------------------------------------------------|------------------------------------------------------------------| -| MMLU | [GitHub](https://people.eecs.berkeley.edu/~hendrycks/data.tar) | test | [45.73%](../../../examples/mcore/llama2/evaluate_llama2_7b_mmlu_ptd.sh) | [45.3%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | -| CEval | [HuggingFace](https://huggingface.co/datasets/ceval/ceval-exam/tree/main) | val | [33.87%](../../../examples/mcore/llama2/evaluate_llama2_7b_ceval_ptd.sh) | [32.5%](https://hub.opencompass.org.cn/dataset-detail/C-Eval) | -| BoolQ | [Juhe](https://www.juhe.cn/market/product/id/10243) | dev | 
[75.44%](../../../examples/mcore/llama2/evaluate_llama2_7b_boolq_ptd.sh) | [74.9%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | -| BBH | [GitHub](https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh) | test | [34.4%](../../../examples/mcore/llama2/evaluate_llama2_7b_bbh_ptd.sh) | [32.5%](https://hub.opencompass.org.cn/dataset-detail/BBH) | -| AGIEval | [GitHub](https://github.com/ruixiangcui/AGIEval/tree/main) | test | [20.6%](../../../examples/mcore/llama2/evaluate_llama2_7b_agieval_ptd.sh) | [20.6%](https://hub.opencompass.org.cn/dataset-detail/AGIEval) | -| HumanEval | [GitHub](https://github.com/openai/human-eval/tree/master/data) | test | [12.8%](../../../examples/mcore/llama2/evaluate_llama2_7b_humaneval_ptd.sh) | [12.2%](https://hub.opencompass.org.cn/dataset-detail/HumanEval) | - +| Benchmark | 下载链接 | 验证集 | MindSpeed-LLM | OpenCompass | +|-------------|---------------------------------------------------------------------------|------|-----------------------------------------------------------------------------|------------------------------------------------------------------| +| MMLU | [GitHub](https://people.eecs.berkeley.edu/~hendrycks/data.tar) | test | [45.73%](../../../examples/mcore/llama2/evaluate_llama2_7b_mmlu_ptd.sh) | [45.3%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | +| CEval | [HuggingFace](https://huggingface.co/datasets/ceval/ceval-exam/tree/main) | val | [33.87%](../../../examples/mcore/llama2/evaluate_llama2_7b_ceval_ptd.sh) | [32.5%](https://hub.opencompass.org.cn/dataset-detail/C-Eval) | +| BoolQ | [Juhe](https://www.juhe.cn/market/product/id/10243) | dev | [75.44%](../../../examples/mcore/llama2/evaluate_llama2_7b_boolq_ptd.sh) | [74.9%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | +| BBH | [GitHub](https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh) | test | [34.4%](../../../examples/mcore/llama2/evaluate_llama2_7b_bbh_ptd.sh) | [32.5%](https://hub.opencompass.org.cn/dataset-detail/BBH) | +| AGIEval | [GitHub](https://github.com/ruixiangcui/AGIEval/tree/main) | test | [20.6%](../../../examples/mcore/llama2/evaluate_llama2_7b_agieval_ptd.sh) | [20.6%](https://hub.opencompass.org.cn/dataset-detail/AGIEval) | +| HumanEval | [GitHub](https://github.com/openai/human-eval/tree/master/data) | test | [12.8%](../../../examples/mcore/llama2/evaluate_llama2_7b_humaneval_ptd.sh) | [12.2%](https://hub.opencompass.org.cn/dataset-detail/HumanEval) | +| CMMLU | [HuggingFace](https://huggingface.co/datasets/haonan-li/cmmlu/tree/main) | test | -- | -- | +| GSM8k | [HuggingFace](https://huggingface.co/datasets/openai/gsm8k/tree/main) | -- | -- | -- | +| Hellaswag | [GitHub](https://github.com/rowanz/hellaswag) | -- | -- | -- | +| Needlebench | [HuggingFace](https://huggingface.co/datasets/opencompass/NeedleBench/tree/main) | -- | -- | -- | MindSpeed-LLM 已支持的大模型评估数据统计如下: -| 模型 | 任务 | MindSpeed-LLM | 社区 | 模型 | 任务 | MindSpeed-LLM | 社区 | -|----------------------|--------|-----------|-----------------------------------------------------------------------|------------------|--------|-----------|------------------------------------------------------------------------------------| -| Aquila-7B | BoolQ | 77.3% | -- | Aquila2-7B | BoolQ | 77.8% | -- | -| Aquila2-34B | BoolQ | 88.0% | -- | Baichuan-7B | BoolQ | 69.0% | [67.0%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | -| Baichuan-13B | BoolQ | 74.7% | [73.6%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | Baichuan2-7B | BoolQ | 70.0% | 
[63.2%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | -| Baichuan2-13B | BoolQ | 78.0% | [67.0%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | Bloom-7B | MMLU | 25.1% | -- | -| Bloom-176B | BoolQ | 64.5% | -- | ChatGLM3-6B | MMLU | 61.5% | -- | -| GLM4-9B | MMLU | 74.5% | [74.7%](https://huggingface.co/THUDM/glm-4-9b) | CodeQwen1.5-7B | Human. | 54.8% | [51.8%](https://qwenlm.github.io/zh/blog/codeqwen1.5/) | -| CodeLLaMA-34B | Human. | 48.8% | [48.8%](https://paperswithcode.com/sota/code-generation-on-humaneval) | Gemma-2B | MMLU | 39.6% | -- | -| Gemma-7B | MMLU | 52.2% | -- | InternLM-7B | MMLU | 48.7% | [51.0%](https://huggingface.co/internlm/internlm-7b) | -| Gemma2-9B | MMLU | 70.7% | [71.3%](https://huggingface.co/google/gemma-2-9b) | Gemma2-27B | MMLU | 75.5% | [75.2%](https://huggingface.co/google/gemma-2-27b) | -| LLaMA-7B | BoolQ | 74.6% | [75.4%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | LLaMA-13B | BoolQ | 79.6% | [78.7%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | -| LLaMA-33B | BoolQ | 83.2% | [83.1%](https://paperswithcode.com/sota/question-answering-on-boolq) | LLaMA-65B | BoolQ | 85.7% | [86.6%](https://paperswithcode.com/sota/question-answering-on-boolq) | -| LLaMA2-7B | MMLU | 45.7% | -- | LLaMA2-13B | BoolQ | 82.2% | [81.7%](https://paperswithcode.com/sota/question-answering-on-boolq) | -| LLaMA2-34B | BoolQ | 82.0% | -- | LLaMA2-70B | BoolQ | 86.4% | -- | -| LLaMA3-8B | MMLU | 65.2% | -- | LLaMA3-70B | BoolQ | 78.4% | -- | -| LLaMA3.1-8B | MMLU | 65.3% | -- | LLaMA3.1-70B | MMLU | 81.8% | -- | -| LLaMA3.2-1B | MMLU | 31.8% | [32.2%](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B) | LLaMA3.2-3B | MMLU | 56.3% | [58.0%](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B) | -| Mistral-7B | MMLU | 56.3% | -- | Mixtral-8x7B | MMLU | 70.6% | [70.6%](https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu) | -| Mistral-8x22B | MMLU | 77% | [77.8%](https://mistral.ai/news/mixtral-8x22b/) | MiniCPM-MoE-8x2B | BoolQ | 83.9% | -- | -| QWen-7B | MMLU | 58.1% | [58.2%](https://huggingface.co/Qwen/Qwen-7B) | Qwen-14B | MMLU | 65.3% | [66.3%](https://huggingface.co/Qwen/Qwen-14B) | -| QWen-72B | MMLU | 74.6% | [77.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-0.5B | MMLU | 39.1% | -- | -| QWen1.5-1.8b | MMLU | 46.2% | [46.8%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-4B | MMLU | 59.0% | [56.1%](https://qwenlm.github.io/zh/blog/qwen1.5) | -| QWen1.5-7B | MMLU | 60.3% | [61.0%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-14B | MMLU | 67.3% | [67.6%](https://qwenlm.github.io/zh/blog/qwen1.5) | -| QWen1.5-32B | MMLU | 72.5% | [73.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-72B | MMLU | 76.4% | [77.5%](https://qwenlm.github.io/zh/blog/qwen1.5) | -| Qwen1.5-110B | MMLU | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Yi-34B | MMLU | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | -| QWen2-0.5B | MMLU | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-1.5B | MMLU | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) | -| QWen2-7B | MMLU | 70.3% | [70.3%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-57B-A14B | MMLU | 75.6% | [76.5%](https://qwenlm.github.io/zh/blog/qwen2/) | -| QWen2-72B | MMLU | 83.6% | [84.2%](https://qwenlm.github.io/zh/blog/qwen2/) | MiniCPM-2B | MMLU | 51.6% | [53.4%](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#3) | -| DeepSeek-V2-Lite-16B | MMLU | 58.1% | 
[58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | QWen2.5-0.5B | MMLU | 47.67% | [47.5%](https://qwenlm.github.io/blog/qwen2.5-llm/) | -| QWen2.5-1.5B | MMLU | 59.4% | [60.9%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-3B | MMLU | 65.6% | [65.6%](https://qwenlm.github.io/blog/qwen2.5-llm/) | -| QWen2.5-7B | MMLU | 73.8% | [74.2%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-14B | MMLU | 79.4% | [79.7%](https://qwenlm.github.io/blog/qwen2.5-llm/) | -| QWen2.5-32B | MMLU | 83.3% | [83.3%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-72B | MMLU | 85.59% | [86.1%](https://qwenlm.github.io/blog/qwen2.5-llm/) | -| InternLM2.5-1.8b | MMLU | 51.3% | [53.5%](https://huggingface.co/internlm/internlm2_5-1_8b) | InternLM2.5-7B | MMLU | 71.6% | [71.6%](https://huggingface.co/internlm/internlm2_5-7b) | -| InternLM2.5-20b | MMLU | 73.3% | [74.2%](https://huggingface.co/internlm/internlm2_5-20b) | InternLM3-8b | MMLU | 76.6% | [76.6%](https://huggingface.co/internlm/internlm3-8b-instruct) | -| Yi1.5-6B | MMLU | 63.2% | [63.5%](https://huggingface.co/01-ai/Yi-1.5-6B/tree/main) | Yi1.5-9B | MMLU | 69.2% | [69.5%](https://huggingface.co/01-ai/Yi-1.5-9B/tree/main) | -| Yi1.5-34B | MMLU | 76.9% | [77.1%](https://huggingface.co/01-ai/Yi-1.5-34B/tree/main) | CodeQWen2.5-7B | Human. | 66.5% | [61.6%](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B) | -| Qwen2.5-Math-7B |MMLU-STEM| 67.8% | [67.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) | Qwen2.5-Math-72B |MMLU-STEM| 83.7% | [82.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) | -| MiniCPM3-4B | MMLU | 63.7% | 64.6% | Phi-3.5-mini-instruct | MMLU | 64.39% | 64.34% | -| Phi-3.5-MoE-instruct | MMLU | 78.5% | [78.9%](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct) | DeepSeek-Math-7B |MMLU-STEM| 56.5% | [56.5%](https://github.com/deepseek-ai/DeepSeek-Math) | -| DeepSeek-V2.5 | MMLU | 79.3% | [80.6%](https://github.com/deepseek-ai/DeepSeek-V3) | DeepSeek-V2-236B | MMLU | 78.1% | [78.5%](https://huggingface.co/deepseek-ai/DeepSeek-V2) | -| LLaMA3.3-70B-Instruct | MMLU | 82.7% | -- | QwQ-32B | MMLU | 81.19% | -- | +| 模型 | 任务 | MindSpeed-LLM | 社区 | 模型 | 任务 | MindSpeed-LLM | 社区 | +|----------------------|------------|-----------|-----------------------------------------------------------------------|------------------|--------|-----------|------------------------------------------------------------------------------------| +| Aquila-7B | BoolQ | 77.3% | -- | Aquila2-7B | BoolQ | 77.8% | -- | +| Aquila2-34B | BoolQ | 88.0% | -- | Baichuan-7B | BoolQ | 69.0% | [67.0%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | +| Baichuan-13B | BoolQ | 74.7% | [73.6%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | Baichuan2-7B | BoolQ | 70.0% | [63.2%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | +| Baichuan2-13B | BoolQ | 78.0% | [67.0%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | Bloom-7B | MMLU | 25.1% | -- | +| Bloom-176B | BoolQ | 64.5% | -- | ChatGLM3-6B | MMLU | 61.5% | -- | +| GLM4-9B | MMLU | 74.5% | [74.7%](https://huggingface.co/THUDM/glm-4-9b) | CodeQwen1.5-7B | Human. 
| 54.8% | [51.8%](https://qwenlm.github.io/zh/blog/codeqwen1.5/) | +| CodeLLaMA-34B | HumanEval | 48.8% | [48.8%](https://paperswithcode.com/sota/code-generation-on-humaneval) | Gemma-2B | MMLU | 39.6% | -- | +| Gemma-7B | MMLU | 52.2% | -- | InternLM-7B | MMLU | 48.7% | [51.0%](https://huggingface.co/internlm/internlm-7b) | +| Gemma2-9B | MMLU | 70.7% | [71.3%](https://huggingface.co/google/gemma-2-9b) | Gemma2-27B | MMLU | 75.5% | [75.2%](https://huggingface.co/google/gemma-2-27b) | +| LLaMA-7B | BoolQ | 74.6% | [75.4%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | LLaMA-13B | BoolQ | 79.6% | [78.7%](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | +| LLaMA-33B | BoolQ | 83.2% | [83.1%](https://paperswithcode.com/sota/question-answering-on-boolq) | LLaMA-65B | BoolQ | 85.7% | [86.6%](https://paperswithcode.com/sota/question-answering-on-boolq) | +| LLaMA2-7B | MMLU | 45.7% | -- | LLaMA2-13B | BoolQ | 82.2% | [81.7%](https://paperswithcode.com/sota/question-answering-on-boolq) | +| LLaMA2-34B | BoolQ | 82.0% | -- | LLaMA2-70B | BoolQ | 86.4% | -- | +| LLaMA3-8B | MMLU | 65.2% | -- | LLaMA3-70B | BoolQ | 78.4% | -- | +| LLaMA3.1-8B | MMLU | 65.3% | -- | LLaMA3.1-70B | MMLU | 81.8% | -- | +| LLaMA3.2-1B | MMLU | 31.8% | [32.2%](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B) | LLaMA3.2-3B | MMLU | 56.3% | [58.0%](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B) | +| Mistral-7B | MMLU | 56.3% | -- | Mixtral-8x7B | MMLU | 70.6% | [70.6%](https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu) | +| Mistral-8x22B | MMLU | 77% | [77.8%](https://mistral.ai/news/mixtral-8x22b/) | MiniCPM-MoE-8x2B | BoolQ | 83.9% | -- | +| QWen-7B | MMLU | 58.1% | [58.2%](https://huggingface.co/Qwen/Qwen-7B) | Qwen-14B | MMLU | 65.3% | [66.3%](https://huggingface.co/Qwen/Qwen-14B) | +| QWen-72B | MMLU | 74.6% | [77.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-0.5B | MMLU | 39.1% | -- | +| QWen1.5-1.8b | MMLU | 46.2% | [46.8%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-4B | MMLU | 59.0% | [56.1%](https://qwenlm.github.io/zh/blog/qwen1.5) | +| QWen1.5-7B | MMLU | 60.3% | [61.0%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-14B | MMLU | 67.3% | [67.6%](https://qwenlm.github.io/zh/blog/qwen1.5) | +| QWen1.5-32B | MMLU | 72.5% | [73.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-72B | MMLU | 76.4% | [77.5%](https://qwenlm.github.io/zh/blog/qwen1.5) | +| Qwen1.5-110B | MMLU | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Yi-34B | MMLU | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | +| QWen2-0.5B | MMLU | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-1.5B | MMLU | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) | +| QWen2-7B | MMLU | 70.3% | [70.3%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-57B-A14B | MMLU | 75.6% | [76.5%](https://qwenlm.github.io/zh/blog/qwen2/) | +| QWen2-72B | MMLU | 83.6% | [84.2%](https://qwenlm.github.io/zh/blog/qwen2/) | MiniCPM-2B | MMLU | 51.6% | [53.4%](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#3) | +| DeepSeek-V2-Lite-16B | MMLU | 58.1% | [58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | QWen2.5-0.5B | MMLU | 47.67% | [47.5%](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| QWen2.5-1.5B | MMLU | 59.4% | [60.9%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-3B | MMLU | 65.6% | [65.6%](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| QWen2.5-7B | MMLU | 73.8% | 
[74.2%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-14B | MMLU | 79.4% | [79.7%](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| QWen2.5-32B | MMLU | 83.3% | [83.3%](https://qwenlm.github.io/blog/qwen2.5-llm/) | QWen2.5-72B | MMLU | 85.59% | [86.1%](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| InternLM2.5-1.8b | MMLU | 51.3% | [53.5%](https://huggingface.co/internlm/internlm2_5-1_8b) | InternLM2.5-7B | MMLU | 71.6% | [71.6%](https://huggingface.co/internlm/internlm2_5-7b) | +| InternLM2.5-20b | MMLU | 73.3% | [74.2%](https://huggingface.co/internlm/internlm2_5-20b) | InternLM3-8b | MMLU | 76.6% | [76.6%](https://huggingface.co/internlm/internlm3-8b-instruct) | +| Yi1.5-6B | MMLU | 63.2% | [63.5%](https://huggingface.co/01-ai/Yi-1.5-6B/tree/main) | Yi1.5-9B | MMLU | 69.2% | [69.5%](https://huggingface.co/01-ai/Yi-1.5-9B/tree/main) | +| Yi1.5-34B | MMLU | 76.9% | [77.1%](https://huggingface.co/01-ai/Yi-1.5-34B/tree/main) | CodeQWen2.5-7B | Human. | 66.5% | [61.6%](https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B) | +| Qwen2.5-Math-7B | MMLU-STEM | 67.8% | [67.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) | Qwen2.5-Math-72B |MMLU-STEM| 83.7% | [82.8%](https://github.com/QwenLM/Qwen2.5-Math/tree/main/) | +| MiniCPM3-4B | MMLU | 63.7% | 64.6% | Phi-3.5-mini-instruct | MMLU | 64.39% | 64.34% | +| Phi-3.5-MoE-instruct | MMLU | 78.5% | [78.9%](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct) | DeepSeek-Math-7B |MMLU-STEM| 56.5% | [56.5%](https://github.com/deepseek-ai/DeepSeek-Math) | +| DeepSeek-V2.5 | MMLU | 79.3% | [80.6%](https://github.com/deepseek-ai/DeepSeek-V3) | DeepSeek-V2-236B | MMLU | 78.1% | [78.5%](https://huggingface.co/deepseek-ai/DeepSeek-V2) | +| LLaMA3.3-70B-Instruct | MMLU | 82.7% | -- | QwQ-32B | MMLU | 81.19% | -- | ## 评估指导手册 @@ -77,5 +80,5 @@ MindSpeed-LLM 评估操作指导手册请见链接:[evaluation_guide.md](../so [agi评估介绍](../solutions/evaluation/evaluation_datasets/agi_evaluation.md) -[human_eval_evaluation.md](../solutions/evaluation/evaluation_datasets/human_eval_evaluation.md) +[humanEval评估介绍](../solutions/evaluation/evaluation_datasets/human_eval_evaluation.md) diff --git a/docs/pytorch/models/ssm_model.md b/docs/pytorch/models/ssm_model.md index d61bab2ec..d3ce95f20 100644 --- a/docs/pytorch/models/ssm_model.md +++ b/docs/pytorch/models/ssm_model.md @@ -16,9 +16,9 @@ - Mamba2 + Mamba2 2.7B - mamba2 + mamba2 4K Mcore 1x8 @@ -34,9 +34,9 @@ 【test】 - Mamba2Hybrid + Mamba2Hybrid 8B - mamba2 + mamba2 4K Mcore 1x8 @@ -47,7 +47,4 @@ ## 以上模型脚本环境变量声明: -HCCL_CONNECT_TIMEOUT:设置HCCL超时时间,默认值为120
-CUDA_DEVICE_MAX_CONNECTIONS:定义了任务流能够利用或映射到的硬件队列的数量
-PYTORCH_NPU_ALLOC_CONF:内存碎片优化开关,默认是expandable_segments:False,使能时expandable_segments:True
-NPUS_PER_NODE: 配置一个计算节点上使用的NPU数量
\ No newline at end of file +关于脚本的环境变量定义见[environment_variable.md](../features/environment_variable.md)。 \ No newline at end of file diff --git a/docs/pytorch/solutions/checkpoint_convert.md b/docs/pytorch/solutions/checkpoint_convert.md index 4c3f1d531..465f4f882 100644 --- a/docs/pytorch/solutions/checkpoint_convert.md +++ b/docs/pytorch/solutions/checkpoint_convert.md @@ -15,7 +15,7 @@ - [Megatron-LM权重转换到Huggingface格式](#22-megatron-lm权重转换到huggingface格式) - 将Huggingface模型权重转换为Megatron-LM格式,适用于不同框架间的模型迁移。 + 将Megatron-LM模型权重转换为Huggingface格式,适用于不同框架间的模型迁移。 - [Megatron-LM格式权重互转](#23-megatron-lm格式权重互转) @@ -29,7 +29,7 @@ - [mcore格式权重合并](#242-megatron-mcore格式权重合并) - 支持将legacy格式的Lora微调权重与基础模型权重合并,转换为Megatron或Huggingface格式; + 支持将mcore格式的Lora微调权重与基础模型权重合并,转换为Megatron或Huggingface格式; - [lora权重转换为Huggingface格式](#243-lora权重转换为huggingface权重) @@ -49,7 +49,7 @@ **训练并行策略权重转换**:支持多种训练并行策略之间的权重转换,包括 张量并行、流水线并行、专家并行、流水并行动态划分 和 虚拟流水并行 等。无论是针对不同并行策略的训练,还是需要在不同策略之间切换的场景,都能实现灵活的权重转换,以适应各种训练和推理需求。 -**Lora权重合并与转换**:支持将 Lora 权重与 Base 权重合并,简化了模型推理过程中的加载步骤。合并后的模型可直接用于推理,显著提升了推理效率,减少了不必要的计算资源消耗。支持将Lora微调权重单独转为Huggingface格式,以支持客户下游任务。 +**Lora权重合并与转换**:支持将 Lora 权重与 Base 权重合并,简化了模型推理过程中的加载步骤。合并后的模型可直接用于推理,显著提升了推理效率,减少了不必要的计算资源消耗。支持将Lora微调权重单独转为Huggingface格式,以支持客户下游任务。 **优化器权重转换**:支持多种并行切分策略,确保优化器状态在不同并行策略间的迁移与兼容,便于在不同训练环境下进行优化器状态恢复。 diff --git a/docs/pytorch/solutions/finetune/instruction_finetune.md b/docs/pytorch/solutions/finetune/instruction_finetune.md index 8a17c162d..47bad36e4 100644 --- a/docs/pytorch/solutions/finetune/instruction_finetune.md +++ b/docs/pytorch/solutions/finetune/instruction_finetune.md @@ -20,6 +20,8 @@ `ShareGPT`数据预处理部分详见[**ShareGPT风格数据的说明文档**](datasets/sharegpt_dataset.md)。 +`Pairwise`数据预处理部分详见[**Pairwise风格数据的说明文档**](datasets/pairwise_dataset.md)。 + **接下来将以Alpaca数据集作为输入,进行全参数微调示例。** ### 初始化环境变量 diff --git a/docs/pytorch/solutions/inference/chat.md b/docs/pytorch/solutions/inference/chat.md index 007965403..881d8ce49 100644 --- a/docs/pytorch/solutions/inference/chat.md +++ b/docs/pytorch/solutions/inference/chat.md @@ -13,8 +13,7 @@ `source /usr/local/Ascend/nnal/atb/set_env.sh` ### 启动脚本 - -使用LLaMA2-7B模型目录下的chat脚本。 +使用LLaMA2-7B模型目录下的[chat脚本](../../../../examples/mcore/llama2/chat_llama2_7b_ptd.sh)。 #### 填写相关路径 @@ -58,7 +57,7 @@ bash examples/mcore/llama2/chat_llama2_7b_ptd.sh #### Chat对话脚本相关参数 -对轮对话脚本与流式推理相关的参数设置可以在[`流式推理`](inference.md)文件内查看。 +多轮对话脚本与流式推理相关的参数设置可以在[`流式推理`](inference.md)文件内查看。 【--task】 diff --git a/docs/pytorch/solutions/inference/inference.md b/docs/pytorch/solutions/inference/inference.md index e8a05df64..c27cb4e95 100644 --- a/docs/pytorch/solutions/inference/inference.md +++ b/docs/pytorch/solutions/inference/inference.md @@ -34,7 +34,7 @@ HuggingFace: "I hope you are doing well. 
I am writing to ask for your help with ### 启动脚本 -使用LLaMA2-7B模型目录下的流式推理脚本。 +使用LLaMA2-7B模型目录下的[流式推理脚本](../../../../examples/mcore/llama2/generate_llama2_7b_ptd.sh)。 #### 填写相关路径 diff --git a/docs/pytorch/solutions/preference-alignment/offline_dpo.md b/docs/pytorch/solutions/preference-alignment/offline_dpo.md index 19a933927..ef357e9b3 100644 --- a/docs/pytorch/solutions/preference-alignment/offline_dpo.md +++ b/docs/pytorch/solutions/preference-alignment/offline_dpo.md @@ -64,7 +64,7 @@ dpo训练脚本参照:[dpo_llama3_8b_full_ptd.sh](../../../../examples/mcore/l - **`--refer-model-iter`** - 可选参数,指定参考模型初始权重的迭代步数,默认为1。在DPO的断点续训场景中指定此参数以区分续训时参考模型和训练模型导入的权重,训练模型将导入训练中断钱保存的权重,参考模型将导入refer_model_iter指定迭代步数的权重,以保证断点续训时导入的参考模型权重是初始的,而不是训练过的。 + 可选参数,指定参考模型初始权重的迭代步数,默认为1。在DPO的断点续训场景中指定此参数以区分续训时参考模型和训练模型导入的权重,训练模型将导入训练中断前保存的权重,参考模型将导入refer_model_iter指定迭代步数的权重,以保证断点续训时导入的参考模型权重是初始的,而不是训练过的。 ### DPO-LORA diff --git a/docs/quick_start.md b/docs/quick_start.md index 2f2338cfb..87810a207 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -12,15 +12,17 @@ # 1 环境搭建 -如果已经完成了环境搭建,请跳转2,进行预训练任务拉起 +如果已经完成了环境搭建,请跳转2,进行预训练任务拉起。 + +请参考MindSpeed-LLM仓首页[“版本配套表”](../README.md#版本配套表),选择下载对应版本的软件依赖,请参考[install_guide.md](features/install_guide.md)完成环境搭建,本章节通过配图辅助您完成环境安装。 ## 1.1 驱动固件安装 -请参考MindSpeed-LLM仓首页[“版本配套表”](https://gitee.com/ascend/MindSpeed-LLM/blob/2.0.0/README.md#版本配套表),选择下载对应版本的软件依赖。 -下载[驱动固件](https://www.hiascend.com/hardware/firmware-drivers/community?product=4&model=26&cann=8.0.0.beta1&driver=Ascend+HDK+24.1.0),请根据系统和硬件产品型号选择对应版本的`driver`和`firmware`下载到本地并上传到服务器任意目录,驱动固件下载示意图如下: +下载[驱动固件](https://www.hiascend.com/hardware/firmware-drivers/community),请根据系统和硬件产品型号选择对应版本的`driver`和`firmware`下载到本地并上传到服务器任意目录,驱动固件下载示意图如下: + ![](../sources/images/quick_start/1747637900628_image.png) -参考[安装NPU驱动固件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/softwareinst/instg/instg_0005.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)或执行以下命令安装: +参考[安装NPU驱动固件](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/softwareinst/instg/instg_0005.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit),选择对应的版本,执行以下命令安装: ```shell # 因为版本迭代,包名存在出入,根据实际下载的包进行修改 @@ -32,10 +34,10 @@ bash Ascend-hdk-910b-npu-driver_24.1.0.3_linux-aarch64.run --full --force ## 1.2 CANN安装 -也可使用仓库提供的docker镜像,参考指导见[DOCKER_GUIDE](https://gitee.com/ascend/MindSpeed-LLM/blob/2.0.0/docs/features/docker_guide.md),拉取镜像完成安装后,跳到2.huggleface源文件获取。 - 下载[CANN](https://www.hiascend.com/developer/download/community/result?module=cann),请根据系统选择`aarch64`或`x86_64`对应版本的`cann-toolkit`、`cann-kernel`和`cann-nnal`下载到本地并上传到服务器任意目录。相关软件下载示意图如下: + ![](../sources/images/quick_start/1747480182437_image.png) + 参考[CANN安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/softwareinst/instg/instg_0008.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit)完成cann包安装,若缺少依赖请参考[CANN依赖安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/softwareinst/instg/instg_0007.html?Mode=PmIns&OS=Ubuntu&Software=cannToolKit): ```shell @@ -49,7 +51,7 @@ bash Ascend-cann-nnal_8.1.RC1_linux-aarch64.run --install ## 1.3 PTA安装 -准备[torch_npu](https://www.hiascend.com/developer/download/community/result?module=pt)和[apex](https://gitee.com/ascend/apex),参考[Ascend Extension for PyTorch 配置与安装](https://www.hiascend.com/document/detail/zh/Pytorch/700/configandinstg/instg/insg_0004.html)或执行以下命令安装 
+准备[torch_npu](https://www.hiascend.com/developer/download/community/result?module=pt)和[apex](https://gitee.com/ascend/apex),参考[Ascend Extension for PyTorch](https://www.hiascend.com/document/detail/zh/Pytorch/700/configandinstg/instg/insg_0001.html)或执行以下命令安装 ```shell # 安装torch和torch_npu,因为版本迭代,包名存在出入,根据实际修改 @@ -72,7 +74,7 @@ source /usr/local/Ascend/nnal/atb/set_env.sh # 安装MindSpeed加速库 git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed -git checkout 2.0.0_core_r0.8.0 # checkout commit from MindSpeed 2.0.0_core_r0.8.0 +git checkout master # 以install_guide.md中的版本为准,此处仅做参考 pip install -r requirements.txt pip3 install -e . cd .. @@ -81,10 +83,10 @@ cd .. git clone https://gitee.com/ascend/MindSpeed-LLM.git git clone https://github.com/NVIDIA/Megatron-LM.git # megatron从github下载,请确保网络能访问 cd Megatron-LM -git checkout core_r0.8.0 +git checkout core_r0.8.0 # 以install_guide.md中的版本为准,此处仅做参考 cp -r megatron ../MindSpeed-LLM/ cd ../MindSpeed-LLM -git checkout 2.0.0 +git checkout master # 以install_guide.md中的版本为准,此处仅做参考 pip install -r requirements.txt # 安装其余依赖库 ``` @@ -122,6 +124,7 @@ sha256sum model-00003-of-00004.safetensors sha256sum model-00004-of-00004.safetensors ``` ![img.png](../sources/images/quick_start/sha256.png) + ![img_1.png](../sources/images/quick_start/sha256_hf.png) # 3 预训练实战 @@ -135,7 +138,7 @@ sha256sum model-00004-of-00004.safetensors ## 3.1 权重转换 昇腾MindSpeed-LLM要求模型权重采用Megatron-LM格式,在这里我们将原始HuggingFace权重格式转换为Megatron-Mcore格式。 -详见[hf2mg权重转换](https://gitee.com/ascend/MindSpeed-LLM/blob/2.0.0/docs/features/checkpoint.md#21-huggingface%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2%E5%88%B0megatron-lm%E6%A0%BC%E5%BC%8F) +详见[hf2mg权重转换](./pytorch/solutions/checkpoint_convert.md#21-huggingface权重转换到megatron-lm格式) 使用官方提供的转换脚本,获取对应切分的mg权重。 @@ -186,7 +189,7 @@ python convert_ckpt.py \ ## 3.2 预训练数据集处理 -通过对各种格式的数据做提前预处理,避免原始数据的反复处理加载,将所有的数据都统一存储到为.bin和.idx两个文件中,详见[预训练数据处理](https://gitee.com/ascend/MindSpeed-LLM/blob/2.0.0/docs/features/pretrain_dataset.md#%E6%95%B0%E6%8D%AE%E9%9B%86%E5%A4%84%E7%90%86) +通过对各种格式的数据做提前预处理,避免原始数据的反复处理加载,将所有的数据都统一存储到为.bin和.idx两个文件中,详见[预训练数据处理](./pytorch/solutions/pretrain/pretrain_dataset.md) 常用的预训练数据集包括alpaca、enwiki、c4等,链接中提供了数据集下载地址。 @@ -323,10 +326,12 @@ TOKENIZER_PATH="./model_from_hf/qwen2.5-7b-hf/" ## 常见问题 - **问题1:训练日志显示"Checkpoint path not found"?** → 检查`CKPT_LOAD_DIR`是否指向正确的权重转换后路径,确认文件夹内包含`.ckpt`或`.bin`文件。 + ![img_1.png](../sources/images/quick_start/img_1.png) **问题2:显示数据集加载out of range?** → 微调脚本,没有读取到数据集,请检查DATA_PATH是否符合上面示例的规范。 + ![img_3.png](../sources/images/quick_start/img_3.png) - **问题3:训练脚本拉起失败?** @@ -334,12 +339,13 @@ TOKENIZER_PATH="./model_from_hf/qwen2.5-7b-hf/" - **问题4:没有生成运行日志文件?** → 需要自行创建logs文件夹。 + ![img_2.png](../sources/images/quick_start/img_2.png) ## 加入昇腾开发者生态 - 🌐 **社区资源**:访问[昇腾开源社区](https://gitee.com/ascend)获取最新模型支持 -- 📈 **性能优化**:参考[MindSpeed Profiling](https://gitee.com/ascend/MindSpeed/blob/master/docs/profiling.md)分析瓶颈 +- 📈 **性能优化**:参考[MindSpeed Profiling](pytorch/features/profiling.md)分析瓶颈 - 💡 **定制需求**:通过`model_cfg.json`扩展自定义模型 --- -- Gitee From 42f93bbb0bf1ace0339dc939c2222770c5fe7a71 Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Tue, 27 May 2025 11:00:47 +0800 Subject: [PATCH 2/7] update evaluate_guide --- docs/pytorch/solutions/evaluation/evaluation_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pytorch/solutions/evaluation/evaluation_guide.md b/docs/pytorch/solutions/evaluation/evaluation_guide.md index fc93aba45..d08169922 100644 --- a/docs/pytorch/solutions/evaluation/evaluation_guide.md +++ 
b/docs/pytorch/solutions/evaluation/evaluation_guide.md @@ -22,7 +22,7 @@ DATA_PATH="./mmlu/data/test/" TASK="mmlu" # 支持 mmlu、ceval、agieval、bbh、boolq、human_eval # 启动评估脚本 -bash examples/mcore/llama2/evaluate_llama2_7B_mmlu_ptd.sh +bash examples/mcore/llama2/evaluate_llama2_7b_mmlu_ptd.sh ``` 【--max-new-tokens】 -- Gitee From 021545beef78ed0195b1ce05e26580f4718cdcc7 Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Tue, 27 May 2025 19:17:09 +0800 Subject: [PATCH 3/7] update --- docs/pytorch/solutions/pretrain/pretrain.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pytorch/solutions/pretrain/pretrain.md b/docs/pytorch/solutions/pretrain/pretrain.md index a2152d437..01f19bbe2 100644 --- a/docs/pytorch/solutions/pretrain/pretrain.md +++ b/docs/pytorch/solutions/pretrain/pretrain.md @@ -72,7 +72,7 @@ TOKENIZER_MODEL="./model_from_hf/llama-2-7b-hf/tokenizer.model" 格式二(根据数据集的长度推出数据集的权重) ```shell ---data-path dataset1-path dataset2-path +--data-path "dataset1-path dataset2-path" ``` **示例:** -- Gitee From ae00b5c497b78c665aac261c8ac9ed86b38fa7bc Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Thu, 29 May 2025 18:34:38 +0800 Subject: [PATCH 4/7] =?UTF-8?q?moe-alltoall-overlap-comm=E8=A7=A3=E8=80=A6?= =?UTF-8?q?gemm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../solutions/finetune/lora_finetune.md | 2 +- .../solutions/finetune/qlora_finetune.md | 5 +- .../tasks/posttrain/lora/moe/experts.py | 76 ++++++++++++++----- .../tune_qwen3_30b_a3b_lora_ptd.sh | 7 +- 4 files changed, 65 insertions(+), 25 deletions(-) diff --git a/docs/pytorch/solutions/finetune/lora_finetune.md b/docs/pytorch/solutions/finetune/lora_finetune.md index eec81712b..cc713c1bf 100644 --- a/docs/pytorch/solutions/finetune/lora_finetune.md +++ b/docs/pytorch/solutions/finetune/lora_finetune.md @@ -213,7 +213,7 @@ bash examples/legacy/llama2/evaluate_llama2_7B_lora_mmlu_ptd.sh - **对话模板选择**:根据模型和数据需求选择合适模板,确保微调和评估一致性。 - **语言匹配**:设定数据集语言以优化评估效果。 - **内置模板使用**:启用 `--hf-chat-template` 时简化输入格式,确保评估的准确性。 -- **使用限制**:MOE 场景下,Lora 微调开启 `--moe-grouped-gemm` GMM算子时需同步开启 `--moe-alltoall-overlap-comm` 特性。 +- MOE 场景下,Lora 微调支持开启 `--moe-grouped-gemm` GMM算子,同时支持开启 `--moe-alltoall-overlap-comm` 特性。 ## 参考文献 - [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) \ No newline at end of file diff --git a/docs/pytorch/solutions/finetune/qlora_finetune.md b/docs/pytorch/solutions/finetune/qlora_finetune.md index ae5c4e9bc..61f7219a0 100644 --- a/docs/pytorch/solutions/finetune/qlora_finetune.md +++ b/docs/pytorch/solutions/finetune/qlora_finetune.md @@ -79,7 +79,7 @@ QLoRA在LoRA的基础上,对主干部分的权重进行量化,大幅降低 将原精度的hf权重转换为的mg权重时,可以通过增加`--qlora-nf4`选项开启QLoRA的NF4量化,会得到量化压缩后的mg权重,目前不支持其它量化方式。 -注意:目前QLoRA特性与`--moe-grouped-gemm`同时开启时,会导致大量的MoE模型中的专家权重无法被量化,影响整体量化压缩比。 +> 目前QLoRA特性支持开启 `--moe-grouped-gemm` GMM算子以及 `--moe-alltoall-overlap-comm` 特性。 ### 2、QLoRA微调 @@ -106,7 +106,8 @@ QLoRA在LoRA的基础上,对主干部分的权重进行量化,大幅降低 ## 使用限制 -* MOE 场景下,Lora 微调暂不支持开启 --moe-grouped-gemm 使用 gmm 算子。 * QLoRA 暂不支持 lora-fusion 特性,开启时无性能收益。 +> 目前QLoRA特性支持开启 `--moe-grouped-gemm` GMM算子以及 `--moe-alltoall-overlap-comm` 特性。 + > QLoRA支持分布式LoRA、PP、TP、VPP、CP、SP、重计算等LoRA支持的特性,并且精度正常,更多特性的亲和性还在补充验证中。 diff --git a/mindspeed_llm/tasks/posttrain/lora/moe/experts.py b/mindspeed_llm/tasks/posttrain/lora/moe/experts.py index 41eda113f..7d5993cd4 100644 --- a/mindspeed_llm/tasks/posttrain/lora/moe/experts.py +++ b/mindspeed_llm/tasks/posttrain/lora/moe/experts.py @@ -13,6 +13,7 @@ from 
megatron.core.tensor_parallel.layers import ( from megatron.core.tensor_parallel.utils import divide from megatron.core.transformer.moe.moe_utils import permute from megatron.core.transformer.moe.experts import GroupedMLP +from megatron.core.transformer.moe import grouped_gemm_util as gg from megatron.training import get_args from megatron.core.parallel_state import ( get_expert_model_parallel_group, @@ -422,14 +423,14 @@ class LoraParallelGroupedMLP(GroupedMLP): setattr(self.weight1_lora_b, 'allreduce', not self.expert_parallel) setattr(self.weight2_lora_a, 'allreduce', not self.expert_parallel) - if not get_args().moe_alltoall_overlap_comm: - raise AssertionError("Currently GMM LoRA Finetune only support moe_alltoall_overlap_comm") - if get_args().moe_hierarchical_alltoallv or get_args().moe_experts_pipeline_degree: raise AssertionError("Currently GMM LoRA Finetune not support moe_hierarchical_alltoallv") def forward(self, permuted_local_hidden_states, tokens_per_expert, ctx=None): + args = get_args() + if permuted_local_hidden_states.nelement() != 0: + # input is empty w1 = self.weight1.view(self.num_local_experts, self.config.hidden_size, -1) w2 = self.weight2.view(self.num_local_experts, -1, self.config.hidden_size) w1_a = self.weight1_lora_a.view(self.num_local_experts, -1, self.lora_r) @@ -440,26 +441,63 @@ class LoraParallelGroupedMLP(GroupedMLP): self.weight1.quant_state.shape = (self.num_local_experts, self.config.hidden_size, w1.shape[-1] * 2) self.weight2.quant_state.shape = (self.num_local_experts, w2.shape[1] * 2, self.config.hidden_size) else: + # input is not empty w1 = self.weight1.view(self.config.hidden_size, -1) w2 = self.weight2.view(-1, self.config.hidden_size) - w1_a = self.weight1_lora_a.view(-1, self.lora_r) + w1_a = self.weight1_lora_a.view(self.num_local_experts, -1, self.lora_r)[0] w1_b = self.weight1_lora_b.view(self.lora_r, -1) w2_a = self.weight2_lora_a.view(-1, self.lora_r) - w2_b = self.weight2_lora_b.view(self.lora_r, -1) + w2_b = self.weight2_lora_b.view(self.num_local_experts, self.lora_r, -1)[0] if hasattr(self.weight1, "quant_state"): self.weight1.quant_state.shape = (self.config.hidden_size, w1.shape[-1] * 2) self.weight2.quant_state.shape = (w2.shape[0] * 2, self.config.hidden_size) - w1, w2 = self.weight1, self.weight2 - group_list = torch.cumsum(tokens_per_expert, dim=0) - - return lora_parallel_grouped_mlp_with_comp_and_comm_overlap_all2all(permuted_local_hidden_states, - w1_a, w1_b, - w2_a, w2_b, - (w1, w2, - self.weight1_lora_a, - self.weight1_lora_b, - self.weight2_lora_a, - self.weight2_lora_b, - self.activation_func, - group_list, self.layer_number, - self.scaling), ctx=ctx) + if hasattr(self.weight1, "quant_state"): + w1, w2 = self.weight1, self.weight2 + + if args.moe_alltoall_overlap_comm: + # alltoall-overlap-comm + group_list = torch.cumsum(tokens_per_expert, dim=0) + return lora_parallel_grouped_mlp_with_comp_and_comm_overlap_all2all(permuted_local_hidden_states, + w1_a, w1_b, + w2_a, w2_b, + (w1, w2, + self.weight1_lora_a, + self.weight1_lora_b, + self.weight2_lora_a, + self.weight2_lora_b, + self.activation_func, + group_list, self.layer_number, + self.scaling), ctx=ctx) + else: + # origin gemm + if permuted_local_hidden_states.nelement() != 0: + # Reshape the weights for the grouped GEMMs. 
+ fc1_output = gg.ops.gmm(permuted_local_hidden_states, w1, tokens_per_expert, trans_b=False) + mm1_a = gg.ops.gmm(permuted_local_hidden_states, w1_a, tokens_per_expert, trans_b=False) + mm1_b = gg.ops.gmm(mm1_a, w1_b, tokens_per_expert, trans_b=False) * self.scaling + + intermediate_parallel = self.activation_func(fc1_output + mm1_b) + + mm2_a = gg.ops.gmm(intermediate_parallel, w2_a, tokens_per_expert, trans_b=False) + mm2_b = gg.ops.gmm(mm2_a, w2_b, tokens_per_expert, trans_b=False) * self.scaling + fc2_output = gg.ops.gmm(intermediate_parallel, w2, tokens_per_expert, trans_b=False) + + fc2_output += mm2_b + else: + # Make sure parameters still have gradients when no tokens are routed to this set of experts. + # No token is allocated for local experts. + assert torch.count_nonzero(tokens_per_expert) == 0 + + h = torch.matmul(permuted_local_hidden_states, w1) + mm1_a = torch.matmul(permuted_local_hidden_states, w1_a) + mm1_b = torch.matmul(mm1_a, w1_b) * self.scaling + + h = self.activation_func(h + mm1_b) + + mm2_a = torch.matmul(h, w2_a) + mm2_b = torch.matmul(mm2_a, w2_b) * self.scaling + h = torch.matmul(h, w2) + + fc2_output = h + mm2_b + + return fc2_output, None \ No newline at end of file diff --git a/tests/0day/qwen3/qwen3-30b-a3b/tune_qwen3_30b_a3b_lora_ptd.sh b/tests/0day/qwen3/qwen3-30b-a3b/tune_qwen3_30b_a3b_lora_ptd.sh index c46b24f28..bd1c7cba8 100644 --- a/tests/0day/qwen3/qwen3-30b-a3b/tune_qwen3_30b_a3b_lora_ptd.sh +++ b/tests/0day/qwen3/qwen3-30b-a3b/tune_qwen3_30b_a3b_lora_ptd.sh @@ -18,9 +18,9 @@ CKPT_SAVE_DIR="your model save ckpt path" DATA_PATH="your data path" TOKENIZER_PATH="your tokenizer path" -TP=4 -PP=1 -EP=1 +TP=1 +PP=4 +EP=2 SEQ_LENGTH=4096 TRAIN_ITERS=2000 @@ -37,6 +37,7 @@ MOE_ARGS=" --moe-router-topk 8 \ --moe-intermediate-size 768 \ --moe-grouped-gemm \ + --moe-alltoall-overlap-comm \ --moe-permutation-async-comm \ --moe-token-dispatcher-type alltoall \ --moe-router-load-balancing-type aux_loss \ -- Gitee From 284229ada80c47a776fb5db2412aa0e149b1b500 Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Thu, 29 May 2025 19:02:18 +0800 Subject: [PATCH 5/7] fix assert --- mindspeed_llm/tasks/posttrain/lora/moe/experts.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mindspeed_llm/tasks/posttrain/lora/moe/experts.py b/mindspeed_llm/tasks/posttrain/lora/moe/experts.py index 7d5993cd4..c072a8528 100644 --- a/mindspeed_llm/tasks/posttrain/lora/moe/experts.py +++ b/mindspeed_llm/tasks/posttrain/lora/moe/experts.py @@ -484,10 +484,6 @@ class LoraParallelGroupedMLP(GroupedMLP): fc2_output += mm2_b else: - # Make sure parameters still have gradients when no tokens are routed to this set of experts. - # No token is allocated for local experts. 
- assert torch.count_nonzero(tokens_per_expert) == 0 - h = torch.matmul(permuted_local_hidden_states, w1) mm1_a = torch.matmul(permuted_local_hidden_states, w1_a) mm1_b = torch.matmul(mm1_a, w1_b) * self.scaling -- Gitee From 27f351796ad3739bedddd90c3743a12da024522f Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Thu, 12 Jun 2025 22:30:39 +0800 Subject: [PATCH 6/7] fix bug --- examples/mcore/qwen2/pretrain_qwen2_7b_32k_ptd.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mcore/qwen2/pretrain_qwen2_7b_32k_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_7b_32k_ptd.sh index 7f9ff4a04..31311b1db 100644 --- a/examples/mcore/qwen2/pretrain_qwen2_7b_32k_ptd.sh +++ b/examples/mcore/qwen2/pretrain_qwen2_7b_32k_ptd.sh @@ -104,4 +104,4 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ --load ${CKPT_LOAD_DIR} \ --save ${CKPT_SAVE_DIR} \ --distributed-backend nccl \ - | tee logs/pretrain_mcore_qwen2_72b_32k.log + | tee logs/pretrain_mcore_qwen2_7b_32k.log -- Gitee From ee1b5a2de9742529e7b8b5504e35de4d1a77f339 Mon Sep 17 00:00:00 2001 From: jzh6229 Date: Fri, 13 Jun 2025 17:26:16 +0800 Subject: [PATCH 7/7] fix sh --- examples/legacy/llama3/tune_llama3_8b_ptd_full.sh | 2 +- .../deepseek_r1_distill_qwen/tune_distill_qwen_32b_full.sh | 2 +- examples/mcore/llama3/tune_llama3_8b_full_ptd.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/legacy/llama3/tune_llama3_8b_ptd_full.sh b/examples/legacy/llama3/tune_llama3_8b_ptd_full.sh index 034fa49fe..0d2d7c6da 100644 --- a/examples/legacy/llama3/tune_llama3_8b_ptd_full.sh +++ b/examples/legacy/llama3/tune_llama3_8b_ptd_full.sh @@ -98,4 +98,4 @@ torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \ $OUTPUT_ARGS \ --distributed-backend nccl \ --load ${CKPT_LOAD_DIR} \ - | tee logs/train_llama3_8b_full.log + | tee logs/tune_llama3_8b_full.log diff --git a/examples/mcore/deepseek_r1_distill_qwen/tune_distill_qwen_32b_full.sh b/examples/mcore/deepseek_r1_distill_qwen/tune_distill_qwen_32b_full.sh index 241ab0e52..9de296b43 100644 --- a/examples/mcore/deepseek_r1_distill_qwen/tune_distill_qwen_32b_full.sh +++ b/examples/mcore/deepseek_r1_distill_qwen/tune_distill_qwen_32b_full.sh @@ -117,4 +117,4 @@ torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \ --load ${CKPT_LOAD_DIR} \ --save ${CKPT_SAVE_DIR} \ --distributed-backend nccl \ - | tee logs/pretrain_mcore_distill_qwen_32b_full_8k.log + | tee logs/tune_mcore_distill_qwen_32b_full_8k.log diff --git a/examples/mcore/llama3/tune_llama3_8b_full_ptd.sh b/examples/mcore/llama3/tune_llama3_8b_full_ptd.sh index 645fd84dc..d2c1f8fd7 100644 --- a/examples/mcore/llama3/tune_llama3_8b_full_ptd.sh +++ b/examples/mcore/llama3/tune_llama3_8b_full_ptd.sh @@ -106,4 +106,4 @@ torchrun $DISTRIBUTED_ARGS posttrain_gpt.py \ $DATA_ARGS \ $OUTPUT_ARGS \ --distributed-backend nccl \ - | tee logs/tune_llama31_8b_full_ptd.log \ No newline at end of file + | tee logs/tune_llama3_8b_full_ptd.log \ No newline at end of file -- Gitee
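
Reference sketch (not part of any patch above): PATCH 4/7 decouples the LoRA grouped-MLP forward from `--moe-alltoall-overlap-comm` by computing, per expert, the base projection plus a scaled low-rank update before and after the activation. The snippet below is a minimal, framework-free restatement of that math for readers reviewing the change; it is an assumption-laden illustration, not the repository's implementation — it substitutes a plain SiLU for the model's actual `activation_func`, replaces the NPU grouped-GEMM kernel (`gg.ops.gmm`) with an explicit per-expert loop, and all tensor names and shapes are illustrative.

```python
import torch
import torch.nn.functional as F


def lora_grouped_mlp_reference(x, tokens_per_expert, w1, w1_a, w1_b, w2, w2_a, w2_b, scaling):
    # x:  [num_tokens, hidden], already permuted so tokens of the same expert are contiguous
    # w1: [E, hidden, ffn],  w1_a: [E, hidden, r], w1_b: [E, r, ffn]
    # w2: [E, ffn, hidden],  w2_a: [E, ffn, r],    w2_b: [E, r, hidden]
    outputs, start = [], 0
    for e, count in enumerate(tokens_per_expert.tolist()):
        xe = x[start:start + count]
        start += count
        # first projection: base weight plus scaled low-rank (LoRA) update
        h = xe @ w1[e] + scaling * (xe @ w1_a[e] @ w1_b[e])
        h = F.silu(h)  # stand-in for the model's activation_func
        # second projection: same base-plus-LoRA pattern
        out = h @ w2[e] + scaling * (h @ w2_a[e] @ w2_b[e])
        outputs.append(out)
    return torch.cat(outputs, dim=0)


if __name__ == "__main__":
    # toy shapes, illustrative only
    E, H, FF, R = 2, 8, 16, 4
    tokens_per_expert = torch.tensor([3, 5])
    x = torch.randn(8, H)
    w1, w2 = torch.randn(E, H, FF), torch.randn(E, FF, H)
    w1_a, w1_b = torch.randn(E, H, R), torch.zeros(E, R, FF)  # LoRA "B" factors start at zero
    w2_a, w2_b = torch.randn(E, FF, R), torch.zeros(E, R, H)
    y = lora_grouped_mlp_reference(x, tokens_per_expert, w1, w1_a, w1_b, w2, w2_a, w2_b, scaling=0.5)
    print(y.shape)  # torch.Size([8, 8])
```

A grouped GEMM fuses the per-expert loop into one kernel call; the arithmetic per expert is the same as above.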