From ac72a050e4f6ac095f1353e890cfec9d2e124419 Mon Sep 17 00:00:00 2001 From: liu lili Date: Thu, 28 Aug 2025 15:54:27 +0800 Subject: [PATCH 1/2] lll: solve some error --- .../source_zh_cn/model_infer/introduction.md | 2 +- .../ms_infer/ms_infer_model_infer.rst | 2 +- .../ms_infer/ms_infer_parallel_infer.md | 20 +++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tutorials/source_zh_cn/model_infer/introduction.md b/tutorials/source_zh_cn/model_infer/introduction.md index f383b2f914..a1c4e3b952 100644 --- a/tutorials/source_zh_cn/model_infer/introduction.md +++ b/tutorials/source_zh_cn/model_infer/introduction.md @@ -28,7 +28,7 @@ MindSpore框架为用户提供高效的模型推理能力,从AI功能上来看 - **带框架推理**:将模型网络结构和模型权重文件分开保存,通过构建网络并单独加载模型权重来构建AI模型推理,这种推理的好处是推理权重通常是不需要更新的,不管是对模型进行微调优化,还是开发调试,面向当前大语言模型上百G的权重时,该推理方案有明显的优势。 - - **模型文件推理**:将模型网络和权重打包到一个文件(protobuf或者flatbed格式),用户只需要管理一个文件即可以将模型执行起来,面向模型比较多而模型大小不是很大的场景,在部署上管理更加方便。 + - **模型文件推理**:将模型网络和权重打包到一个文件(protobuf或者flatbuffer格式),用户只需要管理一个文件即可以将模型执行起来,面向模型比较多而模型大小不是很大的场景,在部署上管理更加方便。 - **根据推理后端分类** diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst index b395c374bb..4b423724ce 100644 --- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst @@ -149,7 +149,7 @@ MindSpore大语言模型带框架推理主要依赖MindSpore开源软件,用 .. 
code:: shell git lfs install - git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct + git clone https://huggingface.co/Qwen/Qwen2-7B 下载完成后,相关目录下应该显示如下文件树结构: diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md index c6bd687e0c..b942851a18 100644 --- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md +++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md @@ -655,22 +655,22 @@ class Qwen2Attention(nn.Cell): param_dtype=self.param_dtype bias=True ) -- self.q_proj = Qwen2Linear( -+ self.q_proj = Qwen2ColParallelLinear( +- self.k_proj = Qwen2Linear( ++ self.k_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) -- self.q_proj = Qwen2Linear( -+ self.q_proj = Qwen2ColParallelLinear( +- self.v_proj = Qwen2Linear( ++ self.v_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) -- self.q_proj = Qwen2Linear( -+ self.q_proj = Qwen2RowParallelLinear( +- self.o_proj = Qwen2Linear( ++ self.o_proj = Qwen2RowParallelLinear( input_size=self.q_size, output_size=self.hidden_size, param_dtype=self.param_dtype, @@ -744,21 +744,21 @@ class Qwen2MLP(nn.Cell): def __init__(self, config: Qwen2Config) -> None: super().__init__() -- self.q_proj = Qwen2Linear( +- self.up_proj = Qwen2Linear( + self.up_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) -- self.q_proj = Qwen2Linear( +- self.qgate_proj = Qwen2Linear( + self.gate_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) -- self.q_proj = Qwen2Linear( +- self.down_proj = Qwen2Linear( + self.down_proj = Qwen2RowParallelLinear( input_size=config.intermediate_size, 
output_size=config.hidden_size, @@ -790,7 +790,7 @@ class Qwen2ForCausalLM(nn.Cell): super().__init__() self.model = Qwen2Model(config=config) -- self.q_proj = Qwen2Linear( +- self.lm_head = Qwen2Linear( + self.lm_head = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.vocab_size, -- Gitee From 39c9c077f1f976e5ac9b82b453891be58200aaf8 Mon Sep 17 00:00:00 2001 From: liu lili Date: Thu, 28 Aug 2025 16:06:06 +0800 Subject: [PATCH 2/2] lll: solve some error --- .../source_en/model_infer/introduction.md | 2 +- .../ms_infer/ms_infer_model_infer.rst | 2 +- .../ms_infer/ms_infer_parallel_infer.md | 26 ++++++++++++------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tutorials/source_en/model_infer/introduction.md b/tutorials/source_en/model_infer/introduction.md index aab84a6f0e..3d02777f1a 100644 --- a/tutorials/source_en/model_infer/introduction.md +++ b/tutorials/source_en/model_infer/introduction.md @@ -28,7 +28,7 @@ The inference capability required by a model varies with scenarios. Based on com - **Inference with a framework**: The model network structure and model weight file are saved separately. You can build a network and separately load the model weight to build AI model inference. The advantage of this type of inference is that the inference weight does not need to be updated, regardless of whether the model is fine-tuned, optimized, or developed and debugged, this inference solution has obvious advantages when the weight of the current LLM reaches hundreds of GB. - - **Inference with a model file**: The model network and weight are packed into a file (ProtoBuf or FlatBed file). You only need to manage one file to execute the model. This inference solution is convenient for deployment management when there are a large number of models and the model size is not large. + - **Inference with a model file**: The model network and weight are packed into a file (ProtoBuf or FlatBuffer file). 
You only need to manage one file to execute the model. This inference solution is convenient for deployment management when there are a large number of models and the model size is not large. - **By inference backend** diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst index 036971db78..89b4935363 100644 --- a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst @@ -149,7 +149,7 @@ For the Qwen2 LLM, you are advised to use the pre-trained weight files and token .. code:: shell git lfs install - git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct + git clone https://huggingface.co/Qwen/Qwen2-7B After the download is complete, the following file tree structure should be displayed in the related directory: diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md index 127650eb4e..c5757040be 100644 --- a/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md @@ -644,31 +644,36 @@ class Qwen2Attention(nn.Cell): + self.paged_attn = PagedAttention(self.num_heads // self.tp_size, self.scaling, self.num_kv_heads // self.tp_size) self.reshape_and_cache = ops.auto_generate.ReshapeAndCache() - self.q_proj = Qwen2ColParallelLinear( +- self.q_proj = Qwen2Linear( ++ self.q_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.q_size, param_dtype=self.param_dtype bias=True ) - self.k_proj = Qwen2ColParallelLinear( +- self.k_proj = Qwen2Linear( ++ self.k_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) - self.v_proj = Qwen2ColParallelLinear( +- self.v_proj = Qwen2Linear( ++ self.v_proj = Qwen2ColParallelLinear( 
input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) - self.o_proj = Qwen2RowParallelLinear( +- self.o_proj = Qwen2Linear( ++ self.o_proj = Qwen2RowParallelLinear( input_size=self.q_size, output_size=self.hidden_size, param_dtype=self.param_dtype, bias=False ) + self.rotary_emb = Qwen2RotaryEmbedding( head_size=self.head_dim, rotary_dim=self.head_dim, @@ -723,24 +728,26 @@ class Qwen2Attention(nn.Cell): output = self.o_proj(attn_output).view(bs, seq_len, -1) return output - class Qwen2MLP(nn.Cell): def __init__(self, config: Qwen2Config) -> None: super().__init__() - self.up_proj = Qwen2ColParallelLinear( +- self.up_proj = Qwen2Linear( ++ self.up_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) - self.gate_proj = Qwen2ColParallelLinear( +- self.gate_proj = Qwen2Linear( ++ self.gate_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) - self.down_proj = Qwen2RowParallelLinear( +- self.down_proj = Qwen2Linear( ++ self.down_proj = Qwen2RowParallelLinear( input_size=config.intermediate_size, output_size=config.hidden_size, param_dtype=config.param_dtype, @@ -769,7 +776,8 @@ class Qwen2ForCausalLM(nn.Cell): super().__init__() self.model = Qwen2Model(config=config) - self.lm_head = Qwen2ColParallelLinear( +- self.lm_head = Qwen2Linear( ++ self.lm_head = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.vocab_size, param_dtype=config.param_dtype, -- Gitee