diff --git a/tutorials/source_en/model_infer/introduction.md b/tutorials/source_en/model_infer/introduction.md index aab84a6f0e14d027451a4ebda61adda32f2d0c03..3d02777f1a6560c77e3d7d2e3e1440a632285770 100644 --- a/tutorials/source_en/model_infer/introduction.md +++ b/tutorials/source_en/model_infer/introduction.md @@ -28,7 +28,7 @@ The inference capability required by a model varies with scenarios. Based on com - **Inference with a framework**: The model network structure and model weight file are saved separately. You can build a network and separately load the model weight to build AI model inference. The advantage of this type of inference is that the inference weight does not need to be updated, regardless of whether the model is fine-tuned, optimized, or developed and debugged, this inference solution has obvious advantages when the weight of the current LLM reaches hundreds of GB. - - **Inference with a model file**: The model network and weight are packed into a file (ProtoBuf or FlatBed file). You only need to manage one file to execute the model. This inference solution is convenient for deployment management when there are a large number of models and the model size is not large. + - **Inference with a model file**: The model network and weight are packed into a file (ProtoBuf or FlatBuffer file). You only need to manage one file to execute the model. This inference solution is convenient for deployment management when there are a large number of models and the model size is not large. 
- **By inference backend** diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst index 036971db788f2f70937db46ab684ab27074aee4d..89b49353632646b5a740bcecc2f140cad00c86d1 100644 --- a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_infer.rst @@ -149,7 +149,7 @@ For the Qwen2 LLM, you are advised to use the pre-trained weight files and token .. code:: shell git lfs install - git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct + git clone https://huggingface.co/Qwen/Qwen2-7B After the download is complete, the following file tree structure should be displayed in the related directory: diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md index 127650eb4eda9eab2b0560cdb6f0eb61e556ed4e..c5757040bea62b824561935dbea5de628450203b 100644 --- a/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md +++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_parallel_infer.md @@ -644,31 +644,36 @@ class Qwen2Attention(nn.Cell): + self.paged_attn = PagedAttention(self.num_heads // self.tp_size, self.scaling, self.num_kv_heads // self.tp_size) self.reshape_and_cache = ops.auto_generate.ReshapeAndCache() - self.q_proj = Qwen2ColParallelLinear( +- self.q_proj = Qwen2Linear( ++ self.q_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.q_size, param_dtype=self.param_dtype bias=True ) - self.k_proj = Qwen2ColParallelLinear( +- self.k_proj = Qwen2Linear( ++ self.k_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) - self.v_proj = Qwen2ColParallelLinear( +- self.v_proj = Qwen2Linear( ++ self.v_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, 
param_dtype=self.param_dtype, bias=True ) - self.o_proj = Qwen2RowParallelLinear( +- self.o_proj = Qwen2Linear( ++ self.o_proj = Qwen2RowParallelLinear( input_size=self.q_size, output_size=self.hidden_size, param_dtype=self.param_dtype, bias=False ) + self.rotary_emb = Qwen2RotaryEmbedding( head_size=self.head_dim, rotary_dim=self.head_dim, @@ -723,24 +728,26 @@ class Qwen2Attention(nn.Cell): output = self.o_proj(attn_output).view(bs, seq_len, -1) return output - class Qwen2MLP(nn.Cell): def __init__(self, config: Qwen2Config) -> None: super().__init__() - self.up_proj = Qwen2ColParallelLinear( +- self.up_proj = Qwen2Linear( ++ self.up_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) - self.gate_proj = Qwen2ColParallelLinear( +- self.gate_proj = Qwen2Linear( ++ self.gate_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) - self.down_proj = Qwen2RowParallelLinear( +- self.down_proj = Qwen2Linear( ++ self.down_proj = Qwen2RowParallelLinear( input_size=config.intermediate_size, output_size=config.hidden_size, param_dtype=config.param_dtype, @@ -769,7 +776,8 @@ class Qwen2ForCausalLM(nn.Cell): super().__init__() self.model = Qwen2Model(config=config) - self.lm_head = Qwen2ColParallelLinear( +- self.lm_head = Qwen2Linear( ++ self.lm_head = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.vocab_size, param_dtype=config.param_dtype, diff --git a/tutorials/source_zh_cn/model_infer/introduction.md b/tutorials/source_zh_cn/model_infer/introduction.md index f383b2f91487d029105a7bd2789d62b6266ea756..a1c4e3b952f2f85ad9c7371164034d426ba46404 100644 --- a/tutorials/source_zh_cn/model_infer/introduction.md +++ b/tutorials/source_zh_cn/model_infer/introduction.md @@ -28,7 +28,7 @@ MindSpore框架为用户提供高效的模型推理能力,从AI功能上来看 - 
**带框架推理**:将模型网络结构和模型权重文件分开保存,通过构建网络并单独加载模型权重来构建AI模型推理,这种推理的好处是推理权重通常是不需要更新的,不管是对模型进行微调优化,还是开发调试,面向当前大语言模型上百G的权重时,该推理方案有明显的优势。 - - **模型文件推理**:将模型网络和权重打包到一个文件(protobuf或者flatbed格式),用户只需要管理一个文件即可以将模型执行起来,面向模型比较多而模型大小不是很大的场景,在部署上管理更加方便。 + - **模型文件推理**:将模型网络和权重打包到一个文件(protobuf或者flatbuffer格式),用户只需要管理一个文件即可以将模型执行起来,面向模型比较多而模型大小不是很大的场景,在部署上管理更加方便。 - **根据推理后端分类** diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst index b395c374bbc804f8e4e205ffe0f481f3302831a4..4b423724ce0f460d21c0e1aea6f4ee6b757da0e0 100644 --- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst +++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_infer.rst @@ -149,7 +149,7 @@ MindSpore大语言模型带框架推理主要依赖MindSpore开源软件,用 .. code:: shell git lfs install - git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct + git clone https://huggingface.co/Qwen/Qwen2-7B 下载完成后,相关目录下应该显示如下文件树结构: diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md index c6bd687e0c23093fddaf96492cb0414a438dc70b..b942851a18ada553ee3f822e133742ce8a6b0be8 100644 --- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md +++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_parallel_infer.md @@ -655,22 +655,22 @@ class Qwen2Attention(nn.Cell): param_dtype=self.param_dtype bias=True ) -- self.q_proj = Qwen2Linear( -+ self.q_proj = Qwen2ColParallelLinear( +- self.k_proj = Qwen2Linear( ++ self.k_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) -- self.q_proj = Qwen2Linear( -+ self.q_proj = Qwen2ColParallelLinear( +- self.v_proj = Qwen2Linear( ++ self.v_proj = Qwen2ColParallelLinear( input_size=self.hidden_size, output_size=self.kv_size, param_dtype=self.param_dtype, bias=True ) -- self.q_proj = Qwen2Linear( -+ 
self.q_proj = Qwen2RowParallelLinear( +- self.o_proj = Qwen2Linear( ++ self.o_proj = Qwen2RowParallelLinear( input_size=self.q_size, output_size=self.hidden_size, param_dtype=self.param_dtype, @@ -744,21 +744,21 @@ class Qwen2MLP(nn.Cell): def __init__(self, config: Qwen2Config) -> None: super().__init__() -- self.q_proj = Qwen2Linear( +- self.up_proj = Qwen2Linear( + self.up_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) -- self.q_proj = Qwen2Linear( +- self.gate_proj = Qwen2Linear( + self.gate_proj = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.intermediate_size, param_dtype=config.param_dtype, bias=False ) -- self.q_proj = Qwen2Linear( +- self.down_proj = Qwen2Linear( + self.down_proj = Qwen2RowParallelLinear( input_size=config.intermediate_size, output_size=config.hidden_size, @@ -790,7 +790,7 @@ class Qwen2ForCausalLM(nn.Cell): super().__init__() self.model = Qwen2Model(config=config) -- self.q_proj = Qwen2Linear( +- self.lm_head = Qwen2Linear( + self.lm_head = Qwen2ColParallelLinear( input_size=config.hidden_size, output_size=config.vocab_size,