From 23d4c5a3c5c85d8281c2ff9140bea4a37653f2ee Mon Sep 17 00:00:00 2001
From: hangangqiang
Date: Wed, 11 Jun 2025 16:37:36 +0800
Subject: [PATCH] update vllm-mindspore quantization docs

---
 .../quantization/quantization.md | 85 ++-----------------
 .../quantization/quantization.md | 84 ++----------------
 2 files changed, 14 insertions(+), 155 deletions(-)

diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md
index 9663e0f882..9fcf6f953f 100644
--- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md
+++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md
@@ -2,96 +2,25 @@

[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md)

-This document introduces model quantization and quantized inference methods. Quantization reduces inference resource with minor cost of precision, while improving inference performance to enable deployment on more devices. With the large scale of LLMs, post-training quantization has become the mainstream approach for model quantization. For details, refer to [Post-Training Quantization Introduction](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/README_CN.md).
+This document introduces model quantization and quantized inference methods. Quantization reduces the resources required for inference at a minor cost in precision, while improving inference performance, so that models can be deployed on more devices. Given the large scale of LLMs, post-training quantization has become the mainstream approach for model quantization. For details, refer to [Post-Training Quantization Introduction](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/README.md).

-In this document, the [Creating Quantized Models](#creating-quantized-models) section introduces post-training quantization steps using [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) as an example. A the [Quantized Model Inference](#quantized-model-inference) section explains how to perform inference with quantized models.
+In this document, the [Creating Quantized Models](#creating-quantized-models) section introduces the post-training quantization steps using [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) as an example, and the [Quantized Model Inference](#quantized-model-inference) section explains how to perform inference with quantized models.

## Creating Quantized Models

-We use the [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) network as an example to introduce A8W8 quantization with the SmoothQuant algorithm.
+We use the [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) network as an example to introduce W8A8 quantization with the OutlierSuppressionLite algorithm.

### Quantizing Networks with MindSpore Golden Stick

-We employ [MindSpore Golden Stick's PTQ algorithm](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/ptq/README_CN.md) for SmoothQuant quantization of Qwen3-8B. For detailed methods, refer to [Qwen3-SmoothQuant Quantization Example](todo).
-
-#### Downloading Qwen3-8B Weights
-
-Users can download the weights using huggingface-cli:
-
-```bash
-huggingface-cli download --resume-download Qwen/Qwen3-8B --local-dir Qwen3-8B-bf16
-```
-
-Alternatively, use [other download methods](../../../getting_started/quick_start/quick_start.md#download-model).
-
-#### Loading the Network with MindSpore Transformers
-
-Load the network using [MindSpore Transformers](https://gitee.com/mindspore/mindformers) with the following script:
-
-```python
-from mindformers import AutoModel
-from mindformers import AutoTokenizer
-
-network = AutoModel.from_pretrained("Qwen3-8B-bf16")
-tokenizer = AutoTokenizer.from_pretrained("Qwen3-8B-bf16")
-```
-
-#### Preparing the CEval Dataset
-
-Download the CEval dataset to the `ceval` directory with the following structure:
-
-```bash
-ceval
-  ├── dev
-  ├── test
-  └── val
-```
-
-Create a dataset handle using MindSpore:
-
-```python
-from mindspore import GeneratorDataset
-ds = GeneratorDataset(source="ceval", column_names=["subjects", "input_ids", "labels"])
-```
-
-#### Performing Post-Training Quantization with Golden Stick
-
-Use the following Python script for post-training quantization:
-
-```python
-from mindspore import dtype as msdtype
-from mindspore_gs.ptq import PTQ
-from mindspore_gs.common import BackendTarget
-from mindspore_gs.ptq import PTQConfig, PTQMode, OutliersSuppressionType, QuantGranularity, PrecisionRecovery
-
-cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8,
-                act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH,
-                opname_blacklist=['lm_head'])
-w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8,
-                      act_quant_dtype=msdtype.int8,
-                      outliers_suppression=OutliersSuppressionType.NONE,
-                      precision_recovery=PrecisionRecovery.NONE,
-                      act_quant_granularity=QuantGranularity.PER_TOKEN,
-                      weight_quant_granularity=QuantGranularity.PER_CHANNEL)
-layer_policies = OrderedDict({r'.*\.w2.*': w2_config})
-ptq = PTQ(config=cfg, layer_policies=layer_policies)
-from research.qwen3.qwen3_transformers import Qwen3ParallelTransformerLayer
-ptq.decoder_layer_types.append(Qwen3ParallelTransformerLayer)
-ptq.apply(network, ds)
-ptq.convert(network)
-ms.save_checkpoint(network.parameters_dict(), "Qwen3-8B-A8W8", format="safetensors",
-                   choice_func=lambda x: "key_cache" not in x and "value_cache" not in x and "float_weight" not in x)
-```
-
-Before calibration, add the MindSpore Transformers root directory to the `PYTHONPATH` environment variable, and check Qwen3-related classes have been successfully imported.
+We employ [MindSpore Golden Stick's PTQ algorithm](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/ptq/README.md) for OutlierSuppressionLite quantization of DeepSeek-R1. For detailed methods, refer to the [DeepSeekR1-OutlierSuppressionLite Quantization Example](https://gitee.com/mindspore/golden-stick/blob/master/example/deepseekv3/a8w8-osl/readme.md).

### Downloading Quantized Weights

-We have uploaded the quantized Qwen3-8B to [ModelArts Community](https://modelers.cn): [MindSpore-Lab/Qwen3-8B-A8W8](https://modelers.cn/models/MindSpore-Lab/Qwen3-8B-A8W8). Refer to the [ModelArts Community documentation](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally.
+We have uploaded the quantized DeepSeek-R1 to the [Modelers Community](https://modelers.cn): [MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8). Refer to the [Modelers Community documentation](https://modelers.cn/docs/en/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally.

## Quantized Model Inference

-After obtaining the Qwen3-8B SmoothQuant weights, ensure they are stored in the relative path `Qwen3-8B-A8W8`.
+After obtaining the DeepSeek-R1 W8A8 quantized weights, ensure they are stored in the relative path `DeepSeek-R1-W8A8`.

### Offline Inference

@@ -112,7 +41,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.0, top_p=0.95)

# Initialize LLM
-llm = LLM(model="Qwen3-8B-A8W8", quantization='SmoothQuant')
+llm = LLM(model="DeepSeek-R1-W8A8")
# Generate text
outputs = llm.generate(prompts, sampling_params)
# Print results
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md
index 1a88312048..5a1dcb2850 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md
+++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md
@@ -4,93 +4,23 @@

本文档将为用户介绍模型量化与量化推理的方法。量化方法通过牺牲部分模型精度的方式,达到降低模型部署时的资源需求的目的,并提升模型部署时的性能,从而允许模型被部署到更多的设备上。由于大语言模型的规模较大,出于成本考虑,训练后量化成为主流模型量化方案,具体可以参考[后量化技术简介](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/README_CN.md)。

-本文档中,[创建量化模型](#创建量化模型)章节,将以[Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)为例,介绍模型后量化的步骤;[量化模型推理](#量化模型推理)章节,介绍如何使用量化模型进行推理。
+本文档中,[创建量化模型](#创建量化模型)章节,将以[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)为例,介绍模型后量化的步骤;[量化模型推理](#量化模型推理)章节,介绍如何使用量化模型进行推理。

## 创建量化模型

-以[Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)网络为例,使用SmoothQuant算法对其进行A8W8量化。
+以[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)网络为例,使用OutlierSuppressionLite算法对其进行W8A8量化。

### 使用MindSpore金箍棒量化网络

-我们将使用[MindSpore 金箍棒的PTQ算法](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/ptq/README_CN.md)对Qwen3-8B网络进行SmoothQuant量化,详细方法参考[Qwen3-SmoothQuant量化样例](todo)
+我们将使用[MindSpore 金箍棒的PTQ算法](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/ptq/README_CN.md)对DeepSeek-R1网络进行OutlierSuppressionLite量化,详细方法参考[DeepSeekR1-OutlierSuppressionLite量化样例](https://gitee.com/mindspore/golden-stick/blob/master/example/deepseekv3/a8w8-osl/readme.md)。

-#### Qwen3-8B网络权重下载
+### 直接下载量化权重

-用户可使用huggingface-cli下载网络权重:
-
-```bash
-huggingface-cli download --resume-download Qwen/Qwen3-8B --local-dir Qwen3-8B-bf16
-```
-
-或可以使用[其他下载方式](../../../getting_started/quick_start/quick_start.md#下载模型),进行权重下载。
-
-#### 使用MindSpore Transformers加载网络
-
-用户可以使用如下的脚本,依赖[MindSpore Transformers](https://gitee.com/mindspore/mindformers),进行网络加载:
-
-```python
-from mindformers import AutoModel
-from mindformers import AutoTokenizer
-
-network = AutoModel.from_pretrained("Qwen3-8B-bf16")
-tokenizer = AutoTokenizer.from_pretrained("Qwen3-8B-bf16")
-```
-
-#### 准备CEval数据集
-
-将CEval数据集的下载到ceval目录下,目录结构如下:
-
-```bash
-ceval
-  ├── dev
-  ├── test
-  └── val
-```
-
-使用MindSpore创建数据集句柄:
-
-```python
-from mindspore import GeneratorDataset
-ds = GeneratorDataset(source="ceval", column_names=["subjects", "input_ids", "labels"])
-```
-
-#### 使用金箍棒进行后量化
-
-用户可使用如下Python脚本,进行模型后量化:
-
-```python
-from mindspore import dtype as msdtype
-from mindspore_gs.ptq import PTQ
-from mindspore_gs.common import BackendTarget
-from mindspore_gs.ptq import PTQConfig, PTQMode, OutliersSuppressionType, QuantGranularity, PrecisionRecovery
-cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8,
-                act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH,
-                opname_blacklist=['lm_head'])
-w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8,
-                      act_quant_dtype=msdtype.int8,
-                      outliers_suppression=OutliersSuppressionType.NONE,
-                      precision_recovery=PrecisionRecovery.NONE,
-                      act_quant_granularity=QuantGranularity.PER_TOKEN,
-                      weight_quant_granularity=QuantGranularity.PER_CHANNEL)
-layer_policies = OrderedDict({r'.*\.w2.*': w2_config})
-ptq = PTQ(config=cfg, layer_policies=layer_policies)
-from research.qwen3.qwen3_transformers import Qwen3ParallelTransformerLayer
-ptq.decoder_layer_types.append(Qwen3ParallelTransformerLayer)
-ptq.apply(network, ds)
-ptq.convert(network)
-ms.save_checkpoint(network.parameters_dict(), "Qwen3-8B-A8W8", format="safetensors",
-                   choice_func=lambda x: "key_cache" not in x and "value_cache" not in x and "float_weight" not in x)
-```
-
-执行校准前,需要将MindSpore Transformers工程根目录加到`PYTHONPATH`环境变量,从而用户可以成功import Qwen3网络相关类。
-
-### 下载量化权重
-
-我们已经将量化好的Qwen3-8B上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/Qwen3-8B-A8W8](https://modelers.cn/models/MindSpore-Lab/Qwen3-8B-A8W8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。
+我们已经将量化好的DeepSeek-R1上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。

## 量化模型推理

-在上一步中获取到Qwen3-8B SmoothQuant量化权重后,保证该权重存放相对路径为`Qwen3-8B-A8W8`。
+在上一步中获取到DeepSeek-R1 W8A8量化权重后,保证该权重存放相对路径为`DeepSeek-R1-W8A8`。

### 离线推理

@@ -111,7 +41,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.0, top_p=0.95)

# Create a LLM
-llm = LLM(model="Qwen3-8B-A8W8", quantization='SmoothQuant')
+llm = LLM(model="DeepSeek-R1-W8A8")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
--
Gitee
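
For reference, the offline-inference fragments shown in the hunks above assemble into a script along the following lines. This is a sketch rather than part of the patch: the example prompts are placeholders, and the `import vllm_mindspore` line reflects the usual vllm-mindspore convention of importing the plugin before `vllm`, which this patch does not show.

```python
# Minimal offline-inference sketch assembled from the fragments in the hunks above.
# Assumptions: the prompts are placeholders, and `import vllm_mindspore` is the
# conventional plugin import for vllm-mindspore deployments; neither appears in this patch.
import vllm_mindspore  # noqa: F401  (assumed plugin import; adjust to your deployment)
from vllm import LLM, SamplingParams

# Placeholder prompts; the patch elides the actual prompt list.
prompts = [
    "I am",
    "Today is",
]

# Sampling parameters taken verbatim from the patch.
sampling_params = SamplingParams(temperature=0.0, top_p=0.95)

# Load the quantized weights from the relative path used in the updated docs.
llm = LLM(model="DeepSeek-R1-W8A8")

# Generate completions and print prompt/completion pairs.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```

Run it from the directory that contains the downloaded `DeepSeek-R1-W8A8` weights, as both updated documents assume that relative path.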