From b0a5760a9df91d079d342cf3cffa7225a95b9779 Mon Sep 17 00:00:00 2001 From: HighCloud Date: Fri, 4 Jul 2025 15:04:51 +0800 Subject: [PATCH 1/8] support native qwq --- vllm_mindspore/__init__.py | 15 +++ .../distributed/communication_op.py | 8 ++ vllm_mindspore/distributed/parallel_state.py | 93 ++++++++++++++++++ .../model_executor/layers/linear.py | 24 +++-- .../layers/vocab_parallel_embedding.py | 3 +- .../model_loader/weight_utils.py | 8 +- .../model_executor/models/model_base.py | 61 +++++++++--- vllm_mindspore/model_executor/models/qwen2.py | 29 +++++- vllm_mindspore/utils.py | 97 ++++++++++++++----- vllm_mindspore/v1/worker/gpu_model_runner.py | 20 +++- vllm_mindspore/worker/cache_engine.py | 19 +++- vllm_mindspore/worker/model_runner.py | 5 +- 12 files changed, 324 insertions(+), 58 deletions(-) create mode 100644 vllm_mindspore/distributed/parallel_state.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 0f39d7d0..489b899e 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -271,6 +271,16 @@ RejectionSampler._smallest_positive_value.__set_name__( RejectionSampler, '_smallest_positive_value') vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial +import vllm.distributed.communication_op +import vllm.worker.worker_base +from vllm_mindspore.distributed.communication_op import cpu_broadcast_tensor_dict +vllm.distributed.communication_op.broadcast_tensor_dict = cpu_broadcast_tensor_dict +vllm.worker.worker_base.broadcast_tensor_dict = cpu_broadcast_tensor_dict + +import vllm.distributed.parallel_state +from vllm_mindspore.distributed.parallel_state import gc_broadcast_tensor_dict +vllm.distributed.parallel_state.GroupCoordinator.broadcast_tensor_dict = gc_broadcast_tensor_dict + ######### for multi-model from vllm_mindspore.inputs.registry import call_hf_processor from vllm.inputs.registry import InputProcessingContext @@ -344,6 +354,11 @@ vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable import vllm.v1.worker.gpu_input_batch from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor +# TODO: need this? 
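# Editor's sketch: the assignments in this __init__.py hunk all follow one import-time
# monkey-patch pattern -- import the vLLM module, import the vllm-mindspore replacement,
# and rebind the module attribute so that later lookups resolve to the replacement.
# Note the hunk also rebinds vllm.worker.worker_base.broadcast_tensor_dict separately,
# because a "from x import f" style import copies the reference and is not affected by
# patching only the defining module.  Below is a minimal runnable sketch of that pattern
# with hypothetical names (demo_mod, slow_fn, fast_fn); it is an illustration only, not
# code taken from this patch.
import types

demo_mod = types.ModuleType("demo_mod")   # stand-in for a vllm submodule
demo_mod.slow_fn = lambda x: x + 1        # original implementation

def fast_fn(x):
    # drop-in replacement; keeps the original signature
    return x + 1

demo_mod.slow_fn = fast_fn                # rebind before callers cache a reference
assert demo_mod.slow_fn(1) == 2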
+# from vllm_mindspore.model_executor.model_loader.loader import _process_weights_after_loading + +# vllm.model_executor.model_loader.loader._process_weights_after_loading = _process_weights_after_loading + vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index a24d4959..31e8d892 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -19,6 +19,7 @@ # 不要去照搬mindspeed的, 因为训练当中包含太多的特性, 推理只需要非常简单的通信,可以提升性能。 from typing import Any, Dict, Optional, Union +import torch from mindspore import Tensor, nn, ops from mindspore.communication.comm_func import all_reduce, broadcast @@ -48,6 +49,13 @@ def broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[Tensor, # return tensor_dict # return get_tp_group().broadcast_tensor_dict(tensor_dict, src) +def cpu_broadcast_tensor_dict(tensor_dict: Optional[Dict[Any, Union[torch.Tensor, + Any]]] = None, + src: int = 0): + if not torch.distributed.is_initialized(): + return tensor_dict + return get_tp_group().broadcast_tensor_dict(tensor_dict, src, group=get_tp_group().cpu_group) + class ReduceFromModelParallelRegion(nn.Cell): "All reduce the input from the model parallel region." diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py new file mode 100644 index 00000000..697196fa --- /dev/null +++ b/vllm_mindspore/distributed/parallel_state.py @@ -0,0 +1,93 @@ +import torch +import torch.distributed +from torch.distributed import ProcessGroup + +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union) +from vllm.distributed.parallel_state import _split_tensor_dict, TensorMetadata +from vllm_mindspore.utils import atlas_inference + +def gc_broadcast_tensor_dict( + self, + tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, + metadata_group: Optional[ProcessGroup] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Broadcast the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if (not torch.distributed.is_initialized() or self.world_size == 1): + return tensor_dict + + if not atlas_inference(): + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + + rank_in_group = self.rank_in_group + if rank_in_group == src: + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.broadcast_object(metadata_list, src=src) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + metadata_list = self.broadcast_object(None, src=src) + tensor_dict = {} + async_handles = [] + for key, value in metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + tensor_dict[key] = tensor + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + tensor_dict[key] = tensor + else: + tensor_dict[key] = value + for async_handle in async_handles: + async_handle.wait() + return tensor_dict diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index e0851149..53ebc22a 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -388,9 +388,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear): if not use_bitsandbytes_4bit: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape - # param_data.copy_(loaded_weight) - # param_data.set_data(loaded_weight) - param[shard_offset: shard_offset + shard_size, :] = loaded_weight + if len(loaded_weight.shape) == 2: + param[shard_offset: shard_offset + shard_size, :] = loaded_weight + else: + param[shard_offset: shard_offset + shard_size] = loaded_weight + else: + assert param.shape == loaded_weight.shape + if loaded_weight.dtype == ms.float32 and param.dtype == ms.float16: + loaded_weight = loaded_weight.astype(ms.float16) + param.set_data(loaded_weight.contiguous()) class QKVParallelLinear(ColumnParallelLinear): @@ -474,10 +480,10 @@ class QKVParallelLinear(ColumnParallelLinear): if not use_bitsandbytes_4bit: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape - if param.name.endswith("weight"): - self.weight[shard_offset: shard_offset + shard_size, :] = loaded_weight - if param.name.endswith("bias"): - self.bias[shard_offset: shard_offset + shard_size] = loaded_weight + if len(loaded_weight.shape) == 2: + param[shard_offset: shard_offset + shard_size, :] = loaded_weight + else: + param[shard_offset: shard_offset + shard_size] = loaded_weight # tp_rank = get_tensor_model_parallel_rank() # if shard_id is "q": # start_index = self.num_heads * tp_rank * self.head_size @@ -586,6 +592,7 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() + param_data = param.data input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) is_sharded_weight = getattr(param, "is_sharded_weight", False) @@ -606,6 +613,5 @@ class RowParallelLinear(LinearBase): loaded_weight = loaded_weight.reshape(1) assert param.shape == 
loaded_weight.shape - # param_data.copy_(loaded_weight) + param_data.copy_(loaded_weight) # self.weight[:, start_idx : start_idx + shard_size] = loaded_weight - param.set_data(loaded_weight.contiguous()) diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index 768a8238..3af4878d 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -18,7 +18,7 @@ from dataclasses import dataclass from typing import List, Optional, Sequence, Tuple -from mindspore import Parameter, Tensor, mint, nn, ops +from mindspore import Parameter, Tensor, mint, nn, ops, jit from mindspore.common.dtype import typing from vllm.config import get_current_vllm_config from vllm.distributed import (divide, get_tensor_model_parallel_rank, @@ -56,6 +56,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): self.gather = ops.Gather() self.bias_add = ops.Add() + # @jit def apply(self, layer: nn.Cell, x: Tensor, diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 0fc4d3d2..c8edd319 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -23,6 +23,8 @@ import torch import mindspore as ms from mindspore import Parameter, Tensor +from vllm_mindspore.utils import atlas_inference +import numpy as np def safetensors_weights_iterator( @@ -41,8 +43,10 @@ def safetensors_weights_iterator( ): with safe_open(st_file, framework="np") as f: for name in f.keys(): - param = f.get_tensor(name) - yield name, ms.tensor(param) + x = f.get_tensor(name) + x = x.astype(np.float16) \ + if (str(x.dtype) == 'bfloat16' and atlas_inference()) else x + yield name, ms.tensor(x) def default_weight_loader(param: Parameter, diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 8b1b02a2..ada08c1d 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -31,13 +31,13 @@ from vllm.sequence import IntermediateTensors import vllm.envs as envs import mindspore as ms -from mindspore import Tensor, nn, mutable +from mindspore import Tensor, nn, mutable, ops from mindspore.common import dtype as mstype from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata - +from vllm_mindspore.utils import atlas_inference class AttentionWrapper: @@ -48,11 +48,32 @@ class AttentionWrapper: vllm_config.parallel_config) head_size = vllm_config.model_config.get_head_size() num_block = 0 - self.kv_shape = [num_block, block_size, num_kv_heads, head_size] - self.kv_cache = [( - ms.mint.zeros(self.kv_shape, dtype=vllm_config.model_config.dtype), - ms.mint.zeros(self.kv_shape, dtype=vllm_config.model_config.dtype), - ) for _ in range(vllm_config.parallel_config.pipeline_parallel_size)] + if atlas_inference(): + self.kv_shape = [num_block, block_size, num_kv_heads * head_size] + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + 29, + ), + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + 29, + ), + ) + 
for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + else: + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [( + ms.mint.zeros(self.kv_shape, dtype=vllm_config.model_config.dtype), + ms.mint.zeros(self.kv_shape, dtype=vllm_config.model_config.dtype), + ) for _ in range(vllm_config.parallel_config.pipeline_parallel_size)] + self.attn_type = AttentionType.DECODER # add for v1 @@ -69,11 +90,24 @@ class MLAAttentionWrapper(AttentionWrapper): def __init__(self): super().__init__() vllm_config = get_current_vllm_config() - self.kv_cache = [ - (ms.mint.zeros(self.kv_shape, - dtype=vllm_config.model_config.dtype), ) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] + if atlas_inference(): + self.kv_cache = [ + ( + ops.auto_generate.format_cast( + ms.mint.zeros( + self.kv_shape, dtype=vllm_config.model_config.dtype + ), + 29, + ), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + else: + self.kv_cache = [ + (ms.mint.zeros(self.kv_shape, + dtype=vllm_config.model_config.dtype), ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] class MsModelBase: @@ -396,7 +430,8 @@ class NativeModel(MsModelBase): block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads, head_size) + kv_cache_shape = (None, block_size, num_kv_heads * head_size) if atlas_inference() \ + else (None, block_size, num_kv_heads, head_size) kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 87c54c21..3b62385f 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -24,7 +24,8 @@ if TYPE_CHECKING: else: Qwen2Config = None -from mindspore import Parameter, Tensor, mint, nn +from mindspore import Parameter, Tensor, mint, nn, ops +import mindspore as ms from vllm.attention.backends.abstract import AttentionType from vllm.config import CacheConfig, VllmConfig @@ -33,6 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.sequence import IntermediateTensors +from vllm_mindspore.utils import atlas_inference from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SwiGLU from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -397,9 +399,34 @@ class Qwen2Model(nn.Cell): param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) + # Norm type in weights may be f32 + if(loaded_weight.dtype != param.dtype): + loaded_weight = loaded_weight.to(dtype=param.dtype) weight_loader(param, loaded_weight) loaded_params.add(name) + def adjust_weight(params_dict): + if not atlas_inference(): + return + + target_keywords = [ + "qkv_proj.weight", + "o_proj.weight", + "gate_up_proj.weight", + "down_proj.weight", + # "lm_head.weight", + ] + + for name, param in params_dict.items(): + if any(name.endswith(keyword) for keyword in target_keywords): + cast_weight = ops.auto_generate.format_cast(param, 29) + ms.runtime.synchronize() + param.set_data(cast_weight) + + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() + return 
loaded_params diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index e4ab9fca..759bd3d4 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -179,29 +179,6 @@ def is_mindone_model_backend(): == vllmModelBackendEnum.MIND_ONE) -def check_ready(): - from mindspore import set_context - - # Common environment variables of predict. - set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) - default_env = { - "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": - "FlashAttentionScore,PagedAttention", - } - env_setup(default_env) - - if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): - set_context( - mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") - - if is_mindformers_model_backend(): - logger.info("Run with Mindformers backend!") - elif is_mindone_model_backend(): - logger.info("Run with MindONE backend!") - else: - logger.info("Run with native model backend!") - - def convert_np_to_ms_dtype(value): """convert_np_to_ms_dtype""" if value.dtype == np.int8: @@ -303,3 +280,77 @@ def ms_memory_profiling( result.non_torch_increase = diff_from_create.non_torch_memory result.profile_time = diff_profile.timestamp result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa + + +def is_version_ge(current_version, base_version): + """ + return current_version >= base_version. + Check whether the current version is higher than or equal to the base version. + for current_version: 1.8.1, base_version: 1.11.0, it return False. + """ + version_split_char = '.' + if version_split_char not in base_version or version_split_char not in current_version: + raise ValueError("The version string will contain the `.`." + "For example, current_version 1.8.1, base_version: 1.11.0.") + for x, y in zip(current_version.split(version_split_char), base_version.split(version_split_char)): + if not x.isdigit() or not y.isdigit(): + continue + if int(x) != int(y): + return int(x) >= int(y) + return True + +def get_ascend_soc_version(): + """Get ascend soc version.""" + if is_version_ge(ms.__version__, "2.2.0"): + from mindspore._c_expression import MSContext + return MSContext.get_instance().get_ascend_soc_version() + ascend_chip_type = os.getenv("ASCEND_CHIP_TYPE", "UNSET") + if ascend_chip_type not in ["910a", "910b", "UNSET"]: + raise EnvironmentError(f"ASCEND_CHIP_TYPE should be in ['910a', '910b'],but get {ascend_chip_type}") + if ascend_chip_type == "UNSET": + logger.info("Environment variables need to be set manually to obtain the chip type," + "which can be set as follows: \n" + "For Atlas 800, run 'export ASCEND_CHIP_TYPE=910a' before the program runs.\n" + "For Atlas 800T A2, run 'export ASCEND_CHIP_TYPE=910b' before the program runs.\n" + "If you need to get chip information automatically, MindSpore 2.2 and above is recommended") + return ascend_chip_type + +def atlas_inference(): + device = get_ascend_soc_version() + return device in ['310p', 'ascend310p'] + +def check_ready(): + import vllm.envs as envs + from mindspore import set_context + + # Common environment variables of predict. 
+ set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + custom_kernels = "FlashAttentionScore,PagedAttention" + if atlas_inference(): + set_context(graph_kernel_flags="--disable_pass=add_rms_norm_fusion") + custom_kernels = "InferenceMatmulSplit," + custom_kernels + ",AddRmsNorm" + + default_env = { + "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": custom_kernels + } + env_setup(default_env) + + if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): + set_context( + mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") + + if is_mindformers_model_backend(): + logger.info("Run with Mindformers backend!") + necessary_envs = ("MINDFORMERS_MODEL_CONFIG", ) + lost_envs = [ + env_item for env_item in necessary_envs if not os.getenv(env_item) + ] + + if lost_envs: + raise RuntimeError( + f'For "MindFormers" model backend, environments {str(lost_envs)} should be set!' + ) + elif is_mindone_model_backend(): + logger.info("Run with MindONE backend!") + else: + logger.info("Run with native model backend!") diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 75fbb36d..3e702705 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -22,9 +22,9 @@ import numpy as np import torch import mindspore as ms -from mindspore import mutable +from mindspore import mutable, ops from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata -from vllm_mindspore.utils import get_valid_dtype, get_dtype_size +from vllm_mindspore.utils import get_valid_dtype, get_dtype_size, atlas_inference from vllm_mindspore.model_executor.layers.rotary_embedding import InferMRotaryEmbedding as MRotaryEmbedding # type: ignore[attr-defined] from vllm.v1.outputs import ModelRunnerOutput @@ -202,8 +202,17 @@ def _prepare_inputs( def create_block(shape, dtype, name=None, device=None): - from mindspore import mint - blocks = mint.empty(shape, dtype=dtype, device=device) + from mindspore.mint import empty as empty_tensor + from mindspore.common.api import _pynative_executor + blocks = empty_tensor(*shape, dtype=dtype, device=device) + if device == "Ascend" and atlas_inference(): + blocks_nz = ops.auto_generate.format_cast(blocks, 29) + _pynative_executor.sync() + import gc + del blocks + gc.collect() + ms.hal.empty_cache() + return blocks_nz return blocks @@ -285,6 +294,9 @@ def _reshape_kv_cache_tensors( kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) try: kv_cache_stride_order = self.attn_backends[ i].get_kv_cache_stride_order() diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 2df44ee5..8190e03b 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -18,16 +18,26 @@ """CacheEngine class for managing the KV cache.""" import mindspore as ms -from mindspore import mutable, mint +from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype +from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): - blocks = mint.empty(shape, dtype=dtype, device=device) + from mindspore.ops.function.array_func import empty as 
empty_tensor + from mindspore.common.api import _pynative_executor + blocks = empty_tensor(*shape, dtype=dtype, device=device) + if device == "Ascend" and atlas_inference(): + blocks_nz = ops.auto_generate.format_cast(blocks, 29) + _pynative_executor.sync() + import gc + del blocks + gc.collect() + ms.hal.empty_cache() + return blocks_nz return blocks @@ -39,6 +49,9 @@ def ms_allocate_kv_cache( """Allocates KV cache on the specified device.""" kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) + if atlas_inference(): + *dims, second_last, last = kv_cache_shape + kv_cache_shape = (*dims, second_last * last) kv_cache: List[MsKVCache] = [] self.dtype = get_valid_dtype(self.dtype) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 55bb26ec..706a2058 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -24,7 +24,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata -from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE +from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE, atlas_inference from mindspore import mutable @@ -137,7 +137,8 @@ def _dummy_run(self, block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads, head_size] + kv_shape = [0, block_size, num_kv_heads * head_size] if atlas_inference() else \ + [0, block_size, num_kv_heads, head_size] kv_caches = mutable([ mutable(( mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), -- Gitee From 838c5bb8bca33fcb58946997c511fbb94aec03be Mon Sep 17 00:00:00 2001 From: HighCloud Date: Sat, 12 Jul 2025 14:43:14 +0800 Subject: [PATCH 2/8] support ds weight process --- .../models/mf_models/deepseek_v3.py | 2 +- .../mf_models/deepseekv3_weight_processor.py | 265 +++++++++++++++++- .../models/mf_models/weight_processor.py | 22 +- 3 files changed, 286 insertions(+), 3 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 1e2df73a..110585a8 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -227,7 +227,7 @@ class DeepseekV3ForCausalLM(MfModelBase): def create_ptq(self, quant_type: str, quant_mode: PTQMode): """create_ptq""" - if quant_type.lower() == 'ptq': + if quant_type.lower() in ['ptq', 'ptq-duo']: cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index c63abe69..d9c5d213 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1661,6 +1661,266 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return w1_weight_param, w1_scale_param, w3_weight_param, w3_scale_param, w2_weight_param + def dynamic_quant_process_qkv_weight(self, src_hf_dir, layer_id, 
hf_weight_map, parameter_dict): + '''dynamic_quant_process_qkv_weight''' + qkv_concat = self.config.model.model_config.qkv_concat + # q2l_proj + q2l_weight_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.weight" + q2l_weight_param, _ = self.get_safetensor_from_file(q2l_weight_name, src_hf_dir, hf_weight_map) + q2l_bias_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.quant_bias" + q2l_bias_param, _ = self.get_safetensor_from_file(q2l_bias_name, src_hf_dir, hf_weight_map) + q2l_scale_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.dequant_scale" + q2l_scale_param, _ = self.get_safetensor_from_file(q2l_scale_name, src_hf_dir, hf_weight_map) + + q2l_quant_zp = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_zp" + q2l_quant_scale = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_scale" + q2l_quant_beta= f"model.layers.{layer_id}.attention.q2l_proj.quant_op.beta" + q2l_quant_zp_param, _ = self.get_safetensor_from_file(q2l_quant_zp, src_hf_dir, hf_weight_map) + q2l_quant_scale_param, _ = self.get_safetensor_from_file(q2l_quant_scale, src_hf_dir, hf_weight_map) + q2l_quant_beta_param, _ = self.get_safetensor_from_file(q2l_quant_beta, src_hf_dir, hf_weight_map) + + kv2l_weight_name = f"model.layers.{layer_id}.attention.kv2l._layer.weight" + kv2l_weight_param, _ = self.get_safetensor_from_file(kv2l_weight_name, src_hf_dir, hf_weight_map) + kv2l_bias_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.quant_bias" + kv2l_bias_param, _ = self.get_safetensor_from_file(kv2l_bias_name, src_hf_dir, hf_weight_map) + kv2l_scale_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.dequant_scale" + kv2l_scale_param, _ = self.get_safetensor_from_file(kv2l_scale_name, src_hf_dir, hf_weight_map) + + kv2l_quant_zp = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_zp" + kv2l_quant_scale = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_scale" + kv2l_quant_beta = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" + kv2l_quant_zp_param, _ = self.get_safetensor_from_file(kv2l_quant_zp, src_hf_dir, hf_weight_map) + kv2l_quant_scale_param, _ = self.get_safetensor_from_file(kv2l_quant_scale, src_hf_dir, hf_weight_map) + kv2l_quant_beta_param, _ = self.get_safetensor_from_file(kv2l_quant_beta, src_hf_dir, hf_weight_map) + + if qkv_concat: + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + qkv2l_quant_beta_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.beta" + + qkv2l_weight = np.concatenate((q2l_weight_param, kv2l_weight_param), 0) + parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.Tensor(qkv2l_weight, ms.int8), name=qkv2l_weight_name, + requires_grad=False) + qkv2l_bias = np.concatenate((q2l_bias_param, kv2l_bias_param), 0) + parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.Tensor(qkv2l_bias, ms.int32), name=qkv2l_bias_name, + requires_grad=False) + qkv2l_scale = np.concatenate((q2l_scale_param, kv2l_scale_param), 0) + parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.Tensor(qkv2l_scale, ms.int64), name=qkv2l_scale_name, + requires_grad=False) + parameter_dict[qkv2l_quant_zp_name] = 
ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), + name=qkv2l_quant_zp_name, requires_grad=False) + parameter_dict[qkv2l_quant_scale_name] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.float16), + name=qkv2l_quant_scale_name, requires_grad=False) + parameter_dict[qkv2l_quant_beta_name] = ms.Parameter(ms.Tensor(q2l_quant_beta_param, ms.float16), + name=qkv2l_quant_beta_name, requires_grad=False) + else: + parameter_dict[q2l_weight_name] = ms.Parameter(ms.Tensor(q2l_weight_param, ms.int8), name=q2l_weight_name, + requires_grad=False) + parameter_dict[kv2l_weight_name] = ms.Parameter(ms.Tensor(kv2l_weight_param, ms.int8), + name=kv2l_weight_name, requires_grad=False) + parameter_dict[q2l_bias_name] = ms.Parameter(ms.Tensor(q2l_bias_param, ms.int32), name=q2l_bias_name, + requires_grad=False) + parameter_dict[kv2l_bias_name] = ms.Parameter(ms.Tensor(kv2l_bias_param, ms.int32), name=kv2l_bias_name, + requires_grad=False) + parameter_dict[q2l_scale_name] = ms.Parameter(ms.Tensor(q2l_scale_param, ms.int64), name=q2l_scale_name, + requires_grad=False) + parameter_dict[kv2l_scale_name] = ms.Parameter(ms.Tensor(kv2l_scale_param, ms.int64), + name=kv2l_scale_name, requires_grad=False) + parameter_dict[q2l_quant_zp] = ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), name=q2l_quant_zp, + requires_grad=False) + parameter_dict[kv2l_quant_zp] = ms.Parameter(ms.Tensor(kv2l_quant_zp_param, ms.int8), name=kv2l_quant_zp, + requires_grad=False) + parameter_dict[q2l_quant_scale] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.float16), + name=q2l_quant_scale, requires_grad=False) + parameter_dict[q2l_quant_beta] = ms.Parameter(ms.Tensor(q2l_quant_beta_param, ms.float16), + name=q2l_quant_beta, requires_grad=False) + parameter_dict[kv2l_quant_scale] = ms.Parameter(ms.Tensor(kv2l_quant_scale_param, ms.float16), + name=kv2l_quant_scale, requires_grad=False) + parameter_dict[kv2l_quant_beta] = ms.Parameter(ms.Tensor(kv2l_quant_beta_param, ms.float16), + name=kv2l_quant_beta, requires_grad=False) + + def dynamic_quant_process_route_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """dynamic_quant_process_route_ffn_weight""" + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, + split_axis=1) + + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" + w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, + split_axis=1) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, + split_axis=1) + + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" + w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, + split_axis=1) + + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=1), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_param = 
ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=1), dtype=ms.float32) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + name=w3_scale_name, requires_grad=False) + + def dynamic_quant_process_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """dynamic_quant_process_ffn_weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" + w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" + w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w2_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w2_weight_name, src_hf_dir, hf_weight_map, + split_axis=1) + w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) + + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_type = convert_np_to_ms_dtype(w1_scale_param) + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=concat_scale_type) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + w1_scale_type = convert_np_to_ms_dtype(w1_scale_param) + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, w1_scale_type), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, w1_scale_type), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w2_weight_name] = 
ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + w2_scale_type = convert_np_to_ms_dtype(w2_scale_param) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, w2_scale_type), + name=w2_scale_name, requires_grad=False) + + def infer_dynamic_quant_get_value(self, param_name, src_hf_dir, hf_weight_map, no_need_split_layer): + '''infer_dynamic_quant_get_value''' + + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj."]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".wo.", "feed_forward.w2", "shared_experts.w2"]]): + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=1) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=2) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "absorb", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + return value + + def infer_dynamic_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + '''infer_dynamic_quant_net_ms_convert_layer_weight''' + parameter_dict = {} + start_layer_index, end_layer_index = self.get_layer_index(num_layers) + + no_need_split_layer = ["tok_embeddings", "norm", "routed_experts.router.dense", + "routed_experts.router.e_score_correction_bias", + "topk_bias"] + network_names = [] + for m in self.network.parameters_and_names(): + network_names.append(m[0]) + for layer_id in tqdm(range(start_layer_index, end_layer_index), desc="qkv/ffn params load"): + if layer_id >= 3: + self.dynamic_quant_process_route_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.routed_experts.ffn") + self.dynamic_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.shared_experts") + + else: + self.dynamic_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward") + self.dynamic_quant_process_qkv_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict) + + skip_layer = ["feed_forward.routed_experts.ffn.w1", "feed_forward.shared_experts.w1", "feed_forward.w1", + "feed_forward.routed_experts.ffn.w3", "feed_forward.shared_experts.w3", "feed_forward.w3", + "feed_forward.routed_experts.ffn.w_gate_hidden", "feed_forward.shared_experts.w_gate_hidden", + "feed_forward.w_gate_hidden", 
"attention.kv2l", "attention.q2l_proj", "attention.qkv2l"] + + for param_name, _ in tqdm(hf_weight_map.items(), desc="remaining params load"): + if param_name not in network_names: + continue + + if any([name in param_name for name in skip_layer]): + continue + + value = self.infer_dynamic_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) + dst_dtype = convert_np_to_ms_dtype(value) + + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + print(f"dsquant param_not_load:{param_not_load}") + print(f"dsquant ckpt_not_load:{ckpt_not_load}") + def smooth_quant_process_shared_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): @@ -2141,7 +2401,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None - support_quant_method = ["gptq-pergroup", "smoothquant", "osl"] + support_quant_method = ["gptq-pergroup", "smoothquant", "osl", 'ptq-duo'] if not quant_method or (quant_method not in support_quant_method) and \ not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) @@ -2158,6 +2418,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.infer_smooth_quant_net_ms_convert_layer_weight( src_hf_dir, self.num_layers, hf_weight_map) return + if quant_method and quant_method == "ptq-duo": + self.infer_dynamic_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) + return enable_tqdm = rank_id == 0 mtp_layers = self.config.model.model_config.num_nextn_predict_layers diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 89d786eb..6c8612eb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -22,7 +22,7 @@ from enum import Enum from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size from mindformers.parallel_core.inference.utils import get_tp_world_size -from mindformers.parallel_core.inference.parallel_state import get_data_parallel_world_size +from mindformers.parallel_core.inference.parallel_state import get_data_parallel_world_size, get_pp_world_size class EPMethod(Enum): @@ -71,6 +71,26 @@ class BaseWeightProcessor: self.parameter_dict = {} self.file_handles = {} + def get_layer_index(self, num_layers): + pp_nums = get_pp_world_size() + tp_nums = self.tp_group_size + offset = self.config.model.model_config.offset + offset_index = self.global_rank_id // tp_nums + stage_layers = num_layers // pp_nums + start_layer_index = offset_index * stage_layers + end_layer_index = start_layer_index + stage_layers + + if pp_nums > 1 and num_layers % pp_nums != 0: + if isinstance(offset, list): + raise ValueError(f"The parameter 'offset' is expected to be a list, but got {offset} instead." 
+ f" Please check whether your offset parameter is set correctly!") + for num in range(0, offset_index): + start_layer_index += offset[num] + end_layer_index += offset[num] + end_layer_index += offset[offset_index] + + return start_layer_index, end_layer_index + def get_file_handles(self, filename): if filename not in self.file_handles: fp = safe_open(filename, framework="np") -- Gitee From e2e6afa37f2dc346696a4321df0477875315fa65 Mon Sep 17 00:00:00 2001 From: HighCloud Date: Sat, 12 Jul 2025 14:43:29 +0800 Subject: [PATCH 3/8] fix kvcache nz bug --- vllm_mindspore/v1/worker/gpu_model_runner.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 3e702705..b76ea889 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -320,7 +320,16 @@ def _reshape_kv_cache_tensors( kv_cache_layer = [] for kv_cache_raw_tensor in kv_cache_raw_tensors[layer_name]: cache_block = kv_cache_raw_tensor.view(kv_cache_shape[1:]).permute(*inv_order[1:]) - kv_cache_layer.append(cache_block) + if atlas_inference(): + from mindspore.common.api import _pynative_executor + cache_block_nz = ops.auto_generate.format_cast(cache_block, 29) + _pynative_executor.sync() + import gc + del cache_block + gc.collect() + kv_cache_layer.append(cache_block_nz) + else: + kv_cache_layer.append(cache_block) kv_caches[layer_name] = mutable(tuple(kv_cache_layer)) else: raise NotImplementedError -- Gitee From 7e8f61435f4fec4fb85d89510feb65e57740bbdf Mon Sep 17 00:00:00 2001 From: HighCloud Date: Tue, 15 Jul 2025 14:46:11 +0800 Subject: [PATCH 4/8] support pp --- vllm_mindspore/__init__.py | 4 + vllm_mindspore/model_executor/models/llama.py | 27 +++---- .../models/mf_models/deepseek_v3.py | 23 ++++-- .../mf_models/deepseekv3_weight_processor.py | 51 +++++++------ .../models/mf_models/mf_model_base.py | 17 ++++- .../models/mf_models/weight_processor.py | 74 ++++++++++++------- .../model_executor/models/model_base.py | 33 +++++++-- vllm_mindspore/model_executor/models/qwen2.py | 30 ++++---- vllm_mindspore/utils.py | 28 +++++++ vllm_mindspore/v1/worker/gpu_worker.py | 3 + vllm_mindspore/worker/worker.py | 18 ++++- 11 files changed, 206 insertions(+), 102 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 489b899e..4d838dcf 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -440,3 +440,7 @@ try: except: pass check_ready() + +from vllm_mindspore.utils import view +from mindspore import Tensor +Tensor.view = view diff --git a/vllm_mindspore/model_executor/models/llama.py b/vllm_mindspore/model_executor/models/llama.py index 954579f1..e49f0097 100644 --- a/vllm_mindspore/model_executor/models/llama.py +++ b/vllm_mindspore/model_executor/models/llama.py @@ -371,19 +371,16 @@ class LlamaModel(nn.Cell): batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, + hidden_states: Optional[Tensor] = None, + residual: Optional[Tensor] = None, inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: + ) -> Tuple[Tensor, Tensor]: if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = 
intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): layer = self.layers[i] @@ -394,14 +391,9 @@ class LlamaModel(nn.Cell): attn_mask, batch_valid_length, q_seq_lens, block_tables, residual) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states + if get_pp_group().is_last_rank: + hidden_states, residual = self.norm(hidden_states, residual) + return hidden_states, residual def load_weights(self, weights: Iterable[Tuple[str, Tensor]], params_dict): loaded_params: Set[str] = set() @@ -493,8 +485,13 @@ class LlamaForCausalLM(NativeModel, SupportsPP): intermediate_tensors=None, inputs_embeds=None, **kwargs): - hidden_states = self.exec_model(input_ids, positions, + hidden_states, residual = self.exec_model(input_ids, positions, intermediate_tensors, inputs_embeds) + if not get_pp_group().is_first_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual, + }) return hidden_states def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 110585a8..8d5a8d42 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -26,10 +26,11 @@ import mindspore as ms from vllm.config import VllmConfig from vllm.config import get_current_vllm_config -from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size, get_pp_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.attention.layer import Attention +from vllm.model_executor.models.interfaces import SupportsPP import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable @@ -56,6 +57,7 @@ from vllm_mindspore.model_executor.models.model_base import MLAAttentionWrapper from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor from vllm_mindspore.model_executor.models.attention_mask import MLALowerTriangularMask +from vllm_mindspore.model_executor.models.utils import make_empty_intermediate_tensors_factory try: # Need to apply dllm pd patch on vllm to use pd disagg related functions @@ -121,7 +123,7 @@ def _get_padding_index(q_seq_len): -class DeepseekV3ForCausalLM(MfModelBase): +class DeepseekV3ForCausalLM(MfModelBase, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(DeepseekV3ForCausalLM, self).__init__( vllm_config=vllm_config, prefix=prefix @@ -133,18 +135,21 @@ class DeepseekV3ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - self.kv_caches = [MLAAttentionWrapper() for i in range(self.mf_model_config.num_layers)] + self.num_layers = self.model_config.get_num_layers(self.parallel_config) + self.kv_caches = [MLAAttentionWrapper() for _ in range(self.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") - for i in 
range(self.mf_model_config.num_layers): + for i in range(self.num_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] self.set_flags = False set_runtime_kernel_launch_group() self.casual_mask = MLALowerTriangularMask(dtype=self.mf_model_config.compute_dtype, max_model_len=self.model_config.max_model_len) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(keys=["hidden_states"], + hidden_size=self.model_config.hf_config.hidden_size) def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() @@ -171,12 +176,14 @@ class DeepseekV3ForCausalLM(MfModelBase): if ptq is not None: ptq.apply(network) ptq.convert(network) - return network, network.lm_head + if get_pp_group().is_last_rank: + return network, network.lm_head + return network, None def get_kvcache(self): key_cache = [] forward_context = get_forward_context() - for i in range(self.mf_model_config.num_layers): + for i in range(self.num_layers): k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] key_cache.append(k_cache) return mutable(key_cache), None @@ -185,7 +192,7 @@ class DeepseekV3ForCausalLM(MfModelBase): logger.debug(f"reached deepseek_v3 connector_send_kvcache") _pynative_executor.sync() forward_context = get_forward_context() - for i in range(self.mf_model_config.num_layers): + for i in range(self.num_layers): kv_cache_module = self.kv_caches[i] kv_cache = kv_cache_module.kv_cache[forward_context.virtual_engine][0] maybe_save_kv_layer_to_connector(str(i), kv_cache) @@ -201,7 +208,7 @@ class DeepseekV3ForCausalLM(MfModelBase): self.mf_config, model, self.network, infer_data, do_predict=True ) else: - weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) + weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant, self.vllm_config) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) return None diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index d9c5d213..4f943c4f 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -27,6 +27,7 @@ from mindspore import dtype from mindspore.communication.management import get_rank from tqdm import tqdm from vllm.logger import init_logger +from vllm.distributed import get_pp_group, get_pp_indices from vllm_mindspore.model_executor.models.mf_models.weight_processor import ( BaseWeightProcessor, EPMethod) @@ -60,9 +61,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): """ - def __init__(self, config, network, is_quant): - super().__init__(config, network, is_quant) - self.num_layers = self.config.model.model_config.num_layers + def __init__(self, config, network, is_quant, vllm_config): + super().__init__(config, network, is_quant, vllm_config) + self.num_layers = self.vllm_config.model_config.get_num_layers(self.vllm_config.parallel_config) + self.start_layer, self.end_layer = get_pp_indices( + self.config.model.model_config.num_layers, + get_pp_group().rank_in_group, + get_pp_group().world_size, + ) + print(f'[yyd] start_layer: {self.start_layer}, end_layer: {self.end_layer}') self.expert_num = self.config.moe_config.expert_num self.moe_split_tp = self.moe_tp_size > 1 self.moe_split_ep = self.moe_ep_size > 1 @@ -415,18 +422,18 @@ class 
DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_hf_name, w3_scale_hf_name, src_hf_dir, hf_weight_map): if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: - w1_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) - w2_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) - w3_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w1_scale_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w1_scale_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_scale_ms_param, _ = self.get_safetensor_from_file( w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w3_scale_hf_name, src_hf_dir, hf_weight_map, split_axis=0) elif self.ep_method == EPMethod.ALLTOALL: w1_ms_param, _ = self.get_safetensor_from_file( @@ -1115,7 +1122,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def convert_mtp_weight_name(self, weight_name: str): layer = 0 if 'layers.' not in weight_name else int( weight_name[weight_name.find('layers.'):].split('.')[1]) - if layer < self.num_layers: + if self.start_layer <= layer < self.end_layer: return weight_name mtp_prefix = 'mtp_model' is_mtp_layer = 'tok_embeddings' not in weight_name and 'shared_head.' not in weight_name @@ -1161,13 +1168,13 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_list = [] w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1.weight" - w1_ms_name = w1_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name( + w1_ms_name = w1_ms_name if self.start_layer <=layer_id < self.end_layer else self.convert_mtp_weight_name( w1_ms_name) w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2.weight" - w2_ms_name = w2_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name( + w2_ms_name = w2_ms_name if self.start_layer <=layer_id < self.end_layer else self.convert_mtp_weight_name( w2_ms_name) w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" - w3_ms_name = w3_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name( + w3_ms_name = w3_ms_name if self.start_layer <=layer_id < self.end_layer else self.convert_mtp_weight_name( w3_ms_name) for index in range(0, self.num_router_experts): @@ -1193,7 +1200,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" - w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + w_gate_hidden_name = w_gate_hidden_name if self.start_layer <=layer_id < self.end_layer else \ self.convert_mtp_weight_name(w_gate_hidden_name) w_gate_hidden_np = np.concatenate( [w1_ms_stack_param, w3_ms_stack_param], axis=1) @@ -1225,11 +1232,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): if self.ep_method in [EPMethod.DEFAULT, 
EPMethod.ALLGATHER]: - w1_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) - w2_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) - w3_ms_param, _ = self.get_safetensor_from_file_split_global_group( + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_dp_group( w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) elif self.ep_method == EPMethod.ALLTOALL: w1_ms_param, _ = self.get_safetensor_from_file( @@ -1261,7 +1268,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" - w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + w_gate_hidden_name = w_gate_hidden_name if self.start_layer <=layer_id < self.end_layer else \ self.convert_mtp_weight_name(w_gate_hidden_name) w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) @@ -1516,7 +1523,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) # convert mtp shared weights. - if layer_id >= self.num_layers: + if layer_id >= self.end_layer: self.infer_process_mtp_layer_weight(src_hf_dir, layer_id, hf_weight_map) @@ -1879,7 +1886,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_dynamic_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): '''infer_dynamic_quant_net_ms_convert_layer_weight''' parameter_dict = {} - start_layer_index, end_layer_index = self.get_layer_index(num_layers) + start_layer_index, end_layer_index = self.start_layer, self. 
end_layer no_need_split_layer = ["tok_embeddings", "norm", "routed_experts.router.dense", "routed_experts.router.e_score_correction_bias", @@ -2424,9 +2431,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): enable_tqdm = rank_id == 0 mtp_layers = self.config.model.model_config.num_nextn_predict_layers - start_layer = 0 if not is_mtp_model else self.num_layers - end_layer = self.num_layers if not is_mtp_model else self.num_layers + mtp_layers - for layer_id in tqdm(range(start_layer, end_layer), + self.start_layer = self.start_layer if not is_mtp_model else self.end_layer + self.end_layer = self.end_layer if not is_mtp_model else self.end_layer + mtp_layers + for layer_id in tqdm(range(self.start_layer, self.end_layer), desc="Weight loading", disable=not enable_tqdm): if self.is_quant: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index c4df0f43..be185ccd 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -29,7 +29,7 @@ from mindspore.common.api import _pynative_executor from mindspore.communication import get_rank from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.distributed.parallel_state import get_dp_group +from vllm.distributed.parallel_state import get_dp_group, get_pp_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput @@ -77,7 +77,8 @@ class MfModelBase(MsModelBase): self.mf_config.parallel_config) self.mf_config.model.model_config.parallel_config.model_parallel = ( get_tensor_model_parallel_world_size()) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self.mf_config.model.model_config.parallel_config.pipeline_stage = ( + get_pp_group().world_size) self._generate_model_config() self.casual_mask = LowerTriangularMask( dtype=self.mf_model_config.compute_dtype, @@ -117,7 +118,8 @@ class MfModelBase(MsModelBase): self.network.set_dynamic_inputs() dynamic_hidden_states = Tensor( shape=[None, None], dtype=self.mf_model_config.compute_dtype) - self.lm_head.set_inputs(dynamic_hidden_states) + if get_pp_group().is_last_rank: + self.lm_head.set_inputs(dynamic_hidden_states) def prepare_inputs(self, input_ids, positions): return self.prepare_base_inputs(input_ids, positions) @@ -149,11 +151,15 @@ class MfModelBase(MsModelBase): **kwargs) -> Union[Tensor, IntermediateTensors]: model_inputs, is_prefill = self.prepare_inputs(input_ids, positions) model_inputs = self.update_model_inputs(model_inputs, **kwargs) + model_inputs["hidden_states"] = None + if intermediate_tensors is not None: + model_inputs["hidden_states"] = intermediate_tensors["hidden_states"] # enable_mb_split is True in lager EP enable micro-batch and per-dp-bs > 1 enable_mb_split = self.is_enable_micro_batch_split( is_prefill, model_inputs["q_seq_lens"]) + # print(f'[yyd] model inputs: {model_inputs}') if is_prefill: if self.enable_micro_batch: self.network.phase = "prefill" if not enable_mb_split else "prefill_micro_batch" @@ -184,6 +190,11 @@ class MfModelBase(MsModelBase): logger.debug(f"connector_wait_for_kv_layer success") hidden_states = self.network(**model_inputs) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + }) + return hidden_states def compute_logits( diff --git 
a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 6c8612eb..8fb91884 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -20,9 +20,13 @@ transform huggingface safetensor. import os from enum import Enum from safetensors import safe_open +from mindformers.parallel_core.inference.parallel_state import ( + get_data_parallel_world_size, get_moe_expert_parallel_rank, + get_moe_tensor_parallel_rank, get_pipeline_model_parallel_world_size, + get_tensor_and_data_model_parallel_rank, + get_tensor_and_data_model_parallel_world_size, + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from mindspore.communication.management import get_rank, get_group_size -from mindformers.parallel_core.inference.utils import get_tp_world_size -from mindformers.parallel_core.inference.parallel_state import get_data_parallel_world_size, get_pp_world_size class EPMethod(Enum): @@ -43,54 +47,38 @@ class BaseWeightProcessor: """ - def __init__(self, config, network, is_quant): + def __init__(self, config, network, is_quant, vllm_config): + self.vllm_config = vllm_config self.config = config self.network = network self.is_quant = is_quant self.global_rank_id = get_rank() self.global_group_size = get_group_size() - self.tp_group_size = get_tp_world_size() + self.tp_group_size = get_tensor_model_parallel_world_size() self.dp_group_size = get_data_parallel_world_size() + self.tp_dp_group_size = get_tensor_and_data_model_parallel_world_size() + self.tp_dp_gourp_id = get_tensor_and_data_model_parallel_rank() + self.pp_group_size = get_pipeline_model_parallel_world_size() self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 self.moe_ep_size = self.config.parallel_config.expert_parallel \ if self.config.parallel_config.expert_parallel else 1 - self.moe_tp_size = self.global_group_size // self.moe_ep_size + self.moe_tp_size = self.global_group_size // self.moe_ep_size // self.pp_group_size self.ep_method = EPMethod.DEFAULT if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: self.ep_method = EPMethod.ALLTOALL elif self.dp_group_size > 1: self.ep_method = EPMethod.ALLGATHER - self.tp_rank_id = self.global_rank_id % self.tp_group_size + self.tp_rank_id = get_tensor_model_parallel_rank() self.ep_group_nums = self.num_router_experts // self.moe_ep_size - self.moe_ep_rank_id = self.global_rank_id // self.moe_tp_size - self.moe_tp_rank_id = self.global_rank_id % self.moe_tp_size + self.moe_ep_rank_id = get_moe_expert_parallel_rank() + self.moe_tp_rank_id = get_moe_tensor_parallel_rank() self.ep_start = self.moe_ep_rank_id * self.ep_group_nums self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums self.parameter_dict = {} self.file_handles = {} - def get_layer_index(self, num_layers): - pp_nums = get_pp_world_size() - tp_nums = self.tp_group_size - offset = self.config.model.model_config.offset - offset_index = self.global_rank_id // tp_nums - stage_layers = num_layers // pp_nums - start_layer_index = offset_index * stage_layers - end_layer_index = start_layer_index + stage_layers - - if pp_nums > 1 and num_layers % pp_nums != 0: - if isinstance(offset, list): - raise ValueError(f"The parameter 'offset' is expected to be a list, but got {offset} instead." 
- f" Please check whether your offset parameter is set correctly!") - for num in range(0, offset_index): - start_layer_index += offset[num] - end_layer_index += offset[num] - end_layer_index += offset[offset_index] - - return start_layer_index, end_layer_index - def get_file_handles(self, filename): if filename not in self.file_handles: fp = safe_open(filename, framework="np") @@ -140,6 +128,36 @@ class BaseWeightProcessor: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 + def get_safetensor_from_file_split_tp_dp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_dp_group_size + start = self.tp_dp_gourp_id * split_size + stop = (self.tp_dp_gourp_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_dp_group_size + start = self.tp_dp_gourp_id * split_size + stop = (self.tp_dp_gourp_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_dp_group_size + start = self.tp_dp_gourp_id * split_size + stop = (self.tp_dp_gourp_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_global_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index ada08c1d..2cbf9f2a 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -114,6 +114,7 @@ class MsModelBase: def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + self.vllm_config = vllm_config config = vllm_config.model_config.hf_config lora_config = vllm_config.lora_config @@ -250,7 +251,8 @@ class MsModelBase: key_cache = [] value_cache = [] forward_context = get_forward_context() - for i in range(self.config.num_hidden_layers): + num_layers = self.model_config.get_num_layers(self.parallel_config) + for i in range(num_layers): k_cache = self.kv_caches[i].kv_cache[ # type: ignore[attr-defined] forward_context.virtual_engine][0] v_cache = self.kv_caches[i].kv_cache[ # type: ignore[attr-defined] @@ -389,14 +391,15 @@ class NativeModel(MsModelBase): self.casual_mask = LowerTriangularMask( dtype=self.model_config.dtype, max_model_len=self.model_config.max_model_len) + num_layers = self.model_config.get_num_layers(self.parallel_config) self.kv_caches = [ - AttentionWrapper() for i in range(self.config.num_hidden_layers) + AttentionWrapper() for _ in range(num_layers) ] compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") - for i in range(self.config.num_hidden_layers): + for i in range(num_layers): compilation_config.static_forward_context[str( i)] = self.kv_caches[i] @@ -421,11 +424,15 @@ 
class NativeModel(MsModelBase): dtype=inputs_embeds.dtype) if intermediate_tensors is None: - dyn_intermediate_tensors = None + dyn_hidden_states = None + dyn_residual = None else: dyn_intermediate_tensors = ms.Tensor( - shape=[None] * intermediate_tensors.ndim, - dtype=intermediate_tensors.dtype) + shape=[None] * intermediate_tensors["hidden_states"].ndim, + dtype=intermediate_tensors["hidden_states"].dtype) + dyn_residual = ms.Tensor( + shape=[None] * intermediate_tensors["residual"].ndim, + dtype=intermediate_tensors["residual"].dtype) block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) @@ -460,7 +467,8 @@ class NativeModel(MsModelBase): is_prefill, dyn_slot_mapping, dynamic_attention_mask, - dyn_batch_valid_length, + dyn_hidden_states, + dyn_residual, dyn_q_seq_lens, dyn_block_tables, dyn_intermediate_tensors, @@ -476,6 +484,14 @@ class NativeModel(MsModelBase): model_inputs, is_prefill = self.prepare_base_inputs( input_ids, positions) + #for pp + if intermediate_tensors is not None: + model_inputs["hidden_states"] = intermediate_tensors["hidden_states"] + model_inputs["residual"] = intermediate_tensors["residual"] + else: + model_inputs["hidden_states"] = None + model_inputs["residual"] = None + # for multimodal model model_inputs["intermediate_tensors"] = intermediate_tensors model_inputs["inputs_embeds"] = inputs_embeds @@ -517,7 +533,8 @@ class NativeModel(MsModelBase): batch_valid_length=model_inputs["batch_valid_length"], q_seq_lens=model_inputs["q_seq_lens"], block_tables=model_inputs["block_tables"], - intermediate_tensors=model_inputs["intermediate_tensors"], + hidden_states=model_inputs["hidden_states"], + residual=model_inputs["residual"], inputs_embeds=model_inputs["inputs_embeds"], ) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 3b62385f..01d643d5 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -31,7 +31,7 @@ from vllm.attention.backends.abstract import AttentionType from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.models.interfaces import SupportsLoRA +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.sequence import IntermediateTensors from vllm_mindspore.utils import atlas_inference @@ -324,18 +324,16 @@ class Qwen2Model(nn.Cell): batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, + hidden_states: Optional[Tensor] = None, + residual: Optional[Tensor] = None, inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: + ) -> Tuple[Tensor, Tensor]: if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) residual = None - else: - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): # PP 并行对层进行切分 layer = self.layers[i] @@ -345,13 +343,10 @@ class Qwen2Model(nn.Cell): is_prefill, slot_mapping, attn_mask, batch_valid_length, q_seq_lens, block_tables, residual) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - 
}) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states + + if get_pp_group().is_last_rank: + hidden_states, residual = self.norm(hidden_states, residual) + return hidden_states, residual def load_weights(self, weights: Iterable[Tuple[str, Tensor]], params_dict: Dict[str, Parameter]): @@ -430,7 +425,7 @@ class Qwen2Model(nn.Cell): return loaded_params -class Qwen2ForCausalLM(NativeModel, SupportsLoRA): +class Qwen2ForCausalLM(NativeModel, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -492,8 +487,13 @@ class Qwen2ForCausalLM(NativeModel, SupportsLoRA): intermediate_tensors: IntermediateTensors = None, inputs_embeds: Tensor = None, **kwargs) -> Union[Tensor, IntermediateTensors]: - hidden_states = self.exec_model(input_ids, positions, + hidden_states, residual = self.exec_model(input_ids, positions, intermediate_tensors, inputs_embeds) + if not get_pp_group().is_first_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual, + }) return hidden_states def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 759bd3d4..64bf5cab 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -33,6 +33,7 @@ else: import mindspore as ms from mindspore import dtype as mstype from mindspore.common.initializer import Zero +from mindspore._c_expression import typing from vllm.logger import init_logger from vllm.utils import (TORCH_DTYPE_TO_NUMPY_DTYPE, MemoryProfilingResult, MemorySnapshot, T, make_ndarray_with_pad) @@ -282,6 +283,33 @@ def ms_memory_profiling( result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa +def view(self, *shape_or_dtype): + if len(shape_or_dtype) == 1 and isinstance(shape_or_dtype[0], typing.Type): + target_dtype = shape_or_dtype[0] + ori_shape = self.shape + target_shape = (-1,) + if len(ori_shape) > 1: + target_shape = ori_shape[:-1] + target_shape + out = np.frombuffer(self.numpy(), torch.ops.creation._TypeDict.get(target_dtype, np.float32)) + if not out.flags.aligned: + out = np.require(out, requirements=["ALIGNED"]) + if target_dtype == ms.bfloat16: + return ms.Tensor.from_numpy(out.astype(np.float32)).astype(target_dtype).reshape(target_shape) + return ms.Tensor.from_numpy(out).reshape(target_shape) + result = [] + if type(shape_or_dtype) is tuple: + for items in shape_or_dtype: + if not isinstance(items, int): + for item in items: + if not isinstance(item, int): + result.append(item.item()) + else: + result.append(item) + else: + result.append(items) + return ms.ops.reshape(self, result) + + def is_version_ge(current_version, base_version): """ return current_version >= base_version. diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index bb77182e..df417c8b 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -51,6 +51,9 @@ def compile_or_warm_up_model(self) -> None: # MindSpore does not support cuda graph. No need to warm up the model. # Since prefill is done previously, we do decode here. default_max_num_reqs = 1 # For MindSpore, we only do one more decode here. + # Only pp_last_rank requires _dummy_sampler_run, and only pp_last_rank can _dummy_sampler_run. 
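The hunks above share one pipeline-parallel hand-off pattern: each rank executes only its own slice of decoder layers, a rank that is not last wraps its intermediate hidden_states (plus residual for the native models) into IntermediateTensors for the next stage, and only the last rank applies the final norm and goes on to compute logits and sample. A minimal, framework-free sketch of that control flow, assuming a plain dict in place of IntermediateTensors and illustrative stand-ins for the rank flags, layer slice, and final norm:

from typing import Callable, Optional, Union


def pp_stage_forward(
    tokens_or_hidden,
    layers: list,                     # this rank's contiguous slice of decoder layers
    final_norm: Callable,
    is_first_rank: bool,
    is_last_rank: bool,
    intermediate: Optional[dict] = None,
) -> Union[dict, object]:
    if is_first_rank:
        hidden_states, residual = tokens_or_hidden, None   # first stage starts from embeddings
    else:
        hidden_states = intermediate["hidden_states"]      # resume from the previous stage
        residual = intermediate["residual"]
    for layer in layers:
        hidden_states, residual = layer(hidden_states, residual)
    if not is_last_rank:
        # Hand the partial result to the next pipeline stage instead of sampling.
        return {"hidden_states": hidden_states, "residual": residual}
    hidden_states, _ = final_norm(hidden_states, residual)
    return hidden_states                                   # only the last stage feeds the sampler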
if get_pp_group().is_last_rank: self.model_runner._dummy_sampler_run(self.model_runner._dummy_run( num_tokens=default_max_num_reqs)) + else: + self.model_runner._dummy_run(num_tokens=default_max_num_reqs) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 5eed6136..740aa059 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -27,6 +27,7 @@ from vllm_mindspore.utils import get_valid_dtype from vllm.model_executor import set_random_seed from vllm.sequence import SequenceGroupMetadata from vllm.sampling_params import SamplingParams +from vllm.distributed import get_pp_group logger = init_logger(__name__) @@ -72,6 +73,17 @@ def _warm_up_model(self) -> None: # cache_engine is a list with length equal to the size of pipeline-parallel, and only pp=1 is supported. kv_cache = self.cache_engine[0].gpu_cache is_mtp_model = self.speculative_config is not None and self.model_config.hf_config.model_type == "deepseek_mtp" + intermediate_tensors = None + if self.vllm_config.scheduler_config.is_multi_step: + make_empty_intermediate_tensors = self.model_runner._base_model_runner.model.make_empty_intermediate_tensors + else: + make_empty_intermediate_tensors = self.model_runner.model.make_empty_intermediate_tensors + if not get_pp_group().is_first_rank: + intermediate_tensors = make_empty_intermediate_tensors( + batch_size=1, + dtype=self.model_config.dtype, + device=self.devices, + ) if is_mtp_model: # prefill mtp model model_input, previous_hidden_states = _prepare_input_for_warmup( @@ -80,7 +92,7 @@ def _warm_up_model(self) -> None: self.model_runner.execute_model( model_input, kv_cache, - None, + intermediate_tensors, previous_hidden_states=previous_hidden_states) # warmup for decode @@ -89,7 +101,7 @@ def _warm_up_model(self) -> None: self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], False) self.model_runner._base_model_runner.execute_model( - model_input, kv_cache, None) + model_input, kv_cache, intermediate_tensors) else: model_input, previous_hidden_states = _prepare_input_for_warmup( self.model_config, self.model_runner, self.cache_engine[0], False, @@ -97,7 +109,7 @@ def _warm_up_model(self) -> None: self.model_runner.execute_model( model_input, kv_cache, - None, + intermediate_tensors, previous_hidden_states=previous_hidden_states) torch.cuda.synchronize() -- Gitee From fcfb96c953ae19c6c2290ee43c3af7d9c2e1f9d4 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Tue, 15 Jul 2025 20:06:50 +0800 Subject: [PATCH 5/8] 310p load checkpoint --- .../mf_models/deepseekv3_weight_processor.py | 160 ++++++++++++------ .../models/mf_models/weight_processor.py | 42 +++++ 2 files changed, 149 insertions(+), 53 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 4f943c4f..6e94999d 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -30,28 +30,11 @@ from vllm.logger import init_logger from vllm.distributed import get_pp_group, get_pp_indices from vllm_mindspore.model_executor.models.mf_models.weight_processor import ( - BaseWeightProcessor, EPMethod) + BaseWeightProcessor, EPMethod, convert_np_to_ms_dtype) logger = init_logger(__name__) -def convert_np_to_ms_dtype(value): - """convert_np_to_ms_dtype""" - if value.dtype == np.int8: - value_dtype = ms.int8 - elif 
value.dtype == np.int32: - value_dtype = ms.int32 - elif value.dtype == np.int64: - value_dtype = ms.int64 - elif value.dtype == np.float64: - value_dtype = ms.float64 - elif value.dtype == np.float32: - value_dtype = ms.float32 - else: - value_dtype = ms.bfloat16 - return value_dtype - - class DeepseekV3WeightProcessor(BaseWeightProcessor): r""" Provide DeepseekV3/R1 Model weight load and shards. @@ -325,9 +308,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): expert_idx = expert_idx[ in_start_expert_idx:] + expert_idx[:in_start_expert_idx] router_dense_ms_param = np.array(router_dense_ms_param)[expert_idx] - + dense_dtype = convert_np_to_ms_dtype(router_dense_ms_param) self.parameter_dict[router_dense_ms_name] = ms.Parameter( - ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + ms.from_numpy(router_dense_ms_param).astype(dense_dtype), name=router_dense_ms_name, requires_grad=False) @@ -361,11 +344,21 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_stack_param = np.stack(w2_scale_list, axis=0) w3_scale_ms_stack_param = np.stack(w3_scale_list, axis=0) + if self.is_310: + weight_scale_type = ms.float32 + weight_concat_axis = 2 + w1_ms_stack_param = w1_ms_stack_param.transpose(0, 2, 1) + w2_ms_stack_param = w2_ms_stack_param.transpose(0, 2, 1) + w3_ms_stack_param = w3_ms_stack_param.transpose(0, 2, 1) + else: + weight_scale_type = ms.bfloat16 + weight_concat_axis = 1 + if ffn_concat: # w_gate_hidden w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.weight" w_gate_hidden_np = np.concatenate( - [w1_ms_stack_param, w3_ms_stack_param], axis=1) + [w1_ms_stack_param, w3_ms_stack_param], axis=weight_concat_axis) w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute( 0, 2, 1).astype(ms.int8) self.parameter_dict[w_gate_hidden_name] = ms.Parameter( @@ -379,7 +372,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w_scale_gate_hidden_np = np.concatenate( [w1_scale_ms_stack_param, w3_scale_ms_stack_param], axis=1) w_scale_gate_hidden_param = ms.from_numpy( - w_scale_gate_hidden_np).astype(ms.bfloat16) + w_scale_gate_hidden_np).astype(weight_scale_type) self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter( w_scale_gate_hidden_param, name=w_scale_gate_hidden_name, @@ -399,11 +392,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): # w1_scale w3_scale self.parameter_dict[w1_scale_ms_name] = ms.Parameter( - ms.from_numpy(w1_scale_ms_stack_param).astype(ms.bfloat16), + ms.from_numpy(w1_scale_ms_stack_param).astype(weight_scale_type), name=w1_ms_name, requires_grad=False) self.parameter_dict[w3_scale_ms_name] = ms.Parameter( - ms.from_numpy(w3_scale_ms_stack_param).astype(ms.bfloat16), + ms.from_numpy(w3_scale_ms_stack_param).astype(weight_scale_type), name=w3_ms_name, requires_grad=False) @@ -413,7 +406,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.parameter_dict[w2_scale_ms_name] = ms.Parameter( - ms.from_numpy(w2_scale_ms_stack_param).astype(ms.bfloat16), + ms.from_numpy(w2_scale_ms_stack_param).astype(weight_scale_type), name=w2_scale_ms_name, requires_grad=False) @@ -483,6 +476,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + if self.is_310: + weight_scale_type = ms.float32 + else: + weight_scale_type = ms.bfloat16 + if ffn_concat: w_gate_hidden_name = 
f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.weight" w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], @@ -499,7 +497,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w_scale_gate_hidden_np = np.concatenate( [w1_scale_ms_param, w3_scale_ms_param], axis=0) w_scale_gate_hidden_param = ms.from_numpy( - w_scale_gate_hidden_np).astype(ms.bfloat16) + w_scale_gate_hidden_np).astype(weight_scale_type) self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter( w_scale_gate_hidden_param, name=w_scale_gate_hidden_name, @@ -516,11 +514,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.parameter_dict[w1_scale_ms_name] = ms.Parameter( - ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w1_scale_ms_param).astype(weight_scale_type), name=w1_ms_name, requires_grad=False) self.parameter_dict[w3_scale_ms_name] = ms.Parameter( - ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w3_scale_ms_param).astype(weight_scale_type), name=w3_ms_name, requires_grad=False) @@ -530,7 +528,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.parameter_dict[w2_scale_ms_name] = ms.Parameter( - ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w2_scale_ms_param).astype(weight_scale_type), name=w2_ms_name, requires_grad=False) @@ -571,6 +569,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + if self.is_310: + weight_scale_type = ms.float32 + else: + weight_scale_type = ms.bfloat16 + if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.weight" w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], @@ -585,7 +588,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.matmul.weight_scale" w_scale_gate_hidden_param = ms.from_numpy( np.concatenate([w1_scale_ms_param, w3_scale_ms_param], - axis=0)).astype(dtype=ms.bfloat16) + axis=0)).astype(dtype=weight_scale_type) self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter( w_scale_gate_hidden_param, name=w_scale_gate_hidden_name, @@ -602,11 +605,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.parameter_dict[w1_scale_ms_name] = ms.Parameter( - ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w1_scale_ms_param).astype(weight_scale_type), name=w1_scale_ms_name, requires_grad=False) self.parameter_dict[w3_scale_ms_name] = ms.Parameter( - ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w3_scale_ms_param).astype(weight_scale_type), name=w3_scale_ms_name, requires_grad=False) @@ -616,7 +619,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.parameter_dict[w2_scale_ms_name] = ms.Parameter( - ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(w2_scale_ms_param).astype(weight_scale_type), name=w2_ms_name, requires_grad=False) @@ -627,17 +630,19 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): embed_tokens_hf_name) np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + embed_tokens_dtype = convert_np_to_ms_dtype(np_data) self.parameter_dict[embed_tokens_ms_name] = ms.Parameter( - ms.from_numpy(np_data).astype(ms.bfloat16), + ms.from_numpy(np_data).astype(embed_tokens_dtype), 
name=embed_tokens_ms_name, requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.quant_convert_weight_name(norm_hf_name) + norm_dtype = convert_np_to_ms_dtype(np_data) np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[norm_ms_name] = ms.Parameter( - ms.from_numpy(np_data).astype(ms.bfloat16), + ms.from_numpy(np_data).astype(norm_dtype), name=norm_ms_name, requires_grad=False) @@ -650,8 +655,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + lm_head_dtype = convert_np_to_ms_dtype(np_data) self.parameter_dict[lm_head_ms_name] = ms.Parameter( - ms.from_numpy(np_data).astype(ms.bfloat16), + ms.from_numpy(np_data).astype(lm_head_dtype), name=lm_head_ms_name, requires_grad=False) @@ -673,8 +679,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): input_scale_hf_name) input_scale_ms_param, _ = self.get_safetensor_from_file( input_scale_hf_name, src_hf_dir, hf_weight_map) + quant_scale_dtype = convert_np_to_ms_dtype(input_scale_ms_param) self.parameter_dict[input_scale_ms_name] = ms.Parameter( - ms.from_numpy(input_scale_ms_param).astype(ms.bfloat16), + ms.from_numpy(input_scale_ms_param).astype(quant_scale_dtype), name=input_scale_ms_name, requires_grad=False) @@ -754,12 +761,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): dequant_scale_ms_param = self.split_weight_by_rank( dequant_scale_ms_param, split_axis=0) + if self.is_310: + dequant_scale_ms_param = dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) + dequant_scale_dtype = ms.int64 + else: + dequant_scale_dtype = ms.float32 + self.parameter_dict[quant_bias_ms_name] = ms.Parameter( ms.from_numpy(quant_bias_ms_param).astype(ms.int32), name=quant_bias_ms_name, requires_grad=False) self.parameter_dict[dequant_scale_ms_name] = ms.Parameter( - ms.from_numpy(dequant_scale_ms_param).astype(ms.float32), + ms.from_numpy(dequant_scale_ms_param).astype(dequant_scale_dtype), name=dequant_scale_ms_name, requires_grad=False) @@ -808,8 +821,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) lq_norm_ms_param, _ = self.get_safetensor_from_file( lq_norm_hf_name, src_hf_dir, hf_weight_map) + norm_dtype = convert_np_to_ms_dtype(lq_norm_ms_param) self.parameter_dict[lq_norm_ms_name] = ms.Parameter( - ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), + ms.from_numpy(lq_norm_ms_param).astype(norm_dtype), name=lq_norm_ms_name, requires_grad=False) @@ -841,7 +855,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): lkv_norm_ms_param, _ = self.get_safetensor_from_file( lkv_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( - ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), + ms.from_numpy(lkv_norm_ms_param).astype(norm_dtype), name=lkv_norm_ms_name, requires_grad=False) @@ -862,7 +876,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") self.parameter_dict[name_k_nope] = ms.Parameter( - ms.from_numpy(value_k_nope).astype(ms.bfloat16), + ms.from_numpy(value_k_nope).astype(norm_dtype), name=name_k_nope, requires_grad=False) # value_v @@ -871,10 +885,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") self.parameter_dict[name_v] = ms.Parameter( - 
ms.from_numpy(value_v).astype(ms.bfloat16), + ms.from_numpy(value_v).astype(norm_dtype), name=name_v, requires_grad=False) + if self.is_310: + qabsorb_param = value_k_nope.copy() + qabsorb_param = qabsorb_param.reshape(-1, 128, 512) + qabsorb_matmul_name = f"model.layers.{layer_id}.attention.qabsorb_matmul.weight" + self.parameter_dict[qabsorb_matmul_name] = ms.Parameter(ms.from_numpy(qabsorb_param).astype(norm_dtype), + name=qabsorb_matmul_name, requires_grad=False) + + outabsorb_param = value_v.copy() + outabsorb_param = outabsorb_param.reshape(-1, 128, 512) + outabsorb_matmul_name = f"model.layers.{layer_id}.attention.outabsorb_matmul.weight" + self.parameter_dict[outabsorb_matmul_name] = ms.Parameter(ms.from_numpy(outabsorb_param).astype(norm_dtype), + name=outabsorb_matmul_name, requires_grad=False) + # o_proj->wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.quant_convert_weight_name(wo_hf_name) @@ -973,6 +1000,24 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): attn_rmsnorm_beta_ms_param, _ = self.get_safetensor_from_file( attn_rmsnorm_beta_hf_name, src_hf_dir, hf_weight_map) + kv2l_beta_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" + kv2l_beta_ms_param = attn_rmsnorm_beta_ms_param.copy() + + l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" + l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) + l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) + + if self.is_310: + quant_scale_dtype = ms.float16 + deq_scale_dtype = ms.int64 + beta_dtype = ms.float16 + q2l_dequant_scale_ms_param = q2l_dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) + kv2l_dequant_scale_ms_param = kv2l_dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) + else: + quant_scale_dtype = ms.bfloat16 + deq_scale_dtype = ms.float32 + beta_dtype = ms.bfloat16 + if qkv_concat: qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" @@ -995,16 +1040,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): qkv2l_scale = np.concatenate( (q2l_dequant_scale_ms_param, kv2l_dequant_scale_ms_param), 0) parameter_dict[qkv2l_scale_name] = ms.Parameter( - ms.Tensor(qkv2l_scale, ms.float32), + ms.Tensor(qkv2l_scale, deq_scale_dtype), name=qkv2l_scale_name, requires_grad=False) parameter_dict[qkv2l_quant_zp_name] = ms.Parameter( ms.Tensor(q2l_input_zp_ms_param, ms.int8), requires_grad=False) parameter_dict[qkv2l_quant_scale_name] = ms.Parameter( - ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), + ms.Tensor(q2l_input_scale_ms_param, quant_scale_dtype), requires_grad=False) parameter_dict[qkv2l_rmsnorm_beta_name] = ms.Parameter( - ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), + ms.Tensor(attn_rmsnorm_beta_ms_param, beta_dtype), requires_grad=False) else: parameter_dict[q2l_ms_name] = ms.Parameter(ms.Tensor( @@ -1023,11 +1068,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=kv2l_quant_bias_ms_name, requires_grad=False) parameter_dict[q2l_dequant_scale_ms_name] = ms.Parameter( - ms.Tensor(q2l_dequant_scale_ms_param, ms.float32), + ms.Tensor(q2l_dequant_scale_ms_param, deq_scale_dtype), name=q2l_dequant_scale_ms_name, requires_grad=False) parameter_dict[kv2l_dequant_scale_ms_name] = ms.Parameter( - ms.Tensor(kv2l_dequant_scale_ms_param, ms.float32), + ms.Tensor(kv2l_dequant_scale_ms_param, 
deq_scale_dtype), name=kv2l_dequant_scale_ms_name, requires_grad=False) parameter_dict[q2l_input_zp_ms_name] = ms.Parameter( @@ -1039,17 +1084,25 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=kv2l_input_zp_ms_name, requires_grad=False) parameter_dict[q2l_input_scale_ms_name] = ms.Parameter( - ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), + ms.Tensor(q2l_input_scale_ms_param, quant_scale_dtype), name=q2l_input_scale_ms_name, requires_grad=False) parameter_dict[kv2l_input_scale_ms_name] = ms.Parameter( - ms.Tensor(kv2l_input_scale_ms_param, ms.bfloat16), + ms.Tensor(kv2l_input_scale_ms_param, quant_scale_dtype), name=kv2l_input_scale_ms_name, requires_grad=False) parameter_dict[attn_rmsnorm_beta_ms_name] = ms.Parameter( - ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), + ms.Tensor(attn_rmsnorm_beta_ms_param, beta_dtype), name=attn_rmsnorm_beta_ms_name, requires_grad=False) + parameter_dict[kv2l_beta_ms_name] = ms.Parameter( + ms.Tensor(kv2l_beta_ms_param, beta_dtype), + name=kv2l_beta_ms_name, + requires_grad=False) + parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter( + ms.Tensor(l2q_proj_bias_ms_param, beta_dtype), + name=l2q_proj_bias_ms_name, + requires_grad=False) _, _ = ms.load_param_into_net(self.network, parameter_dict) del parameter_dict gc.collect() @@ -1071,7 +1124,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): hf_weight_map) self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) - self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) + #self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) def convert_weight_name(self, weight_name: str): @@ -1467,8 +1520,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): attention_norm_hf_name) attention_norm_ms_param, _ = self.get_safetensor_from_file( attention_norm_hf_name, src_hf_dir, hf_weight_map) + norm_dtype = convert_np_to_ms_dtype(attention_norm_ms_param) self.parameter_dict[attention_norm_ms_name] = ms.Parameter( - ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), + ms.from_numpy(attention_norm_ms_param).astype(norm_dtype), name=attention_norm_ms_name, requires_grad=False) @@ -1478,7 +1532,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_norm_ms_param, _ = self.get_safetensor_from_file( ffn_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( - ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), + ms.from_numpy(ffn_norm_ms_param).astype(norm_dtype), name=ffn_norm_ms_name, requires_grad=False) diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 8fb91884..07b65ce8 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -18,6 +18,7 @@ transform huggingface safetensor. 
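The weight_processor.py changes below move convert_np_to_ms_dtype into this module and, on Atlas inference (310P-class) targets, step any weight that would otherwise become a bfloat16 Parameter down to float16. A NumPy-only sketch of that branch, assuming the MindSpore dtype check is reduced to a plain string flag for illustration:

import numpy as np


def maybe_downcast_for_atlas(np_data: np.ndarray, target_ms_dtype: str,
                             is_atlas_inference: bool) -> np.ndarray:
    # Mirrors the "is_atlas_inference and data_dtype == ms.bfloat16" branches:
    # the slice goes through float32 and is stored as float16 before it becomes a Parameter.
    if is_atlas_inference and target_ms_dtype == "bfloat16":
        return np_data.astype(np.float32).astype(np.float16)
    return np_data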
""" import os +import numpy as np from enum import Enum from safetensors import safe_open from mindformers.parallel_core.inference.parallel_state import ( @@ -26,6 +27,8 @@ from mindformers.parallel_core.inference.parallel_state import ( get_tensor_and_data_model_parallel_rank, get_tensor_and_data_model_parallel_world_size, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from mindformers.version_control import is_310p +import mindspore as ms from mindspore.communication.management import get_rank, get_group_size @@ -37,6 +40,23 @@ class EPMethod(Enum): ALLTOALL = 'alltoall' ALLGATHER = 'allgather' +def convert_np_to_ms_dtype(value): + """convert_np_to_ms_dtype""" + if value.dtype == np.int8: + value_dtype = ms.int8 + elif value.dtype == np.int32: + value_dtype = ms.int32 + elif value.dtype == np.int64: + value_dtype = ms.int64 + elif value.dtype == np.float64: + value_dtype = ms.float64 + elif value.dtype == np.float32: + value_dtype = ms.float32 + elif value.dtype == np.float16: + value_dtype = ms.float16 + else: + value_dtype = ms.bfloat16 + return value_dtype class BaseWeightProcessor: r""" @@ -49,6 +69,7 @@ class BaseWeightProcessor: def __init__(self, config, network, is_quant, vllm_config): self.vllm_config = vllm_config + self.is_310 = is_310p() self.config = config self.network = network self.is_quant = is_quant @@ -97,6 +118,9 @@ class BaseWeightProcessor: qint4 = True np_data = sf_file.get_tensor(hf_param_name) + data_dtype = convert_np_to_ms_dtype(np_data) + if self.is_310 and data_dtype == ms.bfloat16: + np_data = np_data.astype(np.float32).astype(np.float16) return np_data, qint4 def get_safetensor_from_file_split_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): @@ -126,6 +150,9 @@ class BaseWeightProcessor: split_data = np_data[:, :, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 def get_safetensor_from_file_split_tp_dp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): @@ -155,6 +182,9 @@ class BaseWeightProcessor: split_data = np_data[:, :, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -185,6 +215,9 @@ class BaseWeightProcessor: split_data = np_data[:, :, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 def get_safetensor_from_file_split_moe_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): @@ -209,6 +242,9 @@ class BaseWeightProcessor: split_data = np_data[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 def get_routed_safetensor_3_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, @@ -242,6 +278,9 @@ class BaseWeightProcessor: split_data = 
np_data[self.ep_start:self.ep_stop, :, start:stop] if split_ep else np_data[:, :, start:stop] else: raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 def get_routed_safetensor_2_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, @@ -270,6 +309,9 @@ class BaseWeightProcessor: split_data = np_data[self.ep_start:self.ep_stop, start:stop] if split_ep else np_data[:, start:stop] else: raise ValueError("split_tp is True but tp_axis:{} is not supported.".format(tp_axis)) + data_dtype = convert_np_to_ms_dtype(split_data) + if self.is_310 and data_dtype == ms.bfloat16: + split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): -- Gitee From 333de8be0cb40ce6d3e74d03ff024a5837f58f41 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Wed, 16 Jul 2025 16:09:34 +0800 Subject: [PATCH 6/8] use atlas inference replace 310 --- .../mf_models/deepseekv3_weight_processor.py | 12 ++++++------ .../models/mf_models/weight_processor.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 6e94999d..824fc8ee 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -344,7 +344,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_stack_param = np.stack(w2_scale_list, axis=0) w3_scale_ms_stack_param = np.stack(w3_scale_list, axis=0) - if self.is_310: + if self.is_atlas_inference: weight_scale_type = ms.float32 weight_concat_axis = 2 w1_ms_stack_param = w1_ms_stack_param.transpose(0, 2, 1) @@ -476,7 +476,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) - if self.is_310: + if self.is_atlas_inference: weight_scale_type = ms.float32 else: weight_scale_type = ms.bfloat16 @@ -569,7 +569,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) - if self.is_310: + if self.is_atlas_inference: weight_scale_type = ms.float32 else: weight_scale_type = ms.bfloat16 @@ -761,7 +761,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): dequant_scale_ms_param = self.split_weight_by_rank( dequant_scale_ms_param, split_axis=0) - if self.is_310: + if self.is_atlas_inference: dequant_scale_ms_param = dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) dequant_scale_dtype = ms.int64 else: @@ -889,7 +889,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=name_v, requires_grad=False) - if self.is_310: + if self.is_atlas_inference: qabsorb_param = value_k_nope.copy() qabsorb_param = qabsorb_param.reshape(-1, 128, 512) qabsorb_matmul_name = f"model.layers.{layer_id}.attention.qabsorb_matmul.weight" @@ -1007,7 +1007,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, 
src_hf_dir, hf_weight_map) - if self.is_310: + if self.is_atlas_inference: quant_scale_dtype = ms.float16 deq_scale_dtype = ms.int64 beta_dtype = ms.float16 diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 07b65ce8..7f698c91 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -27,9 +27,9 @@ from mindformers.parallel_core.inference.parallel_state import ( get_tensor_and_data_model_parallel_rank, get_tensor_and_data_model_parallel_world_size, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from mindformers.version_control import is_310p import mindspore as ms from mindspore.communication.management import get_rank, get_group_size +from vllm_mindspore.utils import atlas_inference class EPMethod(Enum): @@ -69,7 +69,7 @@ class BaseWeightProcessor: def __init__(self, config, network, is_quant, vllm_config): self.vllm_config = vllm_config - self.is_310 = is_310p() + self.is_atlas_inference = atlas_inference() self.config = config self.network = network self.is_quant = is_quant @@ -119,7 +119,7 @@ class BaseWeightProcessor: np_data = sf_file.get_tensor(hf_param_name) data_dtype = convert_np_to_ms_dtype(np_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: np_data = np_data.astype(np.float32).astype(np.float16) return np_data, qint4 @@ -151,7 +151,7 @@ class BaseWeightProcessor: else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) data_dtype = convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -183,7 +183,7 @@ class BaseWeightProcessor: else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) data_dtype = convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -216,7 +216,7 @@ class BaseWeightProcessor: else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) data_dtype = convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -243,7 +243,7 @@ class BaseWeightProcessor: else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) data_dtype = convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -279,7 +279,7 @@ class BaseWeightProcessor: else: raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) data_dtype = convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 @@ -310,7 +310,7 @@ class BaseWeightProcessor: else: raise ValueError("split_tp is True but tp_axis:{} is not supported.".format(tp_axis)) data_dtype = 
convert_np_to_ms_dtype(split_data) - if self.is_310 and data_dtype == ms.bfloat16: + if self.is_atlas_inference and data_dtype == ms.bfloat16: split_data = split_data.astype(np.float32).astype(np.float16) return split_data, qint4 -- Gitee From 61969de97d584ea6fdc6a7fb2b7c9136282760d1 Mon Sep 17 00:00:00 2001 From: huangzhuo Date: Wed, 16 Jul 2025 17:32:52 +0800 Subject: [PATCH 7/8] optimize load checkpoint --- .../mf_models/deepseekv3_weight_processor.py | 477 ++++++------------ 1 file changed, 155 insertions(+), 322 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 824fc8ee..097945fb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -661,12 +661,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=lm_head_ms_name, requires_grad=False) - def quant_special_attention_weight(self, - layer_id, - src_hf_dir, - hf_weight_map, - name, - is_trans_rope_weigh=False, + def quant_special_attention_weight(self, layer_id, src_hf_dir, hf_weight_map, name, is_trans_rope_weigh=False, is_split_param=False): # q_a_proj->q2l_proj # kv_a_proj_with_mqa->kv2l @@ -675,39 +670,30 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): # input_scale, input_zp no split input_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_scale" - input_scale_ms_name = self.quant_convert_weight_name( - input_scale_hf_name) - input_scale_ms_param, _ = self.get_safetensor_from_file( - input_scale_hf_name, src_hf_dir, hf_weight_map) - quant_scale_dtype = convert_np_to_ms_dtype(input_scale_ms_param) + input_scale_ms_name = self.quant_convert_weight_name(input_scale_hf_name) + input_scale_ms_param, _ = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) + input_scale_dtype = convert_np_to_ms_dtype(input_scale_ms_param) self.parameter_dict[input_scale_ms_name] = ms.Parameter( - ms.from_numpy(input_scale_ms_param).astype(quant_scale_dtype), - name=input_scale_ms_name, - requires_grad=False) + ms.from_numpy(input_scale_ms_param).astype(input_scale_dtype), + name=input_scale_ms_name, requires_grad=False) input_zp_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_offset" input_zp_ms_name = self.quant_convert_weight_name(input_zp_hf_name) - input_zp_ms_param, _ = self.get_safetensor_from_file( - input_zp_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[input_zp_ms_name] = ms.Parameter( - ms.from_numpy(input_zp_ms_param).astype(ms.int8), - name=input_zp_ms_name, - requires_grad=False) + input_zp_ms_param, _ = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[input_zp_ms_name] = ms.Parameter(ms.from_numpy(input_zp_ms_param).astype(ms.int8), + name=input_zp_ms_name, + requires_grad=False) if not is_trans_rope_weigh: quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".quant_bias" - quant_bias_ms_name = self.quant_convert_weight_name( - quant_bias_hf_name) - quant_bias_ms_param, _ = self.get_safetensor_from_file( - quant_bias_hf_name, src_hf_dir, hf_weight_map) - if name == "o_proj" and self.tp_rank_id != 0: + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + if name == "o_proj" and get_tensor_model_parallel_rank() != 0: quant_bias_ms_param.fill(0) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" - dequant_scale_ms_name = self.quant_convert_weight_name( - dequant_scale_hf_name) - dequant_scale_ms_param, _ = self.get_safetensor_from_file( - dequant_scale_hf_name, src_hf_dir, hf_weight_map) + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) else: kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim @@ -718,48 +704,33 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): kv_head_dim = kv_lora_rank + qk_rope_head_dim quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" - quant_bias_ms_name = self.quant_convert_weight_name( - quant_bias_hf_name) - quant_bias_ms_param, _ = self.get_safetensor_from_file( - quant_bias_hf_name, src_hf_dir, hf_weight_map) + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" - dequant_scale_ms_name = self.quant_convert_weight_name( - dequant_scale_hf_name) - dequant_scale_ms_param, _ = self.get_safetensor_from_file( - dequant_scale_hf_name, src_hf_dir, hf_weight_map) + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) if name == "q_b_proj": - quant_bias_ms_param = quant_bias_ms_param.reshape( - num_heads, rope_dim, -1) - quant_bias_ms_param = self.infer_trans_rope_weight( - quant_bias_ms_param, qk_rope_head_dim) - quant_bias_ms_param = quant_bias_ms_param.reshape( - num_heads * rope_dim, -1).reshape(-1) - - dequant_scale_ms_param = dequant_scale_ms_param.reshape( - num_heads, rope_dim, -1) - dequant_scale_ms_param = self.infer_trans_rope_weight( - dequant_scale_ms_param, qk_rope_head_dim) - dequant_scale_ms_param = dequant_scale_ms_param.reshape( - num_heads * rope_dim, -1).reshape(-1) + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads, rope_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim) + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) + + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads, rope_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim) + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) elif name == "kv_a_proj_with_mqa": - quant_bias_ms_param = quant_bias_ms_param.reshape( - kv_head_dim, -1) - quant_bias_ms_param = self.infer_trans_rope_weight( - quant_bias_ms_param, qk_rope_head_dim).reshape(-1) + quant_bias_ms_param = 
quant_bias_ms_param.reshape(kv_head_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim).reshape(-1) - dequant_scale_ms_param = dequant_scale_ms_param.reshape( - kv_head_dim, -1) - dequant_scale_ms_param = self.infer_trans_rope_weight( - dequant_scale_ms_param, qk_rope_head_dim).reshape(-1) + dequant_scale_ms_param = dequant_scale_ms_param.reshape(kv_head_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim).reshape( + -1) if is_split_param: - quant_bias_ms_param = self.split_weight_by_rank( - quant_bias_ms_param, split_axis=0) - dequant_scale_ms_param = self.split_weight_by_rank( - dequant_scale_ms_param, split_axis=0) + quant_bias_ms_param = self.split_weight_by_rank(quant_bias_ms_param, split_axis=0) + dequant_scale_ms_param = self.split_weight_by_rank(dequant_scale_ms_param, split_axis=0) if self.is_atlas_inference: dequant_scale_ms_param = dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) @@ -769,8 +740,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.parameter_dict[quant_bias_ms_name] = ms.Parameter( ms.from_numpy(quant_bias_ms_param).astype(ms.int32), - name=quant_bias_ms_name, - requires_grad=False) + name=quant_bias_ms_name, requires_grad=False) self.parameter_dict[dequant_scale_ms_name] = ms.Parameter( ms.from_numpy(dequant_scale_ms_param).astype(dequant_scale_dtype), name=dequant_scale_ms_name, @@ -779,81 +749,148 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_quant_bias_weight(self, src_hf_dir, layer_id, hf_weight_map): # quant_op.beta q2l_proj_bias_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" - q2l_proj_bias_ms_name = self.quant_convert_weight_name( - q2l_proj_bias_hf_name) - q2l_proj_bias_ms_param, _ = self.get_safetensor_from_file( - q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) + q2l_proj_bias_ms_name = self.quant_convert_weight_name(q2l_proj_bias_hf_name) + q2l_proj_bias_ms_param, _ = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) kv2l_bias_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" kv2l_bias_ms_param = q2l_proj_bias_ms_param.copy() l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" - l2q_proj_bias_ms_name = self.quant_convert_weight_name( - l2q_proj_bias_hf_name) - l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file( - l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) - - self.parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter( - ms.from_numpy(q2l_proj_bias_ms_param).astype(ms.bfloat16), - name=q2l_proj_bias_ms_name, - requires_grad=False) - self.parameter_dict[kv2l_bias_ms_name] = ms.Parameter( - ms.from_numpy(kv2l_bias_ms_param).astype(ms.bfloat16), - name=kv2l_bias_ms_name, - requires_grad=False) + l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) + l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) + bias_dtype = convert_np_to_ms_dtype(q2l_proj_bias_ms_param) + + if self.config.model.model_config.qkv_concat: + qkv2l_bias_ms_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.beta" + self.parameter_dict[qkv2l_bias_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_bias_ms_param).astype(bias_dtype), + name=qkv2l_bias_ms_name, + requires_grad=False) + else: + self.parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_bias_ms_param).astype(bias_dtype), + name=q2l_proj_bias_ms_name, + 
requires_grad=False) + self.parameter_dict[kv2l_bias_ms_name] = ms.Parameter( + ms.from_numpy(kv2l_bias_ms_param).astype(bias_dtype), + name=kv2l_bias_ms_name, + requires_grad=False) self.parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter( - ms.from_numpy(l2q_proj_bias_ms_param).astype(ms.bfloat16), + ms.from_numpy(l2q_proj_bias_ms_param).astype(bias_dtype), name=l2q_proj_bias_ms_name, requires_grad=False) - def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, - hf_weight_map): + def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process attention weight""" num_heads = self.config.model.model_config.num_heads + kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim v_head_dim = self.config.model.model_config.v_head_dim qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + # q_a_proj->q2l_proj + q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) + q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + + # kv_a_proj_with_mqa->kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + + + if self.config.model.model_config.qkv_concat: + qkv2l_proj_ms_param = np.concatenate((q2l_proj_ms_param, kv2l_ms_param), 0) + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + self.parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.from_numpy(qkv2l_proj_ms_param).astype(ms.int8), + name=qkv2l_weight_name, + requires_grad=False) + + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", + is_trans_rope_weigh=True) + q2l_input_scale = self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_scale"] + qkv2l_input_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + self.parameter_dict[qkv2l_input_scale_name] = ms.Parameter(ms.from_numpy(q2l_input_scale.asnumpy()), + name=qkv2l_input_scale_name, requires_grad=False) + + q2l_input_zp = self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_zp"] + qkv2l_input_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + self.parameter_dict[qkv2l_input_zp_name] = ms.Parameter(ms.from_numpy(q2l_input_zp.asnumpy()), + name=qkv2l_input_zp_name, + requires_grad=False) + + q2l_scale = self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.dequant_scale"] + kv2l_scale = self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l._layer.matmul.dequant_scale"] + qkv2l_scale_ms_param = np.concatenate((q2l_scale, kv2l_scale), 0) + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + self.parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.from_numpy(qkv2l_scale_ms_param).astype(ms.int64), + name=qkv2l_scale_name, + requires_grad=False) + + q2l_bias = 
self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.quant_bias"] + kv2l_bias = self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l._layer.matmul.quant_bias"] + qkv2l_bias_ms_param = np.concatenate((q2l_bias, kv2l_bias), 0) + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + self.parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.from_numpy(qkv2l_bias_ms_param).astype(ms.int32), + name=qkv2l_bias_name, + requires_grad=False) + del self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_scale"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l.quant_op.input_scale"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_zp"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l.quant_op.input_zp"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.dequant_scale"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l._layer.matmul.dequant_scale"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.quant_bias"] + del self.parameter_dict[f"model.layers.{layer_id}.attention.kv2l._layer.matmul.quant_bias"] + else: + # q_a_proj->q2l_proj + self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_ms_param).astype(ms.int8), + name=q2l_proj_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") + + # kv_a_proj_with_mqa->kv2l + self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.int8), + name=kv2l_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", + is_trans_rope_weigh=True) # q_a_layernorm->lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) - lq_norm_ms_param, _ = self.get_safetensor_from_file( - lq_norm_hf_name, src_hf_dir, hf_weight_map) + lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) norm_dtype = convert_np_to_ms_dtype(lq_norm_ms_param) - self.parameter_dict[lq_norm_ms_name] = ms.Parameter( - ms.from_numpy(lq_norm_ms_param).astype(norm_dtype), - name=lq_norm_ms_name, - requires_grad=False) + self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(norm_dtype), + name=lq_norm_ms_name, + requires_grad=False) # q_b_proj->l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" l2q_proj_ms_name = self.quant_convert_weight_name(l2q_proj_hf_name) - l2q_proj_ms_param, _ = self.get_safetensor_from_file( - l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) - l2q_proj_ms_param = self.infer_trans_rope_weight( - l2q_proj_ms_param, qk_rope_head_dim) + l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) - l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, - split_axis=0) + l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( ms.from_numpy(l2q_proj_ms_param).astype(ms.int8), name=l2q_proj_ms_name, requires_grad=False) - 
self.quant_special_attention_weight(layer_id, - src_hf_dir, - hf_weight_map, - "q_b_proj", - is_trans_rope_weigh=True, + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_b_proj", is_trans_rope_weigh=True, is_split_param=True) # kv_a_layernorm->lkv_norm lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.quant_convert_weight_name(lkv_norm_hf_name) - lkv_norm_ms_param, _ = self.get_safetensor_from_file( - lkv_norm_hf_name, src_hf_dir, hf_weight_map) + lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( ms.from_numpy(lkv_norm_ms_param).astype(norm_dtype), name=lkv_norm_ms_name, @@ -862,32 +899,25 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): # kv_b_proj->lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" lkv2kv_ms_name = self.quant_convert_weight_name(lkv2kv_hf_name) - lkv2kv_ms_param, _ = self.get_safetensor_from_file( - lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) lkv2kv_head = qk_nope_head_dim + v_head_dim lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) - value_k_nope, value_v = lkv2kv_ms_param[:, : - qk_nope_head_dim, :], lkv2kv_ms_param[:, - qk_nope_head_dim:, :] + value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] # value_k_nope value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) - name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", - ".attention.lkv2kv_k_nope.") - self.parameter_dict[name_k_nope] = ms.Parameter( - ms.from_numpy(value_k_nope).astype(norm_dtype), - name=name_k_nope, - requires_grad=False) + name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") + self.parameter_dict[name_k_nope] = ms.Parameter(ms.from_numpy(value_k_nope).astype(norm_dtype), + name=name_k_nope, + requires_grad=False) # value_v value_v = value_v.reshape(-1, value_v.shape[-1]) value_v = self.split_weight_by_rank(value_v, split_axis=0) - name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", - ".attention.lkv2kv_v.") - self.parameter_dict[name_v] = ms.Parameter( - ms.from_numpy(value_v).astype(norm_dtype), - name=name_v, - requires_grad=False) + name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") + self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(norm_dtype), + name=name_v, + requires_grad=False) if self.is_atlas_inference: qabsorb_param = value_k_nope.copy() @@ -905,207 +935,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): # o_proj->wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.quant_convert_weight_name(wo_hf_name) - wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, - hf_weight_map) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) - self.parameter_dict[wo_ms_name] = ms.Parameter( - ms.from_numpy(wo_ms_param).astype(ms.int8), - name=wo_ms_name, - requires_grad=False) - self.quant_special_attention_weight(layer_id, src_hf_dir, - hf_weight_map, "o_proj") - - def infer_quant_process_dense_qkv_weight(self, src_hf_dir, layer_id, - hf_weight_map): - """infer_quant_process_dense_qkv_weight""" - 
parameter_dict = {} - kv_lora_rank = self.config.model.model_config.kv_lora_rank - qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim - kv_head_dim = kv_lora_rank + qk_rope_head_dim - - qkv_concat = self.config.model.model_config.qkv_concat - # q2l - q2l_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" - q2l_ms_name = self.quant_convert_weight_name(q2l_hf_name) - q2l_ms_param, _ = self.get_safetensor_from_file( - q2l_hf_name, src_hf_dir, hf_weight_map) - - q2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_scale" - q2l_input_scale_ms_name = self.quant_convert_weight_name( - q2l_input_scale_hf_name) - q2l_input_scale_ms_param, _ = self.get_safetensor_from_file( - q2l_input_scale_hf_name, src_hf_dir, hf_weight_map) - - q2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_offset" - q2l_input_zp_ms_name = self.quant_convert_weight_name( - q2l_input_zp_hf_name) - q2l_input_zp_ms_param, _ = self.get_safetensor_from_file( - q2l_input_zp_hf_name, src_hf_dir, hf_weight_map) - - q2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.quant_bias" - q2l_quant_bias_ms_name = self.quant_convert_weight_name( - q2l_quant_bias_hf_name) - q2l_quant_bias_ms_param, _ = self.get_safetensor_from_file( - q2l_quant_bias_hf_name, src_hf_dir, hf_weight_map) - - q2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.deq_scale" - q2l_dequant_scale_ms_name = self.quant_convert_weight_name( - q2l_dequant_scale_hf_name) - q2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file( - q2l_dequant_scale_hf_name, src_hf_dir, hf_weight_map) - # kv2l - kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" - kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) - kv2l_ms_param, _ = self.get_safetensor_from_file( - kv2l_hf_name, src_hf_dir, hf_weight_map) - kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) - kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, - qk_rope_head_dim) - - kv2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_scale" - kv2l_input_scale_ms_name = self.quant_convert_weight_name( - kv2l_input_scale_hf_name) - kv2l_input_scale_ms_param, _ = self.get_safetensor_from_file( - kv2l_input_scale_hf_name, src_hf_dir, hf_weight_map) - - kv2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_offset" - kv2l_input_zp_ms_name = self.quant_convert_weight_name( - kv2l_input_zp_hf_name) - kv2l_input_zp_ms_param, _ = self.get_safetensor_from_file( - kv2l_input_zp_hf_name, src_hf_dir, hf_weight_map) - - kv2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.quant_bias" - kv2l_quant_bias_ms_name = self.quant_convert_weight_name( - kv2l_quant_bias_hf_name) - kv2l_quant_bias_ms_param, _ = self.get_safetensor_from_file( - kv2l_quant_bias_hf_name, src_hf_dir, hf_weight_map) - kv2l_quant_bias_ms_param = kv2l_quant_bias_ms_param.reshape( - kv_head_dim, -1) - kv2l_quant_bias_ms_param = self.infer_trans_rope_weight( - kv2l_quant_bias_ms_param, qk_rope_head_dim).reshape(-1) - - kv2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.deq_scale" - kv2l_dequant_scale_ms_name = self.quant_convert_weight_name( - kv2l_dequant_scale_hf_name) - kv2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file( - kv2l_dequant_scale_hf_name, src_hf_dir, hf_weight_map) - kv2l_dequant_scale_ms_param = kv2l_dequant_scale_ms_param.reshape( - kv_head_dim, -1) - 
kv2l_dequant_scale_ms_param = self.infer_trans_rope_weight( - kv2l_dequant_scale_ms_param, qk_rope_head_dim).reshape(-1) - - attn_rmsnorm_beta_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" - attn_rmsnorm_beta_ms_name = self.quant_convert_weight_name( - attn_rmsnorm_beta_hf_name) - attn_rmsnorm_beta_ms_param, _ = self.get_safetensor_from_file( - attn_rmsnorm_beta_hf_name, src_hf_dir, hf_weight_map) - - kv2l_beta_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" - kv2l_beta_ms_param = attn_rmsnorm_beta_ms_param.copy() - - l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" - l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) - l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) - - if self.is_atlas_inference: - quant_scale_dtype = ms.float16 - deq_scale_dtype = ms.int64 - beta_dtype = ms.float16 - q2l_dequant_scale_ms_param = q2l_dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) - kv2l_dequant_scale_ms_param = kv2l_dequant_scale_ms_param.astype(np.float32).view(np.int32).astype(np.int64) - else: - quant_scale_dtype = ms.bfloat16 - deq_scale_dtype = ms.float32 - beta_dtype = ms.bfloat16 - - if qkv_concat: - qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" - qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" - qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" - qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" - qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" - qkv2l_rmsnorm_beta_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.beta" - - qkv2l_weight = np.concatenate((q2l_ms_param, kv2l_ms_param), 0) - parameter_dict[qkv2l_weight_name] = ms.Parameter( - ms.Tensor(qkv2l_weight, ms.int8), - name=qkv2l_weight_name, - requires_grad=False) - qkv2l_bias = np.concatenate( - (q2l_quant_bias_ms_param, kv2l_quant_bias_ms_param), 0) - parameter_dict[qkv2l_bias_name] = ms.Parameter( - ms.Tensor(qkv2l_bias, ms.int32), - name=qkv2l_bias_name, - requires_grad=False) - qkv2l_scale = np.concatenate( - (q2l_dequant_scale_ms_param, kv2l_dequant_scale_ms_param), 0) - parameter_dict[qkv2l_scale_name] = ms.Parameter( - ms.Tensor(qkv2l_scale, deq_scale_dtype), - name=qkv2l_scale_name, - requires_grad=False) - parameter_dict[qkv2l_quant_zp_name] = ms.Parameter( - ms.Tensor(q2l_input_zp_ms_param, ms.int8), requires_grad=False) - parameter_dict[qkv2l_quant_scale_name] = ms.Parameter( - ms.Tensor(q2l_input_scale_ms_param, quant_scale_dtype), - requires_grad=False) - parameter_dict[qkv2l_rmsnorm_beta_name] = ms.Parameter( - ms.Tensor(attn_rmsnorm_beta_ms_param, beta_dtype), - requires_grad=False) - else: - parameter_dict[q2l_ms_name] = ms.Parameter(ms.Tensor( - q2l_ms_param, ms.int8), - name=q2l_ms_name, + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.int8), + name=wo_ms_name, requires_grad=False) - parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor( - kv2l_ms_param, ms.int8), - requires_grad=False) - parameter_dict[q2l_quant_bias_ms_name] = ms.Parameter( - ms.Tensor(q2l_quant_bias_ms_param, ms.int32), - name=q2l_quant_bias_ms_name, - requires_grad=False) - parameter_dict[kv2l_quant_bias_ms_name] = ms.Parameter( - ms.Tensor(kv2l_quant_bias_ms_param, ms.int32), - name=kv2l_quant_bias_ms_name, - requires_grad=False) - 
parameter_dict[q2l_dequant_scale_ms_name] = ms.Parameter( - ms.Tensor(q2l_dequant_scale_ms_param, deq_scale_dtype), - name=q2l_dequant_scale_ms_name, - requires_grad=False) - parameter_dict[kv2l_dequant_scale_ms_name] = ms.Parameter( - ms.Tensor(kv2l_dequant_scale_ms_param, deq_scale_dtype), - name=kv2l_dequant_scale_ms_name, - requires_grad=False) - parameter_dict[q2l_input_zp_ms_name] = ms.Parameter( - ms.Tensor(q2l_input_zp_ms_param, ms.int8), - name=q2l_input_zp_ms_name, - requires_grad=False) - parameter_dict[kv2l_input_zp_ms_name] = ms.Parameter( - ms.Tensor(kv2l_input_zp_ms_param, ms.int8), - name=kv2l_input_zp_ms_name, - requires_grad=False) - parameter_dict[q2l_input_scale_ms_name] = ms.Parameter( - ms.Tensor(q2l_input_scale_ms_param, quant_scale_dtype), - name=q2l_input_scale_ms_name, - requires_grad=False) - parameter_dict[kv2l_input_scale_ms_name] = ms.Parameter( - ms.Tensor(kv2l_input_scale_ms_param, quant_scale_dtype), - name=kv2l_input_scale_ms_name, - requires_grad=False) - parameter_dict[attn_rmsnorm_beta_ms_name] = ms.Parameter( - ms.Tensor(attn_rmsnorm_beta_ms_param, beta_dtype), - name=attn_rmsnorm_beta_ms_name, - requires_grad=False) - parameter_dict[kv2l_beta_ms_name] = ms.Parameter( - ms.Tensor(kv2l_beta_ms_param, beta_dtype), - name=kv2l_beta_ms_name, - requires_grad=False) - parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter( - ms.Tensor(l2q_proj_bias_ms_param, beta_dtype), - name=l2q_proj_bias_ms_name, - requires_grad=False) - _, _ = ms.load_param_into_net(self.network, parameter_dict) - del parameter_dict - gc.collect() + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): @@ -1120,11 +955,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.infer_quant_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) - self.infer_quant_process_dense_qkv_weight(src_hf_dir, layer_id, - hf_weight_map) self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) - #self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) def convert_weight_name(self, weight_name: str): -- Gitee From f9e6a4afd3ecd450b2e0ca0a1f3838d700a66a18 Mon Sep 17 00:00:00 2001 From: HighCloud Date: Thu, 17 Jul 2025 10:15:14 +0800 Subject: [PATCH 8/8] fix --- vllm_mindspore/model_executor/models/model_base.py | 8 ++++---- vllm_mindspore/model_executor/models/qwen2.py | 11 ++++++----- vllm_mindspore/utils.py | 3 +++ vllm_mindspore/v1/worker/gpu_model_runner.py | 6 +++--- vllm_mindspore/worker/cache_engine.py | 4 ++-- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 2cbf9f2a..a11aa8c2 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -37,7 +37,7 @@ from mindspore.common import dtype as mstype from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE class AttentionWrapper: @@ -56,13 +56,13 @@ class AttentionWrapper: ms.mint.zeros( self.kv_shape, 
dtype=vllm_config.model_config.dtype ), - 29, + FORMAT_TYPE['nz'], ), ops.auto_generate.format_cast( ms.mint.zeros( self.kv_shape, dtype=vllm_config.model_config.dtype ), - 29, + FORMAT_TYPE['nz'], ), ) for _ in range(vllm_config.parallel_config.pipeline_parallel_size) @@ -97,7 +97,7 @@ class MLAAttentionWrapper(AttentionWrapper): ms.mint.zeros( self.kv_shape, dtype=vllm_config.model_config.dtype ), - 29, + FORMAT_TYPE['nz'], ), ) for _ in range(vllm_config.parallel_config.pipeline_parallel_size) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 01d643d5..5703a985 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP from vllm.sequence import IntermediateTensors -from vllm_mindspore.utils import atlas_inference +from vllm_mindspore.utils import atlas_inference, FORMAT_TYPE from vllm_mindspore.attention import Attention from vllm_mindspore.model_executor.layers.activation import SwiGLU from vllm_mindspore.model_executor.layers.layernorm import RMSNorm @@ -414,13 +414,14 @@ class Qwen2Model(nn.Cell): for name, param in params_dict.items(): if any(name.endswith(keyword) for keyword in target_keywords): - cast_weight = ops.auto_generate.format_cast(param, 29) + cast_weight = ops.auto_generate.format_cast(param, FORMAT_TYPE['nz']) ms.runtime.synchronize() param.set_data(cast_weight) - ms.runtime.synchronize() - adjust_weight(params_dict) - ms.runtime.synchronize() + if atlas_inference(): + ms.runtime.synchronize() + adjust_weight(params_dict) + ms.runtime.synchronize() return loaded_params diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 64bf5cab..8ecb65ee 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -54,6 +54,9 @@ STR_DTYPE_TO_MS_DTYPE = { "fp8_e5m2": ms.uint8, } +FORMAT_TYPE = { + "nz": 29, +} def get_valid_dtype(dtype): if isinstance(dtype, str): diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index b76ea889..599f7a0d 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -24,7 +24,7 @@ import torch import mindspore as ms from mindspore import mutable, ops from vllm_mindspore.v1.attention.backends.ms_attn import MsAttentionMetadata -from vllm_mindspore.utils import get_valid_dtype, get_dtype_size, atlas_inference +from vllm_mindspore.utils import get_valid_dtype, get_dtype_size, atlas_inference, FORMAT_TYPE from vllm_mindspore.model_executor.layers.rotary_embedding import InferMRotaryEmbedding as MRotaryEmbedding # type: ignore[attr-defined] from vllm.v1.outputs import ModelRunnerOutput @@ -206,7 +206,7 @@ def create_block(shape, dtype, name=None, device=None): from mindspore.common.api import _pynative_executor blocks = empty_tensor(*shape, dtype=dtype, device=device) if device == "Ascend" and atlas_inference(): - blocks_nz = ops.auto_generate.format_cast(blocks, 29) + blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) _pynative_executor.sync() import gc del blocks @@ -322,7 +322,7 @@ def _reshape_kv_cache_tensors( cache_block = kv_cache_raw_tensor.view(kv_cache_shape[1:]).permute(*inv_order[1:]) if atlas_inference(): from mindspore.common.api import _pynative_executor - cache_block_nz = ops.auto_generate.format_cast(cache_block, 29) + 
cache_block_nz = ops.auto_generate.format_cast(cache_block, FORMAT_TYPE['nz']) _pynative_executor.sync() import gc del cache_block diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 8190e03b..33bc11f4 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -21,7 +21,7 @@ import mindspore as ms from mindspore import mutable, mint, ops from typing import List from vllm.logger import init_logger -from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference +from vllm_mindspore.utils import MsKVCache, get_valid_dtype, atlas_inference, FORMAT_TYPE logger = init_logger(__name__) @@ -31,7 +31,7 @@ def create_block(shape, dtype, name=None, device=None): from mindspore.common.api import _pynative_executor blocks = empty_tensor(*shape, dtype=dtype, device=device) if device == "Ascend" and atlas_inference(): - blocks_nz = ops.auto_generate.format_cast(blocks, 29) + blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE['nz']) _pynative_executor.sync() import gc del blocks -- Gitee
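Note on the recurring change in PATCH 8/8: every site that cast a KV-cache block to Ascend fractal-NZ layout with the magic number 29 now reads the id from the FORMAT_TYPE map in vllm_mindspore/utils.py. A minimal sketch of that cast-then-release sequence is below; it only rearranges calls that appear in the patch (ops.auto_generate.format_cast, _pynative_executor.sync), while the helper name create_block_nz and the use of ms.mint.zeros in place of the runner's empty_tensor are illustrative assumptions, and it needs an Ascend/Atlas inference device to actually execute.

    import gc

    import mindspore as ms
    from mindspore import ops
    from mindspore.common.api import _pynative_executor

    # Ascend fractal-NZ format id; PATCH 8/8 replaces the literal 29 with this map.
    FORMAT_TYPE = {"nz": 29}

    def create_block_nz(shape, dtype):
        """Allocate a KV-cache block and convert it to NZ format (sketch only)."""
        blocks = ms.mint.zeros(shape, dtype=dtype)      # stand-in for empty_tensor(...)
        blocks_nz = ops.auto_generate.format_cast(blocks, FORMAT_TYPE["nz"])
        _pynative_executor.sync()                       # finish the cast before freeing
        del blocks                                      # drop the ND copy, keep only NZ
        gc.collect()
        return blocks_nz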
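Note on PATCH 7/8: it folds the former infer_quant_process_dense_qkv_weight into infer_quant_process_attention_weight. On the qkv_concat path the fused qkv2l parameters are plain row-wise concatenations of the q2l and kv2l pieces, while the per-tensor activation quant scale/zero-point are taken from q2l alone and the standalone q2l/kv2l entries are deleted. The NumPy sketch below only illustrates that ordering; the concrete dimensions are made-up assumptions, not values from the model config.

    import numpy as np

    q_lora_rank, kv_head_dim, hidden = 1536, 576, 7168      # illustrative sizes only

    q2l_weight = np.zeros((q_lora_rank, hidden), np.int8)   # q_a_proj -> q2l, int8
    kv2l_weight = np.zeros((kv_head_dim, hidden), np.int8)  # kv_a_proj_with_mqa -> kv2l, int8

    # Fused projection weight: q2l rows first, then kv2l rows, along axis 0.
    qkv2l_weight = np.concatenate((q2l_weight, kv2l_weight), 0)

    # Per-output-channel dequant scale / quant bias follow the same ordering.
    qkv2l_scale = np.concatenate((np.ones(q_lora_rank, np.int64),
                                  np.ones(kv_head_dim, np.int64)), 0)
    qkv2l_bias = np.concatenate((np.zeros(q_lora_rank, np.int32),
                                 np.zeros(kv_head_dim, np.int32)), 0)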