diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index c417ac247fa3a77f886a4b6da2bfbb373ff06d02..982e79b0565c6624ee0e0cbd27f70f54af87a0da 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -18,327 +18,334 @@ import sys import warnings +import importlib.util -if "vllm" in sys.modules: - # Check models variable in sub process, cannot raise here. + +if importlib.util.find_spec("vllm_ascend") is not None: + import vllm_mindspore.patch.patch_vllm_ascend +else: warnings.warn( - "vllm import before vllm_mindspore, vllm_mindspore cannot worker right!" + f"vllm-ascend is not imported because vllm_ascend is not installed" + ) + + if "vllm" in sys.modules: + # Check models variable in sub process, cannot raise here. + warnings.warn( + "vllm import before vllm_mindspore, vllm_mindspore cannot worker right!" + ) + + # 1. set env before import mindspore. + from vllm_mindspore.scripts import env_setup + env_setup() + + # 2. update the log configuration ahead of other modifications. + import vllm_mindspore.logger + + from vllm_mindspore.platforms.ascend import AscendPlatform + + ascend_platform = AscendPlatform() + + import vllm.config + + vllm.config.current_platform = ascend_platform + + import vllm.platforms + + vllm.platforms.current_platform = ascend_platform + + import vllm.utils + + import vllm.executor.ray_utils + + vllm.executor.ray_utils.current_platform = ascend_platform + + import vllm.attention.selector + vllm.attention.selector.current_platform = ascend_platform + + import vllm.engine.arg_utils + from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle + vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle + + import vllm.v1.engine.core + from vllm_mindspore.v1.engine.core import ( + _init_data_parallel, + shutdown, + ) + vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel = _init_data_parallel + vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown + + from vllm_mindspore.utils import ( + direct_register_custom_op, + make_tensor_with_pad, + async_tensor_h2d, + get_dtype_size, + ascend_device_count_stateless, + ascend_is_initialized, + ) + + vllm.utils.direct_register_custom_op = direct_register_custom_op + vllm.utils.make_tensor_with_pad = make_tensor_with_pad + vllm.utils.async_tensor_h2d = async_tensor_h2d + vllm.utils.get_dtype_size = get_dtype_size + vllm.utils.cuda_device_count_stateless = ascend_device_count_stateless + vllm.utils.cuda_is_initialized = ascend_is_initialized + vllm.config.cuda_device_count_stateless = ascend_device_count_stateless + + import vllm.executor + + vllm.executor.cuda_device_count_stateless = ascend_device_count_stateless + + from vllm_mindspore.model_executor.models.registry import ( + MindSporeModelRegistry, + _SUBPROCESS_COMMAND, ) -# 1. set env before import mindspore. -from vllm_mindspore.scripts import env_setup -env_setup() -# 2. update the log configuration ahead of other modifications. -import vllm_mindspore.logger + vllm.config.ModelRegistry = MindSporeModelRegistry -from vllm_mindspore.platforms.ascend import AscendPlatform + import vllm.model_executor -ascend_platform = AscendPlatform() + vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry + vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND -import vllm.config + from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture -vllm.config.current_platform = ascend_platform + # To patching the get_model_architecture, should import it first. 
+ from vllm.model_executor.model_loader import get_model_architecture -import vllm.platforms + vllm.model_executor.model_loader.get_model_architecture = get_ms_model_architecture + vllm.model_executor.model_loader.utils.get_model_architecture = ( + get_ms_model_architecture + ) + vllm.model_executor.model_loader.default_loader.get_model_architecture = ( + get_ms_model_architecture + ) -vllm.platforms.current_platform = ascend_platform + from vllm_mindspore.model_executor.sampling_metadata import ( + SequenceGroupToSample, + SamplingMetadataCache, + SamplingMetadata, + SamplingTensors + ) -import vllm.utils + vllm.model_executor.SamplingMetadataCache = SamplingMetadataCache + vllm.model_executor.SamplingMetadata = SamplingMetadata + vllm.model_executor.sampling_metadata.SequenceGroupToSample = SequenceGroupToSample + vllm.model_executor.sampling_metadata.SamplingMetadataCache = SamplingMetadataCache + vllm.model_executor.sampling_metadata.SamplingMetadata = SamplingMetadata + vllm.model_executor.sampling_metadata.SamplingTensors = SamplingTensors + + from vllm_mindspore.worker.cache_engine import ( + ms_allocate_kv_cache, + ms_swap_in, + ms_swap_out, + ) -vllm.utils.current_platform = ascend_platform + import vllm.worker.cache_engine -import vllm.executor.ray_utils + vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache + vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in + vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out -vllm.executor.ray_utils.current_platform = ascend_platform + from vllm_mindspore.model_executor.model_loader.weight_utils import ( + safetensors_weights_iterator, + ) -import vllm.attention.selector -vllm.attention.selector.current_platform = ascend_platform + vllm.model_executor.model_loader.default_loader.safetensors_weights_iterator = ( + safetensors_weights_iterator + ) -import vllm.engine.arg_utils -from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle -vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle + from vllm_mindspore.worker.worker import _warm_up_model + from vllm_mindspore.worker.profile import ( + wrapper_worker_init, + wrapper_worker_init_device, + ) + from vllm.worker.worker import Worker -import vllm.v1.engine.core -from vllm_mindspore.v1.engine.core import ( - _init_data_parallel, - shutdown, -) -vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel = _init_data_parallel -vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown + Worker._warm_up_model = _warm_up_model + Worker.__init__ = wrapper_worker_init(Worker.__init__) + Worker.init_device = wrapper_worker_init_device(Worker.init_device) -from vllm_mindspore.utils import ( - direct_register_custom_op, - make_tensor_with_pad, - async_tensor_h2d, - get_dtype_size, - ascend_device_count_stateless, - ascend_is_initialized, -) + from vllm_mindspore.worker.model_runner import ( + _get_cuda_graph_pad_size, + _dummy_run, + _get_supported_attention_backends, + ) -vllm.utils.direct_register_custom_op = direct_register_custom_op -vllm.utils.make_tensor_with_pad = make_tensor_with_pad -vllm.utils.async_tensor_h2d = async_tensor_h2d -vllm.utils.get_dtype_size = get_dtype_size -vllm.utils.cuda_device_count_stateless = ascend_device_count_stateless -vllm.utils.cuda_is_initialized = ascend_is_initialized -vllm.config.cuda_device_count_stateless = ascend_device_count_stateless + vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( + _get_cuda_graph_pad_size + ) + 
vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run -import vllm.executor + import vllm.worker.multi_step_model_runner -vllm.executor.cuda_device_count_stateless = ascend_device_count_stateless + vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( + _get_supported_attention_backends + ) -from vllm_mindspore.model_executor.models.registry import ( - MindSporeModelRegistry, - _SUBPROCESS_COMMAND, -) + from vllm_mindspore.executor.multiproc_worker_utils import ( + get_mp_context as ms_get_mp_context, + terminate_worker as ms_terminate_worker, + ) + # To patching the get_mp_context, should import it first. + from vllm.executor.multiproc_worker_utils import get_mp_context -vllm.config.ModelRegistry = MindSporeModelRegistry + vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context -import vllm.model_executor + import vllm.executor.multiproc_worker_utils -vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry -vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND + vllm.executor.multiproc_worker_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker -from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture + import vllm.v1.executor.multiproc_executor + vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context + import vllm.v1.utils + vllm.v1.utils.get_mp_context = ms_get_mp_context -# To patching the get_model_architecture, should import it first. -from vllm.model_executor.model_loader import get_model_architecture + from vllm_mindspore.executor.ray_gpu_executor import ( + ms_init_workers_ray, + initialize_ray_cluster, + ) -vllm.model_executor.model_loader.get_model_architecture = get_ms_model_architecture -vllm.model_executor.model_loader.utils.get_model_architecture = ( - get_ms_model_architecture -) -vllm.model_executor.model_loader.default_loader.get_model_architecture = ( - get_ms_model_architecture -) + from vllm.executor.ray_distributed_executor import RayDistributedExecutor -from vllm_mindspore.model_executor.sampling_metadata import ( - SequenceGroupToSample, - SamplingMetadataCache, - SamplingMetadata, - SamplingTensors -) + RayDistributedExecutor._init_workers_ray = ms_init_workers_ray -vllm.model_executor.SamplingMetadataCache = SamplingMetadataCache -vllm.model_executor.SamplingMetadata = SamplingMetadata -vllm.model_executor.sampling_metadata.SequenceGroupToSample = SequenceGroupToSample -vllm.model_executor.sampling_metadata.SamplingMetadataCache = SamplingMetadataCache -vllm.model_executor.sampling_metadata.SamplingMetadata = SamplingMetadata -vllm.model_executor.sampling_metadata.SamplingTensors = SamplingTensors + vllm.executor.ray_distributed_executor.initialize_ray_cluster = initialize_ray_cluster + vllm.executor.ray_utils.initialize_ray_cluster = initialize_ray_cluster -from vllm_mindspore.worker.cache_engine import ( - ms_allocate_kv_cache, - ms_swap_in, - ms_swap_out, -) + import vllm.engine.llm_engine + import vllm.engine.async_llm_engine -import vllm.worker.cache_engine + vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster + vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache -vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in -vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out -from vllm_mindspore.model_executor.model_loader.weight_utils import ( - safetensors_weights_iterator, -) + from 
.config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ + _get_and_verify_dtype, stateless_init_dp_group, has_unfinished_dp -vllm.model_executor.model_loader.default_loader.safetensors_weights_iterator = ( - safetensors_weights_iterator -) + vllm.config.ModelConfig._verify_quantization = _verify_quantization + vllm.config.VllmConfig.__post_init__ = vllm_config_post_init + vllm.config.SchedulerConfig._verify_args = _verify_args + vllm.config.CompilationConfig.model_post_init = model_post_init + vllm.config._get_and_verify_dtype = _get_and_verify_dtype + vllm.config.ParallelConfig.stateless_init_dp_group = stateless_init_dp_group + vllm.config.ParallelConfig.has_unfinished_dp = has_unfinished_dp -from vllm_mindspore.worker.worker import _warm_up_model -from vllm_mindspore.worker.profile import ( - wrapper_worker_init, - wrapper_worker_init_device, -) -from vllm.worker.worker import Worker - -Worker._warm_up_model = _warm_up_model -Worker.__init__ = wrapper_worker_init(Worker.__init__) -Worker.init_device = wrapper_worker_init_device(Worker.init_device) - -from vllm_mindspore.worker.model_runner import ( - _get_cuda_graph_pad_size, - _dummy_run, - _get_supported_attention_backends, -) - -vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( - _get_cuda_graph_pad_size -) -vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run + from .utils import update_modules + from vllm_mindspore.attention.backends import ms_attn + update_modules("vllm.attention.backends.flash_attn", ms_attn) -import vllm.worker.multi_step_model_runner + from vllm_mindspore.worker.spec_decode_worker import ( + spec_decode_worker_init, + _run_no_spec, + _verify_tokens, + _create_output, + _merge_outputs, + ) + from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker + SpecDecodeWorker.__init__ = spec_decode_worker_init + SpecDecodeWorker._verify_tokens = _verify_tokens + SpecDecodeWorker._run_no_spec = _run_no_spec -vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( - _get_supported_attention_backends -) + from vllm.model_executor.layers.spec_decode_base_sampler import SpecDecodeBaseSampler + SpecDecodeBaseSampler._create_output = _create_output -from vllm_mindspore.executor.multiproc_worker_utils import ( - get_mp_context as ms_get_mp_context, - terminate_worker as ms_terminate_worker, -) + from vllm.spec_decode.top1_proposer import Top1Proposer + Top1Proposer._merge_outputs = _merge_outputs -# To patching the get_mp_context, should import it first. 
-from vllm.executor.multiproc_worker_utils import get_mp_context + from vllm_mindspore.model_executor.layers.rejection_sampler import _smallest_positive_value, _multinomial + from vllm.model_executor.layers.rejection_sampler import RejectionSampler + RejectionSampler._smallest_positive_value = _smallest_positive_value + RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') + vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial -vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context + from vllm_mindspore.v1.sample import rejection_sampler + update_modules("vllm.v1.sample.rejection_sampler", rejection_sampler) -import vllm.executor.multiproc_worker_utils + from vllm_mindspore.v1.spec_decode import eagle + update_modules("vllm.v1.spec_decode.eagle", eagle) -vllm.executor.multiproc_worker_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker + from vllm_mindspore.v1.attention.backends import flash_attn + import vllm.v1.attention.backends + sys.modules['vllm.v1.attention.backends.flash_attn'] = flash_attn + import vllm.v1.attention.backends.flash_attn -import vllm.v1.executor.multiproc_executor -vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context -import vllm.v1.utils -vllm.v1.utils.get_mp_context = ms_get_mp_context + import vllm.v1.worker.gpu_model_runner -from vllm_mindspore.executor.ray_gpu_executor import ( - ms_init_workers_ray, - initialize_ray_cluster, -) + from vllm_mindspore.v1.worker.gpu_model_runner import _prepare_inputs + vllm.v1.worker.gpu_model_runner.GPUModelRunner._prepare_inputs = _prepare_inputs -from vllm.executor.ray_distributed_executor import RayDistributedExecutor + from vllm_mindspore.v1.worker.gpu_model_runner import _update_states + vllm.v1.worker.gpu_model_runner.GPUModelRunner._update_states = _update_states -RayDistributedExecutor._init_workers_ray = ms_init_workers_ray - -vllm.executor.ray_distributed_executor.initialize_ray_cluster = initialize_ray_cluster -vllm.executor.ray_utils.initialize_ray_cluster = initialize_ray_cluster - -import vllm.engine.llm_engine -import vllm.engine.async_llm_engine - -vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster -vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster + from vllm_mindspore.v1.worker.gpu_model_runner import _allocate_kv_cache_tensors + vllm.v1.worker.gpu_model_runner.GPUModelRunner._allocate_kv_cache_tensors = _allocate_kv_cache_tensors + from vllm_mindspore.v1.worker.gpu_model_runner import _reshape_kv_cache_tensors + vllm.v1.worker.gpu_model_runner.GPUModelRunner._reshape_kv_cache_tensors = _reshape_kv_cache_tensors -from .config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ - _get_and_verify_dtype, stateless_init_dp_group, has_unfinished_dp + import vllm.v1.worker.block_table + from vllm_mindspore.v1.worker.block_table import BlockTable + vllm.v1.worker.block_table.BlockTable = BlockTable + vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable -vllm.config.ModelConfig._verify_quantization = _verify_quantization -vllm.config.VllmConfig.__post_init__ = vllm_config_post_init -vllm.config.SchedulerConfig._verify_args = _verify_args -vllm.config.CompilationConfig.model_post_init = model_post_init -vllm.config._get_and_verify_dtype = _get_and_verify_dtype -vllm.config.ParallelConfig.stateless_init_dp_group = stateless_init_dp_group -vllm.config.ParallelConfig.has_unfinished_dp = has_unfinished_dp + import 
vllm.v1.worker.gpu_input_batch + from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor + vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata + vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata + vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor + vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor -from .utils import update_modules -from vllm_mindspore.attention.backends import ms_attn -update_modules("vllm.attention.backends.flash_attn", ms_attn) + from vllm.v1.worker.gpu_worker import Worker + from vllm_mindspore.v1.worker.gpu_worker import init_device -from vllm_mindspore.worker.spec_decode_worker import ( - spec_decode_worker_init, - _run_no_spec, - _verify_tokens, - _create_output, - _merge_outputs, -) -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker -SpecDecodeWorker.__init__ = spec_decode_worker_init -SpecDecodeWorker._verify_tokens = _verify_tokens -SpecDecodeWorker._run_no_spec = _run_no_spec - -from vllm.model_executor.layers.spec_decode_base_sampler import SpecDecodeBaseSampler -SpecDecodeBaseSampler._create_output = _create_output - -from vllm.spec_decode.top1_proposer import Top1Proposer -Top1Proposer._merge_outputs = _merge_outputs - -from vllm_mindspore.model_executor.layers.rejection_sampler import _smallest_positive_value, _multinomial -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -RejectionSampler._smallest_positive_value = _smallest_positive_value -RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') -vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial + Worker.__init__ = wrapper_worker_init(Worker.__init__) + Worker.init_device = wrapper_worker_init_device(init_device) -from vllm_mindspore.v1.sample import rejection_sampler -update_modules("vllm.v1.sample.rejection_sampler", rejection_sampler) - -from vllm_mindspore.v1.spec_decode import eagle -update_modules("vllm.v1.spec_decode.eagle", eagle) -from vllm_mindspore.v1.attention.backends import flash_attn -import vllm.v1.attention.backends -sys.modules['vllm.v1.attention.backends.flash_attn'] = flash_attn -import vllm.v1.attention.backends.flash_attn + import vllm.v1.utils + from vllm_mindspore.v1.utils import copy_slice + vllm.v1.utils.copy_slice = copy_slice + vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice -import vllm.v1.worker.gpu_model_runner + from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors + import vllm.v1.sample.ops.penalties + vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors + import vllm.model_executor.layers.utils + from vllm_mindspore.model_executor.layers.utils import apply_penalties + vllm.model_executor.layers.utils.apply_penalties = apply_penalties + vllm.v1.sample.ops.penalties.apply_penalties = apply_penalties -from vllm_mindspore.v1.worker.gpu_model_runner import _prepare_inputs -vllm.v1.worker.gpu_model_runner.GPUModelRunner._prepare_inputs = _prepare_inputs -from vllm_mindspore.v1.worker.gpu_model_runner import _update_states -vllm.v1.worker.gpu_model_runner.GPUModelRunner._update_states = _update_states + from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample, \ + apply_top_k_only, topk_topp_sampler_forward_native -from 
vllm_mindspore.v1.worker.gpu_model_runner import _allocate_kv_cache_tensors -vllm.v1.worker.gpu_model_runner.GPUModelRunner._allocate_kv_cache_tensors = _allocate_kv_cache_tensors + import vllm.v1.sample.ops.topk_topp_sampler + from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler + TopKTopPSampler.forward_native = topk_topp_sampler_forward_native + vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p + vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample + vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_only = apply_top_k_only + from vllm_mindspore.v1.sample.sampler import apply_temperature + import vllm.v1.sample.sampler + vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature -from vllm_mindspore.v1.worker.gpu_model_runner import _reshape_kv_cache_tensors -vllm.v1.worker.gpu_model_runner.GPUModelRunner._reshape_kv_cache_tensors = _reshape_kv_cache_tensors + from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer + from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer + ShmRingBuffer.__init__ = initialize_ShmRingBuffer -import vllm.v1.worker.block_table -from vllm_mindspore.v1.worker.block_table import BlockTable -vllm.v1.worker.block_table.BlockTable = BlockTable -vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable + from vllm_mindspore.distributed.device_communicators.base_device_communicator import( + prepare_communication_buffer_for_model + ) + import vllm.distributed.device_communicators.base_device_communicator + vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase.prepare_communication_buffer_for_model = ( + prepare_communication_buffer_for_model + ) -import vllm.v1.worker.gpu_input_batch -from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor -vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata -vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata -vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor -vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor - -from vllm.v1.worker.gpu_worker import Worker -from vllm_mindspore.v1.worker.gpu_worker import init_device - -Worker.__init__ = wrapper_worker_init(Worker.__init__) -Worker.init_device = wrapper_worker_init_device(init_device) - - -import vllm.v1.utils -from vllm_mindspore.v1.utils import copy_slice -vllm.v1.utils.copy_slice = copy_slice -vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice - -from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors -import vllm.v1.sample.ops.penalties -vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors -import vllm.model_executor.layers.utils -from vllm_mindspore.model_executor.layers.utils import apply_penalties -vllm.model_executor.layers.utils.apply_penalties = apply_penalties -vllm.v1.sample.ops.penalties.apply_penalties = apply_penalties - - -from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample, \ - apply_top_k_only, topk_topp_sampler_forward_native - -import vllm.v1.sample.ops.topk_topp_sampler -from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler -TopKTopPSampler.forward_native = topk_topp_sampler_forward_native -vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p 
-vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample -vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_only = apply_top_k_only -from vllm_mindspore.v1.sample.sampler import apply_temperature -import vllm.v1.sample.sampler -vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature - -from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer -from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer -ShmRingBuffer.__init__ = initialize_ShmRingBuffer - -from vllm_mindspore.distributed.device_communicators.base_device_communicator import( - prepare_communication_buffer_for_model -) -import vllm.distributed.device_communicators.base_device_communicator -vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase.prepare_communication_buffer_for_model = ( - prepare_communication_buffer_for_model -) - -from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model -from vllm.v1.worker.gpu_worker import Worker -Worker.compile_or_warm_up_model = compile_or_warm_up_model + from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model + from vllm.v1.worker.gpu_worker import Worker + Worker.compile_or_warm_up_model = compile_or_warm_up_model from .utils import check_ready diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index d6123b0a89790ba630888066cb857d995f190c10..e76557df0dcf9d33b2b217f43d959f4913e80c97 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -501,17 +501,13 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, - batch_size: int, + graph_size: int = -1, ): """Build attention metadata with on-device tensors. Args: seq_lens: The maybe padded sequence lengths of the input sequences. query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
""" prefix_cache_hit = any( [ @@ -525,7 +521,6 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): ) device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 max_query_len = max(query_lens) decode_query_lens = query_lens[self.num_prefills:] @@ -539,15 +534,12 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): query_start_loc = list(accumulate(query_lens, initial=0)) seq_start_loc = list(accumulate(seq_lens, initial=0)) - if use_captured_graph: - raise RuntimeError("Doesnot support captured graph now!") - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=-1, - dtype=torch.int, - device=device, - ) + block_tables = make_tensor_with_pad( + self.block_tables, + pad=-1, + dtype=torch.int, + device=device, + ) assert max_query_len > 0, "query_lens: {}".format(query_lens) context_lens_tensor = ms.Tensor(self.context_lens, dtype=ms.int32) @@ -595,6 +587,10 @@ class MsAttentionBackend(AttentionBackend): @staticmethod def get_builder_cls() -> Type["MsAttentionMetadataBuilder"]: return MsAttentionMetadataBuilder + + @classmethod + def make_metadata_builder(cls, *args, **kwargs) -> "MsAttentionMetadataBuilder": + return cls.get_builder_cls()(*args, **kwargs) @staticmethod def get_state_cls() -> Type["AttentionState"]: diff --git a/vllm_mindspore/patch/__init__.py b/vllm_mindspore/patch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vllm_mindspore/patch/patch_vllm_ascend.py b/vllm_mindspore/patch/patch_vllm_ascend.py new file mode 100644 index 0000000000000000000000000000000000000000..2db58104cc3f50fbb79cd911f94e2266dce7a28d --- /dev/null +++ b/vllm_mindspore/patch/patch_vllm_ascend.py @@ -0,0 +1,310 @@ +import sys +import warnings +import types + +fake_mod = types.ModuleType("torch_npu.npu.streams") +fake_mod.Event = type("Event", (), {}) +sys.modules.update({"torch_npu.npu.streams": fake_mod}) + +if "vllm" in sys.modules: + # Check models variable in sub process, cannot raise here. + warnings.warn( + "vllm import before vllm_mindspore, vllm_mindspore cannot worker right!" + ) + +# 1. set env before import mindspore. +from vllm_mindspore.scripts import env_setup +env_setup() + +# 2. update the log configuration ahead of other modifications. 
+import vllm_mindspore.logger +# ================ For vllm ================ + +import vllm.engine.arg_utils +from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle +vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle + +import vllm.v1.engine.core +from vllm_mindspore.v1.engine.core import ( + _init_data_parallel, + shutdown, +) +vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel = _init_data_parallel +vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown + +from vllm_mindspore.utils import ( + direct_register_custom_op, + make_tensor_with_pad, + async_tensor_h2d, + get_dtype_size, + ascend_device_count_stateless, + ascend_is_initialized, +) + +vllm.utils.direct_register_custom_op = direct_register_custom_op +vllm.utils.make_tensor_with_pad = make_tensor_with_pad +vllm.utils.async_tensor_h2d = async_tensor_h2d +vllm.utils.get_dtype_size = get_dtype_size +vllm.utils.cuda_device_count_stateless = ascend_device_count_stateless +vllm.utils.cuda_is_initialized = ascend_is_initialized +vllm.config.cuda_device_count_stateless = ascend_device_count_stateless + +import vllm.executor + +vllm.executor.cuda_device_count_stateless = ascend_device_count_stateless + +from vllm_mindspore.model_executor.models.registry import ( + MindSporeModelRegistry, + _SUBPROCESS_COMMAND, +) + + +vllm.config.ModelRegistry = MindSporeModelRegistry + +import vllm.model_executor + +vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry +vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND + +from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture + +# To patching the get_model_architecture, should import it first. +from vllm.model_executor.model_loader import get_model_architecture + +vllm.model_executor.model_loader.get_model_architecture = get_ms_model_architecture +vllm.model_executor.model_loader.utils.get_model_architecture = ( + get_ms_model_architecture +) +vllm.model_executor.model_loader.default_loader.get_model_architecture = ( + get_ms_model_architecture +) + +from vllm_mindspore.model_executor.sampling_metadata import ( + SequenceGroupToSample, + SamplingMetadataCache, + SamplingMetadata, + SamplingTensors +) + +vllm.model_executor.SamplingMetadataCache = SamplingMetadataCache +vllm.model_executor.SamplingMetadata = SamplingMetadata +vllm.model_executor.sampling_metadata.SequenceGroupToSample = SequenceGroupToSample +vllm.model_executor.sampling_metadata.SamplingMetadataCache = SamplingMetadataCache +vllm.model_executor.sampling_metadata.SamplingMetadata = SamplingMetadata +vllm.model_executor.sampling_metadata.SamplingTensors = SamplingTensors + +from vllm_mindspore.model_executor.model_loader.weight_utils import ( + safetensors_weights_iterator, +) + +vllm.model_executor.model_loader.default_loader.safetensors_weights_iterator = ( + safetensors_weights_iterator +) + +from vllm_mindspore.executor.multiproc_worker_utils import ( + get_mp_context as ms_get_mp_context, + terminate_worker as ms_terminate_worker, +) + +# To patching the get_mp_context, should import it first. 
+from vllm.executor.multiproc_worker_utils import get_mp_context + +vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context + +import vllm.executor.multiproc_worker_utils + +vllm.executor.multiproc_worker_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker + +import vllm.v1.executor.multiproc_executor +vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context +import vllm.v1.utils +vllm.v1.utils.get_mp_context = ms_get_mp_context + +from vllm_mindspore.executor.ray_gpu_executor import ( + ms_init_workers_ray, + initialize_ray_cluster, +) + +from vllm.executor.ray_distributed_executor import RayDistributedExecutor + +RayDistributedExecutor._init_workers_ray = ms_init_workers_ray + +vllm.executor.ray_distributed_executor.initialize_ray_cluster = initialize_ray_cluster +vllm.executor.ray_utils.initialize_ray_cluster = initialize_ray_cluster + +import vllm.engine.llm_engine +import vllm.engine.async_llm_engine + +vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster +vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster + + +from vllm_mindspore.config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ + _get_and_verify_dtype, stateless_init_dp_group, has_unfinished_dp + +vllm.config.ModelConfig._verify_quantization = _verify_quantization +vllm.config.VllmConfig.__post_init__ = vllm_config_post_init +vllm.config.SchedulerConfig._verify_args = _verify_args +vllm.config.CompilationConfig.model_post_init = model_post_init +vllm.config._get_and_verify_dtype = _get_and_verify_dtype +vllm.config.ParallelConfig.stateless_init_dp_group = stateless_init_dp_group +vllm.config.ParallelConfig.has_unfinished_dp = has_unfinished_dp + +from vllm_mindspore.utils import update_modules +from vllm_mindspore.attention.backends import ms_attn +update_modules("vllm.attention.backends.flash_attn", ms_attn) + +from vllm_mindspore.worker.spec_decode_worker import ( + spec_decode_worker_init, + _run_no_spec, + _verify_tokens, + _create_output, + _merge_outputs, +) +from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker +SpecDecodeWorker.__init__ = spec_decode_worker_init +SpecDecodeWorker._verify_tokens = _verify_tokens +SpecDecodeWorker._run_no_spec = _run_no_spec + +from vllm.model_executor.layers.spec_decode_base_sampler import SpecDecodeBaseSampler +SpecDecodeBaseSampler._create_output = _create_output + +from vllm.spec_decode.top1_proposer import Top1Proposer +Top1Proposer._merge_outputs = _merge_outputs + +from vllm_mindspore.model_executor.layers.rejection_sampler import _smallest_positive_value, _multinomial +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +RejectionSampler._smallest_positive_value = _smallest_positive_value +RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') +vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial + +from vllm_mindspore.v1.sample import rejection_sampler +update_modules("vllm.v1.sample.rejection_sampler", rejection_sampler) + +from vllm_mindspore.v1.spec_decode import eagle +update_modules("vllm.v1.spec_decode.eagle", eagle) + +from vllm_mindspore.v1.attention.backends import flash_attn +import vllm.v1.attention.backends +sys.modules['vllm.v1.attention.backends.flash_attn'] = flash_attn +import vllm.v1.attention.backends.flash_attn + +import vllm.v1.worker.gpu_model_runner + +import vllm.v1.worker.block_table +from vllm_mindspore.v1.worker.block_table import 
BlockTable +vllm.v1.worker.block_table.BlockTable = BlockTable +vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable + +import vllm.v1.worker.gpu_input_batch +from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor + +import vllm.v1.utils +from vllm_mindspore.v1.utils import copy_slice +vllm.v1.utils.copy_slice = copy_slice +vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice + +from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors +import vllm.v1.sample.ops.penalties +vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors +import vllm.model_executor.layers.utils +from vllm_mindspore.model_executor.layers.utils import apply_penalties +vllm.model_executor.layers.utils.apply_penalties = apply_penalties +vllm.v1.sample.ops.penalties.apply_penalties = apply_penalties + + +from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample, \ + apply_top_k_only, topk_topp_sampler_forward_native + +import vllm.v1.sample.ops.topk_topp_sampler +from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler +TopKTopPSampler.forward_native = topk_topp_sampler_forward_native +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p +vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_only = apply_top_k_only +from vllm_mindspore.v1.sample.sampler import apply_temperature +import vllm.v1.sample.sampler +vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature + +from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer +from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer +ShmRingBuffer.__init__ = initialize_ShmRingBuffer + +from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model +from vllm.v1.worker.gpu_worker import Worker +Worker.compile_or_warm_up_model = compile_or_warm_up_model + +# ============ For 0.9.0 start =========== +import importlib +memory_mod = importlib.import_module("torch.cuda.memory") +if not hasattr(memory_mod, "NPUPluggableAllocator"): + memory_mod.NPUPluggableAllocator = memory_mod.CUDAPluggableAllocator +sys.modules["torch_npu.op_plugin"] = types.ModuleType("torch_npu.op_plugin") +sys.modules["torch_npu.op_plugin.atb"] = types.ModuleType("torch_npu.op_plugin.atb") +fake_mod = types.ModuleType("torch_npu.op_plugin.atb._atb_ops") +fake_mod._register_atb_extensions = lambda *a, **kw: None +sys.modules["torch_npu.op_plugin.atb._atb_ops"] = fake_mod +fake_mod = types.ModuleType("torchair._contrib") +sys.modules["torchair._contrib"] = fake_mod +fake_mod = types.ModuleType("torchair._contrib.custom_torch_ops") +sys.modules["torchair._contrib.custom_torch_ops"] = fake_mod +import torch +if not hasattr(torch, "Tag"): + class _FakeTag: + needs_fixed_stride_order = "needs_fixed_stride_order" + torch.Tag = _FakeTag +if not hasattr(torch._dynamo, "cache_size"): + class CacheSize: + def __init__(self): + self.config = types.SimpleNamespace(cache_size_limit=0) + sys.modules["torch._dynamo.cache_size"] = CacheSize + 
torch._dynamo.cache_size = CacheSize() +if not hasattr(torch, "_logging"): + class FakeLogging: + @staticmethod + def set_logs(*args, **kwargs): ... + torch._logging = FakeLogging() +fake_fused_moe = types.ModuleType("vllm.model_executor.layers.fused_moe.fused_moe") +fake_fused_moe.direct_register_custom_op = lambda *a, **kw: None +fake_fused_moe.grouped_topk = lambda *a, **kw: None +sys.modules["vllm.model_executor.layers.fused_moe.fused_moe"] = fake_fused_moe +import vllm_ascend.ops +vllm_ascend.ops.register_dummy_fusion_op = lambda *a, **kw: None +# ============ For 0.9.0 end =========== + +# ================ For vllm-ascend ================ +fake_mod = types.ModuleType("vllm_ascend.vllm_ascend_C") +fake_mod.init_module = fake_mod.python_create_and_map = fake_mod.python_unmap_and_release = lambda *a, **kw: None +sys.modules.update({"vllm_ascend.vllm_ascend_C": fake_mod}) + +from vllm_mindspore.platforms.ascend import AscendPlatform +from vllm_ascend.platform import NPUPlatform +NPUPlatform.get_attn_backend_cls = AscendPlatform.get_attn_backend_cls + +# ================ End ================ + +# ============ For v1 start =========== +from vllm_mindspore.config import _get_and_verify_dtype +vllm.config._get_and_verify_dtype = _get_and_verify_dtype + +from vllm_mindspore.worker.model_runner_v1 import _dummy_run, _process_reqs, wrapper_runner_init +from vllm_ascend.worker.model_runner_v1 import NPUModelRunner +NPUModelRunner._dummy_run = _dummy_run +NPUModelRunner._process_reqs = _process_reqs +NPUModelRunner.__init__ = wrapper_runner_init(NPUModelRunner.__init__) + +from vllm_mindspore.worker.worker_v1 import determine_available_memory +from vllm_ascend.worker.worker_v1 import NPUWorker +NPUWorker.determine_available_memory = determine_available_memory + +from vllm_mindspore.worker.model_runner_v1 import initialize_kv_cache +NPUModelRunner.initialize_kv_cache = initialize_kv_cache + +from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model +from vllm_ascend.worker.worker_v1 import NPUWorker +NPUWorker.compile_or_warm_up_model = compile_or_warm_up_model +# ============ For v1 end =========== \ No newline at end of file diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index 356a33a040c050b0825a1c2fe5fea2179fbafa60..d559bdbda5bc16b8cb46047b4dadb3e71c0c0674 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -109,7 +109,6 @@ class AscendPlatform(Platform): if use_mla: return "vllm_mindspore.v1.attention.backends.flash_attn.MLABackend" return "vllm_mindspore.v1.attention.backends.flash_attn.FlashAttentionBackend" - raise RuntimeError("vLLM-MindSpore do not support v1 egine now!") if use_mla: logger.info("Using MindSpore MLA backend.") return "vllm_mindspore.attention.backends.ms_attn.MLABackend" @@ -144,4 +143,4 @@ class AscendPlatform(Platform): @classmethod def supports_v1(cls, model_config: ModelConfig) -> bool: - return True \ No newline at end of file + return True diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index da942479e5600de8c9929e8fe9364094b0fab25e..270cd6685ba417088631ab64de335d0e96cb36b8 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -38,6 +38,8 @@ if TYPE_CHECKING: else: Library = None +from packaging.version import Version + from vllm.logger import init_logger from vllm.utils import T, TORCH_DTYPE_TO_NUMPY_DTYPE, make_ndarray_with_pad diff --git a/vllm_mindspore/v1/attention/backends/flash_attn.py 
b/vllm_mindspore/v1/attention/backends/flash_attn.py index 13d35162cbf970790766762ef17466456724e196..80784a6671577a83a646e2bbc18d7579a3113fb9 100644 --- a/vllm_mindspore/v1/attention/backends/flash_attn.py +++ b/vllm_mindspore/v1/attention/backends/flash_attn.py @@ -193,8 +193,8 @@ class MsAttentionImpl(AttentionImpl): class FlashAttentionMetadataBuilder: - def __init__(self, runner: "GPUModelRunner", kv_cache_spec, - block_table): + def __init__(self, runner: "GPUModelRunner", kv_cache_spec=None, + block_table=None): self.runner = runner self.block_table = block_table @@ -208,7 +208,13 @@ class FlashAttentionMetadataBuilder: # because it will cause a certain amount of host time. query_start_loc = ms.from_numpy(self.runner.query_start_loc_np[:num_reqs + 1]) max_context_lens = self.runner.input_batch.num_computed_tokens_cpu[:num_reqs].max() - slot_mapping = ms.from_numpy(self.block_table.slot_mapping_np[:num_actual_tokens]) + if self.block_table is None: + # If block_table is not provided, use the slot_mapping from the runner. + slot_mapping = ms.from_numpy(self.runner.slot_mapping_np[:num_actual_tokens]) + block_tables = self.runner.input_batch.block_table[0].get_device_tensor()[:num_reqs] + else: + slot_mapping = ms.from_numpy(self.block_table.slot_mapping_np[:num_actual_tokens]) + block_tables = self.block_table.get_device_tensor()[:num_reqs] seq_lens_np = self.runner.seq_lens_np[:num_reqs] max_seq_len = seq_lens_np.max() seq_lens = ms.from_numpy(seq_lens_np) @@ -220,13 +226,13 @@ class FlashAttentionMetadataBuilder: attn_metadata = FlashAttentionMetadata( seq_lens=seq_lens, seq_lens_np=seq_lens_np, - block_tables=(self.block_table.get_device_tensor()[:num_reqs]), + block_tables=(block_tables), slot_mapping=slot_mapping, q_seq_lens=q_seq_lens, q_seq_lens_np=q_seq_lens_np, max_seq_len=max_seq_len, context_lens=context_lens, max_context_lens=max_context_lens, - query_start_loc = query_start_loc + query_start_loc=query_start_loc ) return attn_metadata diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index 3046a9f84ef013514fc001c16555b8680f46aba8..5b121f87edfdb980f0857d2249826d7e4dc42f37 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -54,9 +54,11 @@ def init_device(self): def compile_or_warm_up_model(self) -> None: + from vllm.model_executor import set_random_seed # MindSpore does not support cuda graph. No need to warm up the model. # Since prefill is done previously, we do decode here. default_max_num_reqs = 1 # For MindSpore, we only do one more decode here. 
     if get_pp_group().is_last_rank:
-        self.model_runner._dummy_sampler_run(self.model_runner._dummy_run(
-            num_tokens=default_max_num_reqs))
+        self.model_runner._dummy_run(
+            num_tokens=default_max_num_reqs)
+    set_random_seed(self.model_config.seed)
diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py
index 55bb26ec4ee65181cfc30425640149532c5b36bd..68a9360c1ff0da583282d3ef6ee819bbb8c9827a 100644
--- a/vllm_mindspore/worker/model_runner.py
+++ b/vllm_mindspore/worker/model_runner.py
@@ -179,4 +179,4 @@ def _get_supported_attention_backends(chunked_prefill_enabled: bool) \
     if chunked_prefill_enabled:
         return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS
     else:
-        return MULTI_STEP_ATTENTION_BACKENDS
\ No newline at end of file
+        return MULTI_STEP_ATTENTION_BACKENDS
diff --git a/vllm_mindspore/worker/model_runner_v1.py b/vllm_mindspore/worker/model_runner_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c54d361b3ce87113a8e76e7b5a016c6e433fc6d
--- /dev/null
+++ b/vllm_mindspore/worker/model_runner_v1.py
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2025 Huawei Technologies Co., Ltd
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+from typing import List, Optional, Tuple
+import numpy as np
+import weakref
+
+import torch
+import mindspore as ms
+from mindspore import Tensor
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.forward_context import set_forward_context
+from vllm.sequence import IntermediateTensors
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.attention.layer import Attention
+
+from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata
+
+from mindspore import mutable
+from vllm_mindspore.utils import get_valid_dtype
+# from vllm_mindspore.utils import is_use_mla
+
+from vllm.attention import AttentionType
+from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec
+from vllm.v1.utils import bind_kv_cache
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.logger import logger
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.utils import cdiv
+from vllm.logger import init_logger
+from vllm.v1.worker.gpu_input_batch import CachedRequestState
+from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
+from vllm.sampling_params import SamplingType
+import vllm_ascend.envs as envs_ascend
+
+
+logger = init_logger(__name__)
+
+
+def wrapper_runner_init(func):
+    def wrapper(*args, **kwargs):
+        func(*args, **kwargs)
+        self = args[0]
+        self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1,
+                                               dtype=torch.int32,
+                                               device="cpu",
+                                               pin_memory=True)
+        self.query_start_loc_np = self.query_start_loc_cpu.numpy()
+
+    return wrapper
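+# wrapper_runner_init is applied as a monkey patch (see patch_vllm_ascend.py),
+# so every NPUModelRunner gets the pinned query_start_loc host buffers right
+# after its own __init__ runs, for example:
+#
+#     from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+#     NPUModelRunner.__init__ = wrapper_runner_init(NPUModelRunner.__init__)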
+ + +@torch.inference_mode() +def _dummy_run( + self, + num_tokens: int = None, +) -> torch.Tensor: + if num_tokens is None: + num_tokens = self.max_num_tokens + model = self.model + if self.is_multimodal_model: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_tokens] + else: + input_ids = self.input_ids[:num_tokens] + inputs_embeds = None + + if self.uses_mrope: + positions = self.mrope_positions[:, :num_tokens] + else: + positions_np = self.positions_np[:num_tokens] + positions = torch.from_numpy(positions_np) + + if get_pp_group().is_first_rank: + intermediate_tensors = None + else: + if self.intermediate_tensors is None: + self.intermediate_tensors = ( + self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device)) + intermediate_tensors = IntermediateTensors({ + k: v[:num_tokens] + for k, v in self.intermediate_tensors.items() + }) + + with set_forward_context(None, self.vllm_config): + hidden_states = model(input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states + + +def _process_reqs( + self, + scheduler_output: "SchedulerOutput", + intermediate_tensors: Optional[IntermediateTensors] = None, +) -> torch.Tensor: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + num_input_tokens = total_num_scheduled_tokens + + modified_batch = self.attn_metadata_builder.reorder_batch( + self.input_batch, scheduler_output) + if modified_batch: + self.input_batch.refresh_sampling_metadata() + + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table.commit(num_reqs) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. 
+ num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32) + num_valid_tokens = np.empty(num_reqs, dtype=np.int32) + max_num_scheduled_tokens = 0 + for i, req_id in enumerate(self.input_batch.req_ids): + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens[i] = num_tokens + num_valid_tokens[i] = num_tokens - \ + len(scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + + # Hot-Swap lora model + if self.lora_config: + self.set_active_loras(self.input_batch, num_scheduled_tokens) + + # Prepare positions + req_indices = np.repeat(self.arange_np[:num_reqs], + num_scheduled_tokens) + cu_num_tokens = np.cumsum(num_scheduled_tokens) + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + sample_indices = cu_num_tokens - 1 + sample_indices = torch.tensor(sample_indices, device=self.device) + + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets + + positions_np = self.positions_np[:total_num_scheduled_tokens] + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + + self.positions[:total_num_scheduled_tokens] = torch.from_numpy(positions_np) + positions = self.positions[:total_num_scheduled_tokens] + # self.positions[:total_num_scheduled_tokens].copy_( + # self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + # positions = self.positions[:num_input_tokens] + + + # Get token indices. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) + + self.input_ids[:total_num_scheduled_tokens] = torch.from_numpy( + np.take(self.input_batch.token_ids_cpu.ravel(), + token_indices, + 0) + ) + + # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size) + + + block_numbers = self.input_batch.block_table[0].block_table_np.ravel()[block_table_indices] + block_offsets = positions_np % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.slot_mapping_np[:total_num_scheduled_tokens]) + + # # Prepare the attention metadata. 
+ self.query_start_loc_np[0] = 0 + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens + + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + + attn_metadata = self.attn_metadata_builder.build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=0, + ) + + input_ids = self.input_ids[:total_num_scheduled_tokens] + attn_metadata.num_input_tokens = total_num_scheduled_tokens + + if self.is_multimodal_model: + # Run the multimodal encoder if any. + self._execute_mm_encoder(scheduler_output) + mm_embeds = self._gather_mm_embeddings(scheduler_output) + else: + mm_embeds = [] + + if self.is_multimodal_model: + # NOTE(woosuk): To unify token ids and soft tokens (vision + # embeddings), we always use embeddings (rather than token ids) + # as input to the multimodal model, even when the input is text. + input_ids = self.input_ids[:num_input_tokens] + if mm_embeds: + inputs_embeds = self.model.get_input_embeddings( + input_ids, mm_embeds) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds) + inputs_embeds = self.inputs_embeds[:num_input_tokens] + input_ids = None + else: + # For text-only models, we use token ids as input. + # While it is possible to use embeddings as input just like the + # multimodal models, it is not desirable for performance since + # then the embedding layer is not included in the CUDA graph. + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None + if self.uses_mrope: + positions = self.mrope_positions[:, :num_input_tokens] + else: + positions = self.positions[:num_input_tokens] + + # Run forward pass + with set_forward_context(attn_metadata, self.vllm_config): + assert self.model is not None + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=None, + ) + + use_spec_decode = len( + scheduler_output.scheduled_spec_decode_tokens) > 0 + if not use_spec_decode: + # NOTE(woosuk): Due to chunked prefills, the batch may contain + # partial requests. While we should not sample any token + # from these partial requests, we do so for simplicity. + # We will ignore the sampled tokens from the partial requests. + # TODO: Support prompt logprobs. + spec_decode_metadata = None + else: + # Get the number of draft tokens for each request. + # Iterate over the dictionary rather than all requests since not all + # requests have draft tokens. + num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) + for req_id, draft_token_ids in ( + scheduler_output.scheduled_spec_decode_tokens.items()): + req_idx = self.input_batch.req_id_to_index[req_id] + num_draft_tokens[req_idx] = len(draft_token_ids) + + spec_decode_metadata = self._calc_spec_decode_metadata( + num_draft_tokens, cu_num_tokens) + sample_indices = spec_decode_metadata.logits_indices + + return (attn_metadata, hidden_states, spec_decode_metadata, positions, + total_num_scheduled_tokens, sample_indices) + + +def create_block(shape, dtype, name=None, device=None): + from mindspore import mint + blocks = mint.empty(shape, dtype=dtype, device=device) + return blocks + + +def initialize_kv_cache(self, kv_cache_config) -> None: + from vllm.v1.worker.gpu_input_batch import InputBatch + """ + Initialize KV cache based on `kv_cache_config`. 
+ Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + kv_caches: Dict[str, torch.Tensor] = {} + + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.model_config.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=True, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + ) + + kv_cache_sizes = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV cache tensor shared by multiple layers is not supported in " + "NPU.") + kv_cache_sizes[ + kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + + for kv_cache_group in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group.kv_cache_spec + for layer_name in kv_cache_group.layer_names: + tensor_size = kv_cache_sizes[layer_name] + assert tensor_size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_size // kv_cache_spec.page_size_bytes + # `num_blocks` is the number of blocks the model runner can use. + # `kv_cache_config.num_blocks` is the number of blocks that + # KVCacheManager may allocate. + # Since different GPUs may have different number of layers and + # different memory capacities, `num_blocks` can be different on + # different GPUs, and `kv_cache_config.num_blocks` is set to + # the min of all `num_blocks`. Verify it here. + assert num_blocks >= kv_cache_config.num_blocks + if isinstance(kv_cache_spec, FullAttentionSpec): + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size) + dtype = kv_cache_spec.dtype + dtype = get_valid_dtype(dtype) + current_cache = [] + device_type = "CPU" if self.device.type == "cpu" else "Ascend" + for i in range(kv_cache_shape[0]): + cache_blocks = create_block( + kv_cache_shape[1:], dtype, device=device_type + ) + current_cache.append(mutable(cache_blocks)) + kv_caches[layer_name] = mutable(tuple(current_cache)) + else: + raise NotImplementedError + + bind_kv_cache( + kv_caches, + self.vllm_config.compilation_config.static_forward_context, + self.kv_caches) diff --git a/vllm_mindspore/worker/worker_v1.py b/vllm_mindspore/worker/worker_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..db106838f73d6a03a3d5100e8b15ffd030a56ab1 --- /dev/null +++ b/vllm_mindspore/worker/worker_v1.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+import gc
+from typing import Dict, List
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.utils import bind_kv_cache
+from vllm.v1.kv_cache_interface import FullAttentionSpec
+
+from vllm_ascend.platform import NPUPlatform
+
+logger = init_logger(__name__)
+
+@torch.inference_mode()
+def determine_available_memory(self) -> int:
+    # kv_caches: Dict[str, torch.Tensor] = {}
+    # kv_cache_spec = self.model_runner.get_kv_cache_spec()
+    # for layer_name, layer_spec in kv_cache_spec.items():
+    #     if isinstance(layer_spec, FullAttentionSpec):
+    #         # Use an empty tensor instead of `None`` to force Dynamo to pass
+    #         # it by reference, rather by specializing on the value ``None``.
+    #         npu_k_cache = torch.zeros([0, 0, 0, 0],
+    #                                   dtype=layer_spec.dtype,
+    #                                   device=self.device)
+    #         npu_v_cache = torch.zeros([0, 0, 0, 0],
+    #                                   dtype=layer_spec.dtype,
+    #                                   device=self.device)
+    #         kv_caches[layer_name] = (npu_k_cache, npu_v_cache)
+    #     else:
+    #         raise NotImplementedError
+
+    # runner_kv_caches: List[torch.Tensor] = []
+    # bind_kv_cache(
+    #     kv_caches,
+    #     self.vllm_config.compilation_config.static_forward_context,
+    #     runner_kv_caches)
+
+    # Profile the memory usage of the model and get the maximum number of
+    # cache blocks that can be allocated with the remaining free memory.
+    NPUPlatform.empty_cache()
+
+    # Execute a forward pass with dummy inputs to profile the memory usage
+    # of the model.
+    self.model_runner.profile_run()
+
+    # Calculate the number of blocks that can be allocated with the
+    # profiled peak memory.
+    free_npu_memory, total_npu_memory = NPUPlatform.mem_get_info()
+    # NOTE(woosuk): Here we assume that the other processes using the same
+    # GPU did not change their memory usage during the profiling.
+    peak_memory = self.init_npu_memory - free_npu_memory
+    assert peak_memory > 0, (
+        "Error in memory profiling. "
+        f"Initial free memory {self.init_npu_memory}, current free memory"
+        f" {free_npu_memory}. This happens when the NPU memory was "
+        "not properly cleaned up before initializing the vLLM instance.")
+
+    gc.collect()
+    # TODO: this function will no longer be needed once the empty_cache call in
+    # Worker.determine_num_available_blocks() is unified.
+    NPUPlatform.empty_cache()
+    usable_memory_size = total_npu_memory * self.cache_config.gpu_memory_utilization - peak_memory
+    npu_kv_cache_bytes = max(usable_memory_size, 0)
+    logger.info(
+        f"Available memory: {usable_memory_size}, total memory: {total_npu_memory}"
+    )
+    return int(npu_kv_cache_bytes)
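+# Worked example of the budget above, with illustrative numbers only:
+# total_npu_memory = 64 GiB, gpu_memory_utilization = 0.9 and a profiled
+# peak_memory of 10 GiB give usable_memory_size = 64 * 0.9 - 10 = 47.6 GiB,
+# so roughly 47.6 GiB is reported as available for the KV cache.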