diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py
index 2b711153a2c9db12690b254af6700404ac8d1631..7c5e98f79be383b6ba7f3512a190d73bb763bf57 100644
--- a/vllm_mindspore/__init__.py
+++ b/vllm_mindspore/__init__.py
@@ -389,12 +389,12 @@ from vllm_mindspore.v1.worker.gpu_model_runner import _reshape_kv_cache_tensors
 vllm.v1.worker.gpu_model_runner.GPUModelRunner._reshape_kv_cache_tensors = (
     _reshape_kv_cache_tensors)
 
-from vllm_mindspore.v1.worker.gpu_model_runner import (
-    wrapper_gpu_model_runner_execute_model, )
-from vllm.v1.worker.gpu_model_runner import GPUModelRunner
-
-vllm.v1.worker.gpu_model_runner.GPUModelRunner.execute_model = (
-    wrapper_gpu_model_runner_execute_model(GPUModelRunner.execute_model))
+# from vllm_mindspore.v1.worker.gpu_model_runner import (
+#     wrapper_gpu_model_runner_execute_model, )
+# from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+#
+# vllm.v1.worker.gpu_model_runner.GPUModelRunner.execute_model = (
+#     wrapper_gpu_model_runner_execute_model(GPUModelRunner.execute_model))
 
 from vllm_mindspore.v1.worker.gpu_model_runner import get_dp_padding
 
diff --git a/vllm_mindspore/model_executor/models/attention_mask.py b/vllm_mindspore/model_executor/models/attention_mask.py
index 1dc8a5ec66f87c828930daef13140afa8f5666c9..986ac95a102d6d3d96ac4ad52c981a592649129b 100644
--- a/vllm_mindspore/model_executor/models/attention_mask.py
+++ b/vllm_mindspore/model_executor/models/attention_mask.py
@@ -157,23 +157,23 @@ class MultiModalLowerTriangularMask(LowerTriangularMask):
                            seq_lens_np,
                            attn_metadata=None):
         max_query_len = query_lens_np.max()
-        max_seq_len = seq_lens_np.max()
+        # max_seq_len = seq_lens_np.max()
         if is_prefill:
             attention_mask = self.prefill_mask
         elif max_query_len > 1:
-            if max_seq_len <= self.cached_mask_len:
-                mm_position_ids_list = []
-                for i in range(len(seq_lens_np)):
-                    mm_position_ids_list.append(
-                        np.arange(seq_lens_np[i] - query_lens_np,
-                                  seq_lens_np[i]))
-                mm_position_ids = np.concatenate(mm_position_ids_list)
-                mm_position_ids = Tensor(mm_position_ids,
-                                         dtype=position_ids.dtype)
-                attention_mask = mint.index_select(self.decode_mask, 0,
-                                                   mm_position_ids)
-            else:
-                attention_mask = self.create_mask(query_lens_np, seq_lens_np)
+            # if max_seq_len <= self.cached_mask_len:
+            #     mm_position_ids_list = []
+            #     for i in range(len(seq_lens_np)):
+            #         mm_position_ids_list.append(
+            #             np.arange(seq_lens_np[i] - query_lens_np,
+            #                       seq_lens_np[i]))
+            #     mm_position_ids = np.concatenate(mm_position_ids_list)
+            #     mm_position_ids = Tensor(mm_position_ids,
+            #                              dtype=position_ids.dtype)
+            #     attention_mask = mint.index_select(self.decode_mask, 0,
+            #                                        mm_position_ids)
+            # else:
+            attention_mask = self.create_mask(query_lens_np, seq_lens_np)
         else:
             attention_mask = self.hard_mask
         return attention_mask
diff --git a/vllm_mindspore/model_executor/models/qwen2_5_vl.py b/vllm_mindspore/model_executor/models/qwen2_5_vl.py
index 9087339acbd3bb8f328975c2445aec4cd969ea87..243551809c1689e4e14c40d59b2272f7caa7d1e5 100644
--- a/vllm_mindspore/model_executor/models/qwen2_5_vl.py
+++ b/vllm_mindspore/model_executor/models/qwen2_5_vl.py
@@ -991,7 +991,7 @@ class Qwen2_5_VisionTransformer(nn.Cell):
         dyn_rotary_pos_emb = ms.Tensor(shape=[None, None],
                                        dtype=mstype.float32)
         dyn_window_index = ms.Tensor(shape=[None], dtype=mstype.int64)
-        dyn_cu_window_seqlens = ms.Tensor(shape=[None], dtype=mstype.int64)
+        dyn_cu_window_seqlens = ms.Tensor(shape=[None], dtype=mstype.int32)
         dyn_grid_thw = ms.Tensor(shape=[None, None], dtype=mstype.int64)
 
         self.set_inputs(
@@ -1017,8 +1017,6 @@ class Qwen2_5_VisionTransformer(nn.Cell):
         hidden_states = x.to(dtype=self.dtype)
         hidden_states = self.patch_embed(hidden_states)
 
-        cu_window_seqlens = cu_window_seqlens.astype(ms.int32)
-        cu_window_seqlens = mint.unique_consecutive(cu_window_seqlens)
         seq_len, _ = hidden_states.shape
         hidden_states = hidden_states.reshape(
             seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
@@ -1432,6 +1430,8 @@ class Qwen2_5_VLForConditionalGeneration(NativeModel, SupportsMultiModal):
             rotary_pos_emb = self.rot_pos_emb(grid_thw)
             # windows attention
             window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+            cu_window_seqlens = cu_window_seqlens.astype(ms.int32)
+            cu_window_seqlens = mint.unique_consecutive(cu_window_seqlens)
             image_embeds = self.visual(pixel_values, rotary_pos_emb,
                                        window_index, cu_window_seqlens,
                                        grid_thw)
@@ -1459,6 +1459,8 @@ class Qwen2_5_VLForConditionalGeneration(NativeModel, SupportsMultiModal):
             rotary_pos_emb = self.rot_pos_emb(grid_thw)
             # windows attention
             window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+            cu_window_seqlens = cu_window_seqlens.astype(ms.int32)
+            cu_window_seqlens = mint.unique_consecutive(cu_window_seqlens)
             video_embeds = self.visual(pixel_values_videos, rotary_pos_emb,
                                        window_index, cu_window_seqlens,
                                        grid_thw)
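
The qwen2_5_vl.py hunks hoist the `cu_window_seqlens` normalization out of the graph-compiled `Qwen2_5_VisionTransformer.construct` and into the eager callers, so the compiled cell receives a tensor that already matches the new int32 `dyn_cu_window_seqlens` dynamic input. A minimal standalone sketch of that preprocessing; the example values below are hypothetical, and only the `astype` and `mint.unique_consecutive` calls come from the patch:

import mindspore as ms
from mindspore import mint

# Hypothetical cumulative window boundaries as get_window_index might emit
# them; consecutive duplicates can appear when a window contributes no tokens.
cu_window_seqlens = ms.Tensor([0, 0, 4, 8, 8, 16], dtype=ms.int64)

# The same two calls the callers now perform before invoking self.visual(...):
cu_window_seqlens = cu_window_seqlens.astype(ms.int32)            # match the dyn input dtype
cu_window_seqlens = mint.unique_consecutive(cu_window_seqlens)    # drop repeated boundaries

print(cu_window_seqlens)  # [ 0  4  8 16], int32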