diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py
new file mode 100644
index 0000000000000000000000000000000000000000..14ec2930b0df8fdfd7232995d31b1c8a933cdf0f
--- /dev/null
+++ b/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""test mf qwen chunked prefill."""
+import pytest
+import os
+from . import set_env
+env_manager = set_env.EnvVarManager()
+env_vars = {
+    "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml",
+    "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"),
+    "vLLM_MODEL_BACKEND": "MindFormers",
+    "MS_ENABLE_LCCL": "off",
+    "HCCL_OP_EXPANSION_MODE": "AIV",
+    "ASCEND_RT_VISIBLE_DEVICES": "0,1",
+    "LCCL_DETERMINISTIC": "1",
+    "HCCL_DETERMINISTIC": "true",
+    "ATB_MATMUL_SHUFFLE_K_ENABLE": "0",
+    "ATB_LLM_LCOC_ENABLE": "0"
+}
+env_manager.setup_ai_environment(env_vars)
+import vllm_mindspore
+from vllm import LLM, SamplingParams
+
+
+class TestMfQwen_chunked_prefill:
+    """
+    Test qwen7b enable chunked prefill
+    """
+    @pytest.mark.level0
+    @pytest.mark.platform_arm_ascend910b_training
+    @pytest.mark.env_single
+    def test_mf_qwen_7b_chunked_prefill(self):
+        """
+        test case qwen_7b_chunked_prefill
+        """
+        # First prompts.
+        prompts = "请详细介绍一下，什么是大语言模型？以及什么是高性能计算？两者如何结合？？？？" + '?' * 512
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1)
+        # Create an LLM.
+        llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct",
+                  max_model_len=8192, block_size=16, enable_chunked_prefill=True,
+                  max_num_batched_tokens=256, gpu_memory_utilization=0.9, tensor_parallel_size=2)
+        # Generate texts from the prompts. The output is a list of RequestOutput objects
+        # that contain the prompt, generated text, and other information.
+        outputs = llm.generate(prompts, sampling_params)
+        except_list=[' many times and each time I have found something new']
+        for i, output in enumerate(outputs):
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            assert generated_text == except_list[i]
+
+        env_manager.unset_all()