From c7e005b9871898cf1133efe6d2cdcaaea214787b Mon Sep 17 00:00:00 2001
From: zhang_xu_hao1230
Date: Thu, 17 Apr 2025 14:33:54 +0800
Subject: [PATCH] Add chunked prefill ST
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_vllm_mf_qwen_7b_chunked_prefill.py   | 77 ++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py

diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py
new file mode 100644
index 0000000..14ec293
--- /dev/null
+++ b/tests/st/python/test_vllm_mf_qwen_7b_chunked_prefill.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""test mf qwen chunked prefill."""
+import os
+
+import pytest
+
+from . import set_env
+
+# The Ascend environment must be configured before vllm_mindspore is imported.
+env_manager = set_env.EnvVarManager()
+env_vars = {
+    "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml",
+    "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"),
+    "vLLM_MODEL_BACKEND": "MindFormers",
+    "MS_ENABLE_LCCL": "off",
+    "HCCL_OP_EXPANSION_MODE": "AIV",
+    "ASCEND_RT_VISIBLE_DEVICES": "0,1",
+    "LCCL_DETERMINISTIC": "1",
+    "HCCL_DETERMINISTIC": "true",
+    "ATB_MATMUL_SHUFFLE_K_ENABLE": "0",
+    "ATB_LLM_LCOC_ENABLE": "0"
+}
+env_manager.setup_ai_environment(env_vars)
+import vllm_mindspore  # noqa: E402  # must be imported after env setup
+from vllm import LLM, SamplingParams  # noqa: E402
+
+
+class TestMfQwenChunkedPrefill:
+    """
+    Test qwen 7B with chunked prefill enabled.
+    """
+    @pytest.mark.level0
+    @pytest.mark.platform_arm_ascend910b_training
+    @pytest.mark.env_single
+    def test_mf_qwen_7b_chunked_prefill(self):
+        """
+        test case qwen_7b_chunked_prefill
+        """
+        # Long prompt: a Chinese question ("Please explain in detail what a
+        # large language model is, what high-performance computing is, and
+        # how the two combine"), padded with 512 '?' so the prefill exceeds
+        # max_num_batched_tokens and must be split into chunks.
+        prompts = "请详细介绍一下,什么是大语言模型?以及什么是高性能计算?两者如何结合????" + '?' * 512
+        # Greedy decoding (temperature=0.0, top_k=1) keeps the output deterministic.
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1)
+        # Create an LLM with chunked prefill enabled.
+        llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct",
+                  max_model_len=8192, block_size=16, enable_chunked_prefill=True,
+                  max_num_batched_tokens=256, gpu_memory_utilization=0.9,
+                  tensor_parallel_size=2)
+        # Generate texts from the prompts. The output is a list of RequestOutput
+        # objects that contain the prompt, generated text, and other information.
+        outputs = llm.generate(prompts, sampling_params)
+        expect_list = [' many times and each time I have found something new']
+        for i, output in enumerate(outputs):
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            assert generated_text == expect_list[i]
+
+        env_manager.unset_all()
--
Gitee
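
For reviewers who want to exercise the same configuration outside the ST harness, here is a minimal offline-inference sketch (an illustration, not part of the patch: the model path is a placeholder, the prompt is arbitrary, and the Ascend environment setup that set_env performs above is assumed to already be in place):

    import vllm_mindspore  # noqa: F401  # hooks vLLM up to the MindSpore backend
    from vllm import LLM, SamplingParams

    # Placeholder path: point this at any local Qwen2.5-7B-Instruct checkpoint.
    MODEL_PATH = "/path/to/Qwen2.5-7B-Instruct"

    # With enable_chunked_prefill=True, a prefill longer than
    # max_num_batched_tokens (256 here) is split across scheduler steps
    # instead of being processed in a single batch, so decode requests
    # can be interleaved with the remaining prefill chunks.
    llm = LLM(model=MODEL_PATH,
              max_model_len=8192,
              block_size=16,
              enable_chunked_prefill=True,
              max_num_batched_tokens=256)

    # Any prompt longer than 256 tokens will exercise the chunked path.
    prompt = "What is chunked prefill? " * 64
    outputs = llm.generate(prompt, SamplingParams(temperature=0.0, max_tokens=10))
    print(outputs[0].outputs[0].text)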