From 96a9c5f3434f86279dd1b4c555df215198bc48c0 Mon Sep 17 00:00:00 2001
From: GuangJie1
Date: Thu, 19 Dec 2024 11:50:53 +0800
Subject: [PATCH] =?UTF-8?q?vllm=EF=BC=9Aadd=20v0.7.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

update dockerfile
---
 vllm/0.7.1/24.03-lts/Dockerfile |  53 ++++++++++++
 vllm/README.md                  | 147 ++++++++++++++++++++++++++++++++
 vllm/meta.yml                   |   5 +-
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 vllm/0.7.1/24.03-lts/Dockerfile
 create mode 100644 vllm/README.md

diff --git a/vllm/0.7.1/24.03-lts/Dockerfile b/vllm/0.7.1/24.03-lts/Dockerfile
new file mode 100644
index 0000000..ccf5b47
--- /dev/null
+++ b/vllm/0.7.1/24.03-lts/Dockerfile
@@ -0,0 +1,53 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
+
+FROM openeuler/openeuler:24.03-lts AS cpu-test-1
+
+ARG TARGETARCH
+ARG VERSION=0.7.1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/yum \
+    yum update -y \
+    && yum install -y curl ccache git wget vim numactl gcc g++ python3-devel python3-pip gperftools-libs numactl-libs numactl-devel \
+    && yum install -y ffmpeg libSM libXext mesa-libGL google-perftools \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install py-cpuinfo
+
+ENV LD_PRELOAD="/usr/lib64/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+RUN git clone -b v${VERSION} https://github.com/vllm-project/vllm.git
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip3 install --upgrade pip && \
+    pip install -r vllm/requirements-build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v -r requirements-cpu.txt
+
+ENV VLLM_CPU_DISABLE_AVX512="true"
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/vllm/README.md b/vllm/README.md
new file mode 100644
index 0000000..7f092bd
--- /dev/null
+++ b/vllm/README.md
@@ -0,0 +1,147 @@
+# Quick reference
+
+- The official vLLM docker image.
+
+- Maintained by: [openEuler CloudNative SIG](https://gitee.com/openeuler/cloudnative).
+
+- Where to get help: [openEuler CloudNative SIG](https://gitee.com/openeuler/cloudnative), [openEuler](https://gitee.com/openeuler/community).
+
+# vLLM | openEuler
+Current vLLM docker images are built on [openEuler](https://repo.openeuler.org/). This repository is free to use and exempted from per-user rate limits.
+
+vLLM is a fast and easy-to-use library for LLM inference and serving. Originally developed in the Sky Computing Lab at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
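+
+Beyond serving, vLLM can also be used as a plain Python library for offline batched inference. A minimal sketch, using the same example model referenced elsewhere in this README (any model supported by vLLM works):
+```python
+from vllm import LLM, SamplingParams
+
+# Load a model and define the sampling behaviour.
+llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
+sampling_params = SamplingParams(temperature=0, max_tokens=32)
+
+# Generate completions for a batch of prompts.
+outputs = llm.generate(["San Francisco is a"], sampling_params)
+for output in outputs:
+    print(output.outputs[0].text)
+```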
+
+Learn more on the [vLLM website](https://docs.vllm.ai/en/latest/).
+
+# Supported tags and respective Dockerfile links
+The tag of each `vllm` docker image consists of the `vllm` version and the version of the base image. The details are as follows:
+| Tag | Description | Architectures |
+|----------|-------------|------------------|
+|[0.6.6-oe2403lts](https://gitee.com/openeuler/openeuler-docker-images/blob/master/vllm/0.6.6/24.03-lts/Dockerfile)| vLLM 0.6.6 on openEuler 24.03-LTS | amd64 |
+|[0.7.1-oe2403lts](https://gitee.com/openeuler/openeuler-docker-images/blob/master/vllm/0.7.1/24.03-lts/Dockerfile)| vLLM 0.7.1 on openEuler 24.03-LTS | arm64 |
+
+# Usage
+In the following usage, select the corresponding `{Tag}` and `container startup options` based on your requirements.
+
+- **Pull the `openeuler/vllm` image from docker**
+  ```bash
+  docker pull openeuler/vllm:{Tag}
+  ```
+
+- **Download the large model (optional)**
+
+  If you do not want to download the model when the container is running, you can download the model to your local machine first.
+  ```bash
+  huggingface-cli download --resume-download Qwen/Qwen2.5-1.5B-Instruct --local-dir /tmp/Qwen/Qwen2.5-1.5B-Instruct
+  ```
+  Additionally, you can replace the model `Qwen/Qwen2.5-1.5B-Instruct` with this local path wherever a model is specified in the steps below.
+
+- **Start a vllm instance**
+  ```bash
+  docker run -it -d --name vllm -p 8000:80 -v $LLM_MODEL_PATH:$LLM_MODEL_PATH openeuler/vllm:{Tag} --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --api-key EMPTY
+  ```
+  After the instance `vllm` is started, access the vllm service through `http://localhost:8000`.
+
+- **Container startup options**
+
+  | Option | Description |
+  |--|--|
+  | `-p 8000:80` | Expose the vllm service on `localhost:8000`. |
+  | `-v $LLM_MODEL_PATH:$LLM_MODEL_PATH` | Mount the local model into the container (optional). |
+  | `--model $LLM_MODEL_ID` | Specify a model ID or the local model path. |
+  | `--host 0.0.0.0` | Specify the host address that the service listens on. |
+  | `--port 80` | Specify the port number that the service listens on. |
+  | `--api-key EMPTY` | Provide authentication for the API service to ensure that only authorized users can access it. |
+
+- **OpenAI-Compatible Server**
+
+  This server can be queried in the same format as the OpenAI API. For example, to list the models:
+  ```bash
+  curl http://localhost:8000/v1/models
+  ```
+  You can pass the argument `--api-key` to make the server check for the API key in the request header.
+
+- **OpenAI Completions API with vLLM**
+
+  Once your server is started, you can query the model with input prompts:
+  ```bash
+  curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "Qwen/Qwen2.5-1.5B-Instruct",
+      "prompt": "San Francisco is a",
+      "max_tokens": 7,
+      "temperature": 0
+    }'
+  ```
+  Since this server is compatible with the OpenAI API, you can use it as a drop-in replacement for any application that uses the OpenAI API.
+
+  For example, another way to query the server is via the openai Python package:
+  ```python
+  from openai import OpenAI
+  # Modify OpenAI's API key and API base to use vLLM's API server.
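+  # The api_key below must match the --api-key value used when starting the container
+  # (EMPTY in the example above); the client sends it as an "Authorization: Bearer" header.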
+  openai_api_key = "EMPTY"
+  openai_api_base = "http://localhost:8000/v1"
+  client = OpenAI(
+      api_key=openai_api_key,
+      base_url=openai_api_base,
+  )
+  completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
+                                         prompt="San Francisco is a")
+  print("Completion result:", completion)
+  ```
+  Additionally, if you have specified a local model path, you also need to replace `Qwen/Qwen2.5-1.5B-Instruct` with that local path.
+
+- **OpenAI Chat Completions API with vLLM**
+
+  vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+  You can use the create chat completion endpoint to interact with the model:
+  ```bash
+  curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "Qwen/Qwen2.5-1.5B-Instruct",
+      "messages": [
+        {"role": "user", "content": "Who won the world series in 2020?"}
+      ],
+      "chat_template": "system: You are a helpful AI assistant."
+    }'
+  ```
+  Alternatively, you can use the openai Python package:
+  ```python
+  from openai import OpenAI
+  # Set OpenAI's API key and API base to use vLLM's API server.
+  openai_api_key = "EMPTY"
+  openai_api_base = "http://localhost:8000/v1"
+
+  client = OpenAI(
+      api_key=openai_api_key,
+      base_url=openai_api_base,
+  )
+
+  chat_response = client.chat.completions.create(
+      model="Qwen/Qwen2.5-1.5B-Instruct",
+      messages=[
+          {"role": "user", "content": "Tell me a joke."},
+      ],
+      extra_body={"chat_template": "system: You are a helpful AI assistant."}
+  )
+  print("Chat response:", chat_response)
+  ```
+  Additionally, if you have specified a local model path, you also need to replace `Qwen/Qwen2.5-1.5B-Instruct` with that local path.
+
+- **View container running logs**
+
+  ```bash
+  docker logs -f vllm
+  ```
+
+- **To get an interactive shell**
+
+  ```bash
+  docker exec -it vllm /bin/bash
+  ```
+
+# Question and answering
+If you have any questions or want to use some special features, please submit an issue or a pull request on [openeuler-docker-images](https://gitee.com/openeuler/openeuler-docker-images).
\ No newline at end of file
diff --git a/vllm/meta.yml b/vllm/meta.yml
index f7f8162..c631446 100644
--- a/vllm/meta.yml
+++ b/vllm/meta.yml
@@ -1,3 +1,6 @@
 0.6.3-oe2403lts:
   path: vllm/0.6.3/24.03-lts/Dockerfile
-  arch: x86_64
\ No newline at end of file
+  arch: x86_64
+0.7.1-oe2403lts:
+  path: vllm/0.7.1/24.03-lts/Dockerfile
+  arch: aarch64
\ No newline at end of file
-- 
Gitee