From 96a9c5f3434f86279dd1b4c555df215198bc48c0 Mon Sep 17 00:00:00 2001
From: GuangJie1
Date: Thu, 19 Dec 2024 11:50:53 +0800
Subject: [PATCH] =?UTF-8?q?vllm=EF=BC=9Aadd=20v0.7.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

update dockerfile
---
 vllm/0.7.1/24.03-lts/Dockerfile |  53 ++++++++++++
 vllm/README.md                  | 147 ++++++++++++++++++++++++++++++++
 vllm/meta.yml                   |   5 +-
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 vllm/0.7.1/24.03-lts/Dockerfile
 create mode 100644 vllm/README.md

diff --git a/vllm/0.7.1/24.03-lts/Dockerfile b/vllm/0.7.1/24.03-lts/Dockerfile
new file mode 100644
index 0000000..ccf5b47
--- /dev/null
+++ b/vllm/0.7.1/24.03-lts/Dockerfile
@@ -0,0 +1,53 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
+
+FROM openeuler/openeuler:24.03-lts AS cpu-test-1
+
+ARG TARGETARCH
+ARG VERSION=0.7.1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/yum \
+    yum update -y \
+    && yum install -y curl ccache git wget vim numactl gcc g++ python3-devel python3-pip gperftools-libs numactl-libs numactl-devel \
+    && yum install -y ffmpeg libSM libXext mesa-libGL google-perftools \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install py-cpuinfo
+
+ENV LD_PRELOAD="/usr/lib64/libtcmalloc_minimal.so.4"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+WORKDIR /workspace
+
+RUN git clone -b v${VERSION} https://github.com/vllm-project/vllm.git
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip3 install --upgrade pip && \
+    pip install -r vllm/requirements-build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v -r requirements-cpu.txt
+
+ENV VLLM_CPU_DISABLE_AVX512="true"
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
diff --git a/vllm/README.md b/vllm/README.md
new file mode 100644
index 0000000..7f092bd
--- /dev/null
+++ b/vllm/README.md
@@ -0,0 +1,147 @@
+# Quick reference
+
+- The official vLLM docker image.
+
+- Maintained by: [openEuler CloudNative SIG](https://gitee.com/openeuler/cloudnative).
+
+- Where to get help: [openEuler CloudNative SIG](https://gitee.com/openeuler/cloudnative), [openEuler](https://gitee.com/openeuler/community).
+
+# vLLM | openEuler
+Current vLLM docker images are built on [openEuler](https://repo.openeuler.org/). This repository is free to use and exempted from per-user rate limits.
+
+vLLM is a fast and easy-to-use library for LLM inference and serving. Originally developed in the Sky Computing Lab at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
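+
+Beyond serving, vLLM can also be used as a plain Python library for offline batched inference. A minimal sketch, using the same example model referenced elsewhere in this README (any model supported by vLLM works):
+```python
+from vllm import LLM, SamplingParams
+
+# Load a model and define the sampling behaviour.
+llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
+sampling_params = SamplingParams(temperature=0, max_tokens=32)
+
+# Generate completions for a batch of prompts.
+outputs = llm.generate(["San Francisco is a"], sampling_params)
+for output in outputs:
+    print(output.outputs[0].text)
+```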
+
+Learn more on the [vLLM website](https://docs.vllm.ai/en/latest/).
+
+# Supported tags and respective Dockerfile links
+The tag of each `vllm` docker image consists of the `vllm` version and the version of the base image. The details are as follows:
+| Tag | Description | Architectures |
+|----------|-------------|------------------|
+|[0.6.6-oe2403lts](https://gitee.com/openeuler/openeuler-docker-images/blob/master/vllm/0.6.6/24.03-lts/Dockerfile)| vLLM 0.6.6 on openEuler 24.03-LTS | amd64 |
+|[0.7.1-oe2403lts](https://gitee.com/openeuler/openeuler-docker-images/blob/master/vllm/0.7.1/24.03-lts/Dockerfile)| vLLM 0.7.1 on openEuler 24.03-LTS | arm64 |
+
+# Usage
+In the following usage, select the corresponding `{Tag}` and `container startup options` based on your requirements.
+
+- **Pull the `openeuler/vllm` image from docker**
+  ```bash
+  docker pull openeuler/vllm:{Tag}
+  ```
+
+- **Download the large model (optional)**
+
+  If you do not want to download the model when the container is running, you can download the model to your local machine first.
+  ```bash
+  huggingface-cli download --resume-download Qwen/Qwen2.5-1.5B-Instruct --local-dir /tmp/Qwen/Qwen2.5-1.5B-Instruct
+  ```
+  Additionally, you can replace the model `Qwen/Qwen2.5-1.5B-Instruct` with this local path wherever a model is specified in the steps below.
+
+- **Start a vllm instance**
+  ```bash
+  docker run -it -d --name vllm -p 8000:80 -v $LLM_MODEL_PATH:$LLM_MODEL_PATH openeuler/vllm:{Tag} --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --api-key EMPTY
+  ```
+  After the instance `vllm` is started, access the vllm service through `http://localhost:8000`.
+
+- **Container startup options**
+
+  | Option | Description |
+  |--|--|
+  | `-p 8000:80` | Expose the vllm service on `localhost:8000`. |
+  | `-v $LLM_MODEL_PATH:$LLM_MODEL_PATH` | Mount the local model into the container (optional). |
+  | `--model $LLM_MODEL_ID` | Specify a model ID or the local model path. |
+  | `--host 0.0.0.0` | Specify the host address that the service listens on. |
+  | `--port 80` | Specify the port number that the service listens on. |
+  | `--api-key EMPTY` | Provide authentication for the API service to ensure that only authorized users can access it. |
+
+- **OpenAI-Compatible Server**
+
+  This server can be queried in the same format as the OpenAI API. For example, to list the models:
+  ```bash
+  curl http://localhost:8000/v1/models
+  ```
+  You can pass the argument `--api-key` to make the server check for the API key in the request header.
+
+- **OpenAI Completions API with vLLM**
+
+  Once your server is started, you can query the model with input prompts:
+  ```bash
+  curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "Qwen/Qwen2.5-1.5B-Instruct",
+      "prompt": "San Francisco is a",
+      "max_tokens": 7,
+      "temperature": 0
+    }'
+  ```
+  Since this server is compatible with the OpenAI API, you can use it as a drop-in replacement for any application that uses the OpenAI API.
+
+  For example, another way to query the server is via the openai Python package:
+  ```python
+  from openai import OpenAI
+  # Modify OpenAI's API key and API base to use vLLM's API server.
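+  # The api_key below must match the --api-key value used when starting the container
+  # (EMPTY in the example above); the client sends it as an "Authorization: Bearer" header.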
+  openai_api_key = "EMPTY"
+  openai_api_base = "http://localhost:8000/v1"
+  client = OpenAI(
+      api_key=openai_api_key,
+      base_url=openai_api_base,
+  )
+  completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
+                                         prompt="San Francisco is a")
+  print("Completion result:", completion)
+  ```
+  Additionally, if you have specified a local model path, you also need to replace `Qwen/Qwen2.5-1.5B-Instruct` with that local path.
+
+- **OpenAI Chat Completions API with vLLM**
+
+  vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+  You can use the create chat completion endpoint to interact with the model:
+  ```bash
+  curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "Qwen/Qwen2.5-1.5B-Instruct",
+      "messages": [
+        {"role": "user", "content": "Who won the world series in 2020?"}
+      ],
+      "chat_template": "system: You are a helpful AI assistant."
+    }'
+  ```
+  Alternatively, you can use the openai Python package:
+  ```python
+  from openai import OpenAI
+  # Set OpenAI's API key and API base to use vLLM's API server.
+  openai_api_key = "EMPTY"
+  openai_api_base = "http://localhost:8000/v1"
+
+  client = OpenAI(
+      api_key=openai_api_key,
+      base_url=openai_api_base,
+  )
+
+  chat_response = client.chat.completions.create(
+      model="Qwen/Qwen2.5-1.5B-Instruct",
+      messages=[
+          {"role": "user", "content": "Tell me a joke."},
+      ],
+      extra_body={"chat_template": "system: You are a helpful AI assistant."}
+  )
+  print("Chat response:", chat_response)
+  ```
+  Additionally, if you have specified a local model path, you also need to replace `Qwen/Qwen2.5-1.5B-Instruct` with that local path.
+
+- **View container running logs**
+
+  ```bash
+  docker logs -f vllm
+  ```
+
+- **To get an interactive shell**
+
+  ```bash
+  docker exec -it vllm /bin/bash
+  ```
+
+# Question and answering
+If you have any questions or want to use some special features, please submit an issue or a pull request on [openeuler-docker-images](https://gitee.com/openeuler/openeuler-docker-images).
\ No newline at end of file
diff --git a/vllm/meta.yml b/vllm/meta.yml
index f7f8162..c631446 100644
--- a/vllm/meta.yml
+++ b/vllm/meta.yml
@@ -1,3 +1,6 @@
 0.6.3-oe2403lts:
   path: vllm/0.6.3/24.03-lts/Dockerfile
-  arch: x86_64
\ No newline at end of file
+  arch: x86_64
+0.7.1-oe2403lts:
+  path: vllm/0.7.1/24.03-lts/Dockerfile
+  arch: aarch64
\ No newline at end of file
-- 
Gitee