From 724a07e773c29b6941ca006ae2df737329716fb4 Mon Sep 17 00:00:00 2001 From: Jeaniowang <1104133197@qq.com> Date: Mon, 14 Jul 2025 21:44:32 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E6=B7=BB=E5=8A=A0tei=E9=95=9C=E5=83=8F?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAGSDK/opensource/README.md | 3 + RAGSDK/opensource/tei/Dockerfile | 135 ++++++ RAGSDK/opensource/tei/README.md | 65 +++ RAGSDK/opensource/tei/start.sh | 134 ++++++ RAGSDK/opensource/tei/tei.patch | 744 +++++++++++++++++++++++++++++++ 5 files changed, 1081 insertions(+) create mode 100644 RAGSDK/opensource/README.md create mode 100644 RAGSDK/opensource/tei/Dockerfile create mode 100644 RAGSDK/opensource/tei/README.md create mode 100644 RAGSDK/opensource/tei/start.sh create mode 100644 RAGSDK/opensource/tei/tei.patch diff --git a/RAGSDK/opensource/README.md b/RAGSDK/opensource/README.md new file mode 100644 index 000000000..6555f1e96 --- /dev/null +++ b/RAGSDK/opensource/README.md @@ -0,0 +1,3 @@ +## 功能说明 +此目录对如何适配开源框架进行说明。 + diff --git a/RAGSDK/opensource/tei/Dockerfile b/RAGSDK/opensource/tei/Dockerfile new file mode 100644 index 000000000..08388183a --- /dev/null +++ b/RAGSDK/opensource/tei/Dockerfile @@ -0,0 +1,135 @@ +FROM ubuntu:20.04 + +ARG ARCH=aarch64 +ARG TORCH_VERSION=2.1.0 +# 请根据在服务器上执行npu-smi info 命令进行查询,将查询到的"Name"字段最后一位数字删除后值修改PLATFORM字段 +ARG PLATFORM=310P +ARG CANN_VERSION=8.0.0 +ARG RAG_SDK_VERSION=7.1.T3 +ENV no_proxy=172.17.0.1,$no_proxy +# 设置时区禁用交互式配置 +ENV DEBIAN_FRONTEND=noninteractive +ENV GIT_SSL_NO_VERIFY=1 +# 配置环境变量 +ENV RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static +ENV RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup +ENV PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update && apt-get install -y software-properties-common && add-apt-repository -y ppa:deadsnakes/ppa && apt-get update +RUN apt-get update && apt-get install -y curl wget git git-lfs ca-certificates libssl-dev zlib1g-dev unzip tar gcc make cmake libffi-dev g++ libprotobuf-dev pkg-config build-essential libbz2-dev libreadline-dev libsqlite3-dev llvm xz-utils liblzma-dev libopenblas-dev vim + +#RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v29.2/protoc-29.2-linux-aarch_64.zip +COPY ./package/protoc-*-linux-aarch_64.zip . 
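+# protoc is taken from the local ./package directory (the commented-out wget above shows the
+# upstream download source); it is used later when the gRPC embed.proto definitions are
+# compiled during the TEI backend build (together with grpcio-tools/mypy-protobuf below).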
+RUN mv protoc-*-linux-aarch_64.zip /tmp && \ + unzip /tmp/protoc-*-linux-aarch_64.zip -d /tmp/protoc3 && \ + mv /tmp/protoc3/bin/protoc /usr/local/bin && \ + mv /tmp/protoc3/include/* /usr/local/include && \ + rm -rf /tmp/* + +ARG PYTHON_VERSION=python3.11 + +RUN apt-get update && apt-get install -y build-essential ${PYTHON_VERSION} ${PYTHON_VERSION}-dev ${PYTHON_VERSION}-distutils ${PYTHON_VERSION}-venv +RUN curl https://bootstrap.pypa.io/get-pip.py | ${PYTHON_VERSION} && update-alternatives --install /usr/bin/python3 python3 /usr/bin/${PYTHON_VERSION} 1 +RUN ln -s /usr/bin/python3 /usr/bin/python + +RUN pip3 install grpcio-tools mypy-protobuf + + +RUN groupadd HwHiAiUser && useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash + +WORKDIR /tmp + +# 安装torch +RUN pip3 install torch=="${TORCH_VERSION}" --index-url https://download.pytorch.org/whl/cpu +# 安装torch-npu +RUN pip3 install torch-npu=="${TORCH_VERSION}".post8 + + +#安装CANN 依赖 +RUN pip3 install --upgrade setuptools && pip3 install --ignore-installed numpy==1.26.4 decorator==5.1.1 sympy==1.12 cffi==1.16.0 pyyaml==6.0.1 pathlib2==2.3.7.post1 protobuf==5.26.0 scipy==1.12.0 requests==2.31.0 psutil==5.9.8 absl-py==2.1.0 attrs==23.2.0 + +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH + +#安装pip依赖 +RUN pip3 install poetry transformers==4.41.1 scikit-learn + + + +# 安装gcc10 +RUN apt-get remove -y gcc g++ +RUN apt update && apt install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 60 && update-alternatives --install /usr/bin/cc cc /usr/bin/gcc-10 60 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 60 && update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++-10 60 + + +#ENV RUSTUP_DIST_SERVER="" +#ENV RUSTUP_UPDATE_ROOT="" +ENV PIP_INDEX_URL="" + +USER HwHiAiUser:HwHiAiUser + +#安装RUST +RUN curl https://sh.rustup.rs -sSf | bash -s -- -y --no-update-default-toolchain +#RUN bash -c "source $HOME/.cargo/env && rustup toolchain install 1.83.0" + +#安装 TEI +WORKDIR /home/HwHiAiUser + +RUN git clone https://github.com/huggingface/text-embeddings-inference.git && cd text-embeddings-inference && git checkout v1.6.1 +COPY ./package/tei.patch /tmp +RUN cd /home/HwHiAiUser/text-embeddings-inference && patch -p1 < /tmp/tei.patch +#RUN sed -i 's/channel = .*/channel = "1.83.0"/g' /home/HwHiAiUser/text-embeddings-inference/rust-toolchain.toml +WORKDIR /home/HwHiAiUser/text-embeddings-inference +#ENV RUSTUP_DIST_SERVER="" +#ENV RUSTUP_UPDATE_ROOT="" +RUN bash -c "source $HOME/.cargo/env && cargo install --path router -F python -F http --no-default-features" + +# 以HwHiAiUser用户安装RAG SDK +RUN wget -q http://172.17.0.1:3000/Ascend-mindxsdk-mxrag_${RAG_SDK_VERSION}_linux-${ARCH}.run -P /tmp +RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install --install-path=/home/HwHiAiUser/Ascend --quiet --platform=310P --whitelist=operator + +USER root + +ENV PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple +RUN cd /home/HwHiAiUser/text-embeddings-inference/backends/python/server && make -j16 && make install + +#升级python依赖软件 +RUN pip3 install einops + +# 以root用户安装RAG SDK +COPY ./package/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run /tmp +RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install --install-path=/usr/local/Ascend --quiet --platform=310P --whitelist=operator + +# 安装cann-toolkit和kernel +RUN wget -q http://172.17.0.1:3000/Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run -P 
/tmp && \ + platform=$(echo $PLATFORM | tr '[A-Z]' '[a-z]') && \ + wget -q http://172.17.0.1:3000/Ascend-cann-kernels-${platform}_${CANN_VERSION}_linux-aarch64.run -P /tmp && \ + wget -q http://172.17.0.1:3000/Ascend-cann-nnal_${CANN_VERSION}_linux-aarch64.run -P /tmp + + +RUN bash /tmp/Ascend-cann-toolkit*_linux-${ARCH}.run --install --quiet +RUN bash /tmp/Ascend-cann-kernels*_linux-${ARCH}.run --install --quiet + +# 安装nnal +RUN bash -c "source /usr/local/Ascend/ascend-toolkit/set_env.sh && bash /tmp/Ascend-cann-nnal*_linux-${ARCH}.run --install --quiet" + +#拷贝执行脚本 +COPY --chown=HwHiAiUser:HwHiAiUser ./package/start.sh /home/HwHiAiUser + +RUN chmod 751 /home/HwHiAiUser/start.sh +# 添加环境变量 +ENV PATH=/usr/local/bin:$PATH + +#清理临时目录 +USER root +RUN rm -rf /tmp/* && rm -rf /home/HwHiAiUser/text-embeddings-inference/target + +# 设置容器默认启动用户为HwHiAiUser +USER HwHiAiUser:HwHiAiUser + +WORKDIR /home/HwHiAiUser + +ENTRYPOINT ["bash", "/home/HwHiAiUser/start.sh"] + +CMD ["BAAI/bge-large-zh-v1.5", "127.0.0.1", "8080"] + + diff --git a/RAGSDK/opensource/tei/README.md b/RAGSDK/opensource/tei/README.md new file mode 100644 index 000000000..df277d8e3 --- /dev/null +++ b/RAGSDK/opensource/tei/README.md @@ -0,0 +1,65 @@ +1 在Dockerfile同级目录创建package目录 + +2 将start.sh tei.patch存放到package目录中 + +3 在package目录存放好protoc软件包 +从网站上下载:https://github.com/protocolbuffers/protobuf/releases/tag/v29.3 +如下载29.3版本:protoc-29.3-linux-aarch_64.zip + +4 package目录存放Ascend-cann-kernels Ascend-cann-toolkit Ascend-cann-nnal, Ascend-mindxsdk-mxrag相关软件包,确保正确配套的卡和系统架构 + +5 构建环境上提前准备好ubuntu:20.04基础镜像 + +6 在Dockerfile同级目录下执行构建命令 +docker build -t 镜像tag --network host --build-arg ARCH=$(uname -m) --build-arg PLATFORM= -f Dockerfile . + +chip-type取值请根据在服务器上执行npu-smi info 命令进行查询,将查询到的"Name"字段最后一位数字删除后值修改PLATFORM字段 + +安装rust相关 依赖依赖网络,可能比较慢 + +7 在package目录下执行 nodejs server.js & +``` +准备server.js文件,和上述软件包放置于相同目录 +const http = require('http'); +const fs = require('fs'); +const path = require('path'); + +const port = 3000; +const directory = __dirname; + +const server = http.createServer((req, res) => { + const filePath = path.join(directory, req.url); + + if (req.url === '/files') { + // return all file names in current directory + fs.readdir(directory, (err, files) => { + if (err) { + res.writeHead(500, { + 'Content-Type': 'text/plain' + }); + res.end('Internal Server Error\n'); + return; + } + res.writeHead(200, { + 'Content-Type': 'application/json' + }); + res.end(JSON.stringify(files)); + }); + } else { + fs.stat(filePath, (err, stats) => { + if (err || !stats.isFile()) { + res.writeHead(404, { + 'Content-Type': 'text/plain' + }); + res.end('Not Found\n'); + return; + } + fs.createReadStream(filePath).pipe(res); + }); + } +}); + +server.listen(port, () => { + console.log(`Server is running at http://localhost:${port}`); +}); +``` \ No newline at end of file diff --git a/RAGSDK/opensource/tei/start.sh b/RAGSDK/opensource/tei/start.sh new file mode 100644 index 000000000..1be6b9953 --- /dev/null +++ b/RAGSDK/opensource/tei/start.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Copyright © Huawei Technologies Co., Ltd. 2024. All rights reserved. 
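+# Usage: start.sh <model_id> <listen_ip> <listen_port>
+# e.g.:  start.sh BAAI/bge-large-zh-v1.5 127.0.0.1 8080   (the image's default CMD)
+# The script downloads the model from ModelScope if it is not already cached under
+# /home/HwHiAiUser/model, selects an NPU with enough free memory, and then starts
+# text-embeddings-router on the given address and port.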
+
+if [[ "$#" -ne 3 ]]; then
+    echo "Need param: <model_id> <listen_ip> <listen_port>"
+    exit 1
+fi
+
+MODEL_ID=$1
+LISTEN_IP=$2
+LISTEN_PORT=$3
+
+MODEL_NAME=$(echo ${MODEL_ID} | cut -d'/' -f2)
+MODEL_DIR="/home/HwHiAiUser/model"
+MODEL_MEMORY_LIMIT=2048
+SUPPORT_MODELS=("BAAI/bge-large-zh-v1.5")
+
+sleep 3
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/nnal/atb/set_env.sh
+export LD_PRELOAD=$(ls /usr/local/lib/python3.11/dist-packages/scikit_learn.libs/libgomp-*):$LD_PRELOAD
+export PATH="/home/HwHiAiUser/.cargo/bin:$PATH"
+
+if [[ -n $(id | grep uid=0) ]]; then
+    source /usr/local/Ascend/mxRag/script/set_env.sh
+else
+    source /home/HwHiAiUser/Ascend/mxRag/script/set_env.sh
+fi
+
+function check_model_support() {
+    if [[ "${SUPPORT_MODELS[*]}" =~ ${MODEL_NAME} ]]; then
+        echo "Support model $MODEL_NAME."
+        return 0
+    else
+        echo "$MODEL_NAME is not supported."
+        return 1
+    fi
+}
+
+function check_model_exists() {
+    if [[ ! -e "${MODEL_DIR}/${MODEL_ID##*/}/config.json" ]]; then
+        echo "Model '${MODEL_DIR}/${MODEL_ID##*/}' does not exist."
+        return 1
+    else
+        echo "Model '${MODEL_DIR}/${MODEL_ID##*/}' exists."
+        return 0
+    fi
+}
+
+function download_model() {
+    local retry_time=1
+    local max_retries=5
+    echo "Downloading model '${MODEL_ID}' from modelscope ..."
+    while [[ ${retry_time} -le ${max_retries} ]]; do
+        if git clone "https://www.modelscope.cn/${MODEL_ID}" "${MODEL_DIR}/${MODEL_ID##*/}" && cd "${MODEL_DIR}/${MODEL_ID##*/}" && git lfs pull; then
+            echo "Download successful."
+            return 0
+        else
+            retry_time=$((retry_time + 1))
+            echo "Download failed, try again."
+            sleep 5
+        fi
+    done
+    echo "Maximum retries ${max_retries} reached. Download failed."
+    return 1
+}
+
+function start_tei_service() {
+    if [[ -z ${TEI_NPU_DEVICE} ]]; then
+        if ! select_device; then
+            echo "Available device not found"
+            exit 1
+        fi
+    fi
+    echo "Starting TEI service on ${LISTEN_IP}:${LISTEN_PORT}..."
+    text-embeddings-router \
+        --model-id "${MODEL_DIR}/${MODEL_ID##*/}" \
+        --port "${LISTEN_PORT}" \
+        --hostname "${LISTEN_IP}"
+}
+
+function select_device() {
+    echo "test npu-smi info"
+    npu-smi info
+    ret=$?
+    if [[ $ret -ne 0 ]]; then
+        echo "test npu-smi info failed"
+        return 1
+    fi
+
+    while IFS=' ' read -r npu_id chip_id chip_logic_id; do
+        if [[ $chip_logic_id =~ ^[0-9]+$ ]]; then
+            local memory_type="DDR"
+            chip_type=$(npu-smi info -t board -i "$npu_id" -c "$chip_id" | awk -F ":" '/Chip Name/ {print $2}' | sed 's/^[ \t]*//')
+            if [[ $chip_type =~ ^910 ]]; then
+                memory_type="HBM"
+            fi
+            local total_capacity
+            local usage_rate
+            local avail_capacity
+            total_capacity=$(npu-smi info -t usages -i "$npu_id" -c "$chip_id" | grep "$memory_type Capacity(MB)" | cut -d ":" -f 2 | sed 's/^[ \t]*//')
+            usage_rate=$(npu-smi info -t usages -i "$npu_id" -c "$chip_id" | grep "$memory_type Usage Rate(%)" | cut -d ":" -f 2 | sed 's/^[ \t]*//')
+            avail_capacity=$(awk "BEGIN {printf \"%d\", $total_capacity - $total_capacity * ($usage_rate / 100)}")
+            echo "NPU_ID: $npu_id, CHIP_ID: $chip_id, CHIP_LOGIC_ID: $chip_logic_id CHIP_TYPE: $chip_type, MEMORY_TYPE: $memory_type, CAPACITY: $total_capacity, USAGE_RATE: $usage_rate, AVAIL_CAPACITY: $avail_capacity"
+            if [[ $avail_capacity -gt $MODEL_MEMORY_LIMIT ]]; then
+                export TEI_NPU_DEVICE="$chip_logic_id"
+                echo "Using NPU $chip_logic_id to start TEI service"
+                return 0
+            fi
+        fi
+    done <<< "$(npu-smi info -m | awk 'NR>1 {print $1, $2, $3}')"
+    return 1
+}
+
+function main() {
+:<<!
+    if ! check_model_support; then
+        echo "Check model support '${MODEL_NAME}' failed"
+        exit 1
+    fi
+!
+    if ! check_model_exists; then
+        if !
download_model; then + echo "Download model ${MODEL_ID} failed" + exit 1 + fi + fi + + start_tei_service +} + +main diff --git a/RAGSDK/opensource/tei/tei.patch b/RAGSDK/opensource/tei/tei.patch new file mode 100644 index 000000000..a730f291f --- /dev/null +++ b/RAGSDK/opensource/tei/tei.patch @@ -0,0 +1,744 @@ +diff --git a/backends/grpc-client/src/client.rs b/backends/grpc-client/src/client.rs +index 2f4868f..98e9cca 100644 +--- a/backends/grpc-client/src/client.rs ++++ b/backends/grpc-client/src/client.rs +@@ -18,7 +18,7 @@ impl Client { + let channel = Channel::builder(uri).connect().await?; + + Ok(Self { +- stub: EmbeddingServiceClient::new(channel), ++ stub: EmbeddingServiceClient::new(channel).max_decoding_message_size(100*1024*1024).max_encoding_message_size(100*1024*1024), + }) + } + +@@ -32,7 +32,7 @@ impl Client { + .await?; + + Ok(Self { +- stub: EmbeddingServiceClient::new(channel), ++ stub: EmbeddingServiceClient::new(channel).max_decoding_message_size(100*1024*1024).max_encoding_message_size(100*1024*1024), + }) + } + +@@ -65,6 +65,27 @@ impl Client { + Ok(response.embeddings) + } + ++ #[instrument(skip_all)] ++ pub async fn embed_all( ++ &mut self, ++ input_ids: Vec, ++ token_type_ids: Vec, ++ position_ids: Vec, ++ cu_seq_lengths: Vec, ++ max_length: u32, ++ ) -> Result> { ++ let request = tonic::Request::new(EmbedRequest { ++ input_ids, ++ token_type_ids, ++ position_ids, ++ max_length, ++ cu_seq_lengths, ++ }) ++ .inject_context(); ++ let response = self.stub.embed_all(request).await?.into_inner(); ++ Ok(response.allembeddings) ++ } ++ + #[instrument(skip_all)] + pub async fn predict( + &mut self, +diff --git a/backends/proto/embed.proto b/backends/proto/embed.proto +index 036f3db..71d72e0 100644 +--- a/backends/proto/embed.proto ++++ b/backends/proto/embed.proto +@@ -5,6 +5,7 @@ package embedding.v1; + service EmbeddingService { + /// Decode token for a list of prefilled batches + rpc Embed (EmbedRequest) returns (EmbedResponse); ++ rpc Embed_all (EmbedRequest) returns (RawEmbedResponse); + /// Health check + rpc Health (HealthRequest) returns (HealthResponse); + /// Predict +@@ -38,3 +39,11 @@ message Score { + message PredictResponse { + repeated Score scores = 1; + } ++ ++message TokenEmbedding { ++ repeated Embedding embeddings = 1; ++} ++ ++message RawEmbedResponse { ++ repeated TokenEmbedding allembeddings = 1; ++} +\ No newline at end of file +diff --git a/backends/python/server/Makefile b/backends/python/server/Makefile +index 6402d63..8ad0028 100644 +--- a/backends/python/server/Makefile ++++ b/backends/python/server/Makefile +@@ -1,9 +1,3 @@ +-include Makefile-flash-att +-include Makefile-flash-att-v2 +- +-unit-tests: +- pytest -s -vv -m "not private" tests +- + gen-server: + # Compile protos + pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir +diff --git a/backends/python/server/pyproject.toml b/backends/python/server/pyproject.toml +index 0654eb7..46c3ca2 100644 +--- a/backends/python/server/pyproject.toml ++++ b/backends/python/server/pyproject.toml +@@ -29,9 +29,9 @@ grpcio-tools = "^1.51.1" + pytest = "^7.3.0" + + [[tool.poetry.source]] +-name = "pytorch-gpu-src" +-url = "https://download.pytorch.org/whl/cu118" +-priority = "explicit" ++name = "mirrors" ++url = "https://pypi.tuna.tsinghua.edu.cn/simple/" ++priority = "default" + + [tool.pytest.ini_options] + markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"] +diff --git a/backends/python/server/requirements.txt 
b/backends/python/server/requirements.txt +index 687ec10..79cee7a 100644 +--- a/backends/python/server/requirements.txt ++++ b/backends/python/server/requirements.txt +@@ -6,10 +6,10 @@ deprecated==1.2.15 ; python_version >= "3.9" and python_version < "3.13" + filelock==3.16.1 ; python_version >= "3.9" and python_version < "3.13" + fsspec==2024.10.0 ; python_version >= "3.9" and python_version < "3.13" + googleapis-common-protos==1.66.0 ; python_version >= "3.9" and python_version < "3.13" +-grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" +-grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" +-grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" +-grpcio==1.68.0 ; python_version >= "3.9" and python_version < "3.13" ++grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13" ++grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13" ++grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13" ++grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13" + huggingface-hub==0.26.2 ; python_version >= "3.9" and python_version < "3.13" + idna==3.10 ; python_version >= "3.9" and python_version < "3.13" + importlib-metadata==7.1.0 ; python_version >= "3.9" and python_version < "3.13" +@@ -19,7 +19,7 @@ loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13" + markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13" + mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13" + networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13" +-numpy==2.0.2 ; python_version >= "3.9" and python_version < "3.13" ++numpy==1.24.0 ; python_version >= "3.9" and python_version < "3.13" + nvidia-cublas-cu12==12.4.5.8 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" + nvidia-cuda-cupti-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" + nvidia-cuda-nvrtc-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "3.13" +@@ -48,17 +48,17 @@ protobuf==4.25.5 ; python_version >= "3.9" and python_version < "3.13" + pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" + regex==2024.11.6 ; python_version >= "3.9" and python_version < "3.13" + requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" +-safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" ++safetensors==0.4.1 ; python_version >= "3.9" and python_version < "3.13" + scikit-learn==1.5.2 ; python_version >= "3.9" and python_version < "3.13" + scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" +-sentence-transformers==3.3.1 ; python_version >= "3.9" and python_version < "3.13" ++sentence-transformers==3.4.1 ; python_version >= "3.9" and python_version < "3.13" + setuptools==75.6.0 ; python_version >= "3.9" and python_version < "3.13" + sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" + threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.13" + tokenizers==0.20.3 ; python_version >= "3.9" and python_version < "3.13" +-torch==2.5.1 ; python_version >= "3.9" and python_version < "3.13" ++torch==2.1.0 ; python_version >= "3.9" and python_version < "3.13" + tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13" +-transformers==4.46.3 ; 
python_version >= "3.9" and python_version < "3.13" ++transformers==4.51.3 ; python_version >= "3.9" and python_version < "3.13" + triton==3.1.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.13" and python_version >= "3.9" + typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" + typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" +diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py +index 9f56065..0e47676 100644 +--- a/backends/python/server/text_embeddings_server/models/__init__.py ++++ b/backends/python/server/text_embeddings_server/models/__init__.py +@@ -13,22 +13,16 @@ from text_embeddings_server.models.default_model import DefaultModel + from text_embeddings_server.models.classification_model import ClassificationModel + from text_embeddings_server.utils.device import get_device, use_ipex + ++from modeling_bert_adapter import enable_bert_speed ++from modeling_roberta_adapter import enable_roberta_speed ++from modeling_xlm_roberta_adapter import enable_xlm_roberta_speed ++ + __all__ = ["Model"] + + TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"] + # Disable gradients + torch.set_grad_enabled(False) + +-FLASH_ATTENTION = True +-try: +- from text_embeddings_server.models.flash_bert import FlashBert +-except ImportError as e: +- logger.warning(f"Could not import Flash Attention enabled models: {e}") +- FLASH_ATTENTION = False +- +-if FLASH_ATTENTION: +- __all__.append(FlashBert) +- + + def get_model(model_path: Path, dtype: Optional[str], pool: str): + if dtype == "float32": +@@ -40,11 +34,20 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str): + else: + raise RuntimeError(f"Unknown dtype {dtype}") + ++ ++ enable_boost = os.getenv("ENABLE_BOOST", "False") ++ if enable_boost not in("True", "False"): ++ raise ValueError("env ENABLE_BOOST value must be True or False") ++ ++ if enable_boost == "True": ++ dtype == torch.float16 ++ + device = get_device() + logger.info(f"backend device: {device}") + + config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE) +- if config.model_type == "bert": ++ if config.model_type == "bert" or config.model_type == "qwen2" or config.model_type == "qwen3" or \ ++ config.model_type == "roberta" or config.model_type == "xlm-roberta" : + config: BertConfig + if ( + use_ipex() +diff --git a/backends/python/server/text_embeddings_server/models/default_model.py b/backends/python/server/text_embeddings_server/models/default_model.py +index f5c569f..20c13b3 100644 +--- a/backends/python/server/text_embeddings_server/models/default_model.py ++++ b/backends/python/server/text_embeddings_server/models/default_model.py +@@ -1,17 +1,24 @@ ++import os + import inspect + import torch + + from pathlib import Path + from typing import Type, List +-from transformers import AutoModel ++from transformers import AutoModel, AutoConfig, AutoTokenizer + from opentelemetry import trace ++ ++from collections import defaultdict ++import numpy as np ++from loguru import logger ++ + from text_embeddings_server.models.pooling import DefaultPooling + + from text_embeddings_server.models import Model +-from text_embeddings_server.models.types import PaddedBatch, Embedding, Score ++from text_embeddings_server.models.types import PaddedBatch, Embedding, Score, TokenEmbedding + + tracer = trace.get_tracer(__name__) + ++IS_CAUSAL = 
os.getenv("IS_CAUSAL", "").lower() + + class DefaultModel(Model): + def __init__( +@@ -26,9 +33,11 @@ class DefaultModel(Model): + AutoModel.from_pretrained(model_path, trust_remote_code=trust_remote) + .to(dtype) + .to(device) ++ .eval() + ) + self.hidden_size = model.config.hidden_size +- self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool) ++ ++ self.pool = pool + + position_offset = 0 + model_type = model.config.model_type +@@ -50,6 +59,18 @@ class DefaultModel(Model): + is not None + ) + ++ if self.pool == "splade": ++ self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True) ++ self.config = AutoConfig.from_pretrained(model_path) ++ self.vocab_size = self.config.vocab_size ++ ++ self.sparse_linear = torch.nn.Linear(self.hidden_size, 1).to(device).to(dtype) ++ sparse_model_path = os.path.join(model_path, "sparse_linear.pt") ++ sparse_state_dict = torch.load(sparse_model_path, map_location="cpu", weights_only=True) ++ self.sparse_linear.load_state_dict(sparse_state_dict) ++ else: ++ self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool) ++ + super(DefaultModel, self).__init__(model=model, dtype=dtype, device=device) + + @property +@@ -63,19 +84,75 @@ class DefaultModel(Model): + kwargs["token_type_ids"] = batch.token_type_ids + if self.has_position_ids: + kwargs["position_ids"] = batch.position_ids +- output = self.model(**kwargs) + +- embedding = self.pooling.forward(output, batch.attention_mask) ++ if self.pool == "splade": ++ return self._process_splade(batch, kwargs) ++ else: ++ return self._process_default(batch, kwargs) + ++ def _process_splade(self, batch: PaddedBatch, kwargs: dict): ++ with torch.no_grad(): ++ last_hidden_state = self.model(**kwargs, return_dict=True).last_hidden_state ++ sparse_vecs = torch.relu(self.sparse_linear(last_hidden_state)) ++ token_weights = sparse_vecs.squeeze(-1) ++ unused_tokens = {self.tokenizer.cls_token_id, self.tokenizer.eos_token_id, self.tokenizer.pad_token_id, ++ self.tokenizer.unk_token_id} ++ all_token_weights = torch.zeros((len(batch), self.vocab_size), dtype=token_weights.dtype, device=token_weights.device) ++ ++ for i in range(len(batch)): ++ all_token_weights[i, batch.input_ids[i]] = token_weights[i] ++ ++ all_token_weights[:, list(unused_tokens)] = 0 ++ ++ embeddings = [ ++ Embedding( ++ values=all_token_weights[i].detach().cpu().numpy().tolist() ++ ) ++ for i in range(len(batch)) ++ ] ++ ++ return embeddings ++ ++ def _process_default(self, batch: PaddedBatch, kwargs: dict): ++ if IS_CAUSAL in ["false", "0"]: ++ with torch.no_grad(): ++ output = self.model(**kwargs, is_causal=False) ++ else: ++ with torch.no_grad(): ++ output = self.model(**kwargs) ++ ++ embedding = self.pooling.forward(output, batch.attention_mask) + cpu_results = embedding.view(-1).tolist() +- ++ + return [ + Embedding( + values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] + ) + for i in range(len(batch)) + ] ++ ++ @tracer.start_as_current_span("embed_all") ++ def embed_all(self, batch: PaddedBatch): ++ kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask} ++ if self.has_token_type_ids: ++ kwargs["token_type_ids"] = batch.token_type_ids ++ if self.has_position_ids: ++ kwargs["position_ids"] = batch.position_ids ++ output = self.model(**kwargs) ++ embedding = output[0].contiguous() ++ cpu_results = embedding.view(-1).tolist() ++ embedding_result=[] ++ for i in range(len(batch)): ++ embedding_tmp=[ ++ Embedding(values=cpu_results[(j+i * batch.max_length) * self.hidden_size 
: ++ (j + 1 + i * batch.max_length) * self.hidden_size]) ++ for j in range(batch.input_ids.size()[1]) ++ ] ++ token_embeddings=TokenEmbedding(embeddings=embedding_tmp) ++ embedding_result.append(token_embeddings) + ++ return embedding_result ++ + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + pass +diff --git a/backends/python/server/text_embeddings_server/models/pooling.py b/backends/python/server/text_embeddings_server/models/pooling.py +index 43f77b1..69c69f9 100644 +--- a/backends/python/server/text_embeddings_server/models/pooling.py ++++ b/backends/python/server/text_embeddings_server/models/pooling.py +@@ -25,7 +25,7 @@ class DefaultPooling(_Pooling): + def forward(self, model_output, attention_mask) -> Tensor: + pooling_features = { + "token_embeddings": model_output[0], +- "attention_mask": attention_mask, ++ "attention_mask": attention_mask.type(model_output[0].dtype), + } + return self.pooling.forward(pooling_features)["sentence_embedding"] + +diff --git a/backends/python/server/text_embeddings_server/models/types.py b/backends/python/server/text_embeddings_server/models/types.py +index 4f2cfa4..fbd8c2d 100644 +--- a/backends/python/server/text_embeddings_server/models/types.py ++++ b/backends/python/server/text_embeddings_server/models/types.py +@@ -7,7 +7,7 @@ from dataclasses import dataclass + from opentelemetry import trace + + from text_embeddings_server.pb import embed_pb2 +-from text_embeddings_server.pb.embed_pb2 import Embedding, Score ++from text_embeddings_server.pb.embed_pb2 import Embedding, Score, TokenEmbedding + + tracer = trace.get_tracer(__name__) + PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 128)) +@@ -34,6 +34,7 @@ class PaddedBatch(Batch): + token_type_ids: torch.Tensor + position_ids: torch.Tensor + attention_mask: torch.Tensor ++ max_length: int + + @classmethod + @tracer.start_as_current_span("from_pb") +@@ -78,6 +79,7 @@ class PaddedBatch(Batch): + token_type_ids=all_tensors[1], + position_ids=all_tensors[2], + attention_mask=all_tensors[3], ++ max_length=max_length, + ) + + def __len__(self): +diff --git a/backends/python/server/text_embeddings_server/server.py b/backends/python/server/text_embeddings_server/server.py +index 646d79b..da8f6a1 100644 +--- a/backends/python/server/text_embeddings_server/server.py ++++ b/backends/python/server/text_embeddings_server/server.py +@@ -1,5 +1,8 @@ + import asyncio + import torch ++import torch_npu ++import os ++ + from grpc import aio + from loguru import logger + +@@ -13,6 +16,9 @@ from text_embeddings_server.utils.tracing import UDSOpenTelemetryAioServerInterc + from text_embeddings_server.utils.interceptor import ExceptionInterceptor + + ++clean_npu_cache = os.getenv("CLEAN_NPU_CACHE", "False") ++ ++ + class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer): + def __init__(self, model: Model): + self.model = model +@@ -31,9 +37,22 @@ class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer): + ) + + embeddings = self.model.embed(batch) ++ if clean_npu_cache == "True": ++ torch_npu.npu.empty_cache() + + return embed_pb2.EmbedResponse(embeddings=embeddings) + ++ async def Embed_all(self, request, context): ++ max_input_length = self.model.max_input_length ++ batch = self.model.batch_type.from_pb(request, self.model.device, max_input_length) ++ ++ embeddings = self.model.embed_all(batch) ++ ++ if clean_npu_cache == "True": ++ torch_npu.npu.empty_cache() ++ ++ return 
embed_pb2.RawEmbedResponse(allembeddings=embeddings) ++ + async def Predict(self, request, context): + max_input_length = self.model.max_input_length + batch = self.model.batch_type.from_pb( +@@ -42,6 +61,9 @@ class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer): + + scores = self.model.predict(batch) + ++ if clean_npu_cache == "True": ++ torch_npu.npu.empty_cache() ++ + return embed_pb2.PredictResponse(scores=scores) + + +@@ -67,6 +89,10 @@ def serve( + interceptors=[ + ExceptionInterceptor(), + UDSOpenTelemetryAioServerInterceptor(), ++ ], ++ options = [ ++ ('grpc_max_send_message_length', 100 * 1024 * 1024), ++ ('grpc_max_recieve_message_length', 100 * 1024 * 1024), + ] + ) + embed_pb2_grpc.add_EmbeddingServiceServicer_to_server( +diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py +index 3f3b04d..2168cf6 100644 +--- a/backends/python/server/text_embeddings_server/utils/device.py ++++ b/backends/python/server/text_embeddings_server/utils/device.py +@@ -4,6 +4,7 @@ import importlib.metadata + import importlib.util + from packaging import version + import torch ++import torch_npu + import subprocess + + ALLOW_REDUCED_PRECISION = os.getenv( +@@ -54,11 +55,19 @@ def use_ipex() -> bool: + value = os.environ.get("USE_IPEX", "True").lower() + return value in ["true", "1"] and _is_ipex_available() + +- ++ + def get_device(): + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda") ++ elif torch.npu.is_available(): ++ device = torch.device("npu") ++ torch.npu.set_compile_mode(jit_compile=False) ++ option = {"NPU_FUZZY_COMPILE_BLACKLIST": "ReduceProd"} ++ torch.npu.set_option(option) ++ deviceIdx = os.environ.get('TEI_NPU_DEVICE') ++ if deviceIdx != None and deviceIdx.isdigit() and int(deviceIdx) >= 0 and int(deviceIdx) <= 7: ++ torch.npu.set_device(torch.device(f"npu:{deviceIdx}")) + elif is_hpu(): + import habana_frameworks.torch.core as htcore + +diff --git a/backends/python/src/lib.rs b/backends/python/src/lib.rs +index 53255b0..4d24016 100644 +--- a/backends/python/src/lib.rs ++++ b/backends/python/src/lib.rs +@@ -73,31 +73,53 @@ impl Backend for PythonBackend { + } + + fn embed(&self, batch: Batch) -> Result { +- if !batch.raw_indices.is_empty() { +- return Err(BackendError::Inference( +- "raw embeddings are not supported for the Python backend.".to_string(), +- )); +- } + let batch_size = batch.len(); + +- let results = self +- .tokio_runtime +- .block_on(self.backend_client.clone().embed( +- batch.input_ids, +- batch.token_type_ids, +- batch.position_ids, +- batch.cumulative_seq_lengths, +- batch.max_length, +- )) +- .map_err(|err| BackendError::Inference(err.to_string()))?; +- let pooled_embeddings: Vec> = results.into_iter().map(|r| r.values).collect(); +- + let mut embeddings = + HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default()); +- for (i, e) in pooled_embeddings.into_iter().enumerate() { +- embeddings.insert(i, Embedding::Pooled(e)); +- } + ++ if !batch.pooled_indices.is_empty() { ++ let results = self ++ .tokio_runtime ++ .block_on(self.backend_client.clone().embed( ++ batch.input_ids, ++ batch.token_type_ids, ++ batch.position_ids, ++ batch.cumulative_seq_lengths, ++ batch.max_length, ++ )) ++ .map_err(|err| BackendError::Inference(err.to_string()))?; ++ ++ let pooled_embeddings: Vec> = results.into_iter().map(|r| r.values).collect(); ++ for (i, e) in pooled_embeddings.into_iter().enumerate() { ++ embeddings.insert(i, 
Embedding::Pooled(e)); ++ } ++ } ++ else if !batch.raw_indices.is_empty() { ++ let results = self ++ .tokio_runtime ++ .block_on(self.backend_client.clone().embed_all( ++ batch.input_ids, ++ batch.token_type_ids, ++ batch.position_ids, ++ batch.cumulative_seq_lengths, ++ batch.max_length, ++ )) ++ .map_err(|err| BackendError::Inference(err.to_string()))?; ++ ++ let mut raw_embeddings = Vec::new(); ++ for token_embedding in results { ++ let mut two_dim_list = Vec::new(); ++ for embeddings in token_embedding.embeddings { ++ let values = embeddings.values.clone(); ++ two_dim_list.push(values); ++ } ++ raw_embeddings.push(two_dim_list); ++ } ++ for (i, e) in raw_embeddings.into_iter().enumerate() { ++ embeddings.insert(i, Embedding::All(e)); ++ } ++ } + Ok(embeddings) + } + +diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs +index 3b08e92..960148e 100644 +--- a/backends/src/dtype.rs ++++ b/backends/src/dtype.rs +@@ -59,7 +59,7 @@ impl Default for DType { + } + #[cfg(feature = "python")] + { +- DType::Bfloat16 ++ DType::Float16 + } + } + } +diff --git a/core/src/queue.rs b/core/src/queue.rs +index 3fd8b77..61b38dc 100644 +--- a/core/src/queue.rs ++++ b/core/src/queue.rs +@@ -153,6 +153,7 @@ fn queue_blocking_task( + }; + + if total_tokens > max_batch_tokens { ++ tracing::info!("split new batch because cur up to max_batch_tokens:{max_batch_tokens:?}"); + entries.push_front(entry); + break; + } +@@ -174,7 +175,8 @@ fn queue_blocking_task( + + entry_index += 1; + +- if Some(metadata.len()) == max_batch_requests { ++ if Some(metadata.len()) == Some(capacity) { ++ tracing::info!("split new batch because cur up to:{capacity:?}"); + break; + } + } +@@ -183,6 +185,7 @@ fn queue_blocking_task( + let next_batch = if metadata.is_empty() { + None + } else { ++ tracing::info!("inference batch size is:{batch_size:?}"); + Some(( + metadata, + Batch { +diff --git a/router/src/http/server.rs b/router/src/http/server.rs +index cadb6c1..1f0de1e 100644 +--- a/router/src/http/server.rs ++++ b/router/src/http/server.rs +@@ -1785,8 +1785,7 @@ pub async fn run( + routes = routes.layer(axum::middleware::from_fn(auth)); + } + +- let app = Router::new() +- .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", doc)) ++ let mut app = Router::new() + .merge(routes) + .merge(public_routes) + .layer(Extension(infer)) +@@ -1796,6 +1795,14 @@ pub async fn run( + .layer(DefaultBodyLimit::max(payload_limit)) + .layer(cors_layer); + ++ if let Ok(swagger_ui) = std::env::var("ENABLE_SWAGGER_UI") { ++ tracing::info!("try to set swagger ui"); ++ let _swagger_ui_on = String::from("true"); ++ match swagger_ui.to_lowercase() { ++ _swagger_ui_on => ++ app =app.merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", doc)) ++ } ++ } + // Run server + let listener = tokio::net::TcpListener::bind(&addr) + .await +diff --git a/router/src/lib.rs b/router/src/lib.rs +index 49e0581..044eb21 100644 +--- a/router/src/lib.rs ++++ b/router/src/lib.rs +@@ -63,6 +63,7 @@ pub async fn run( + api_key: Option, + otlp_endpoint: Option, + otlp_service_name: String, ++ prometheus_port: u16, + cors_allow_origin: Option>, + ) -> Result<()> { + let model_id_path = Path::new(&model_id); +@@ -250,8 +251,9 @@ pub async fn run( + + if !backend.padded_model { + tracing::info!("Warming up model"); ++ let max_batch_requests = Some(3); + backend +- .warmup(max_input_length, max_batch_tokens, max_batch_requests) ++ .warmup(4, 4, max_batch_requests) + .await + .context("Model backend is not healthy")?; + } +@@ -314,7 +316,7 @@ pub async fn run( + } + }; 
+ +- let prom_builder = prometheus::prometheus_builer(info.max_input_length)?; ++ let prom_builder = prometheus::prometheus_builer(addr, prometheus_port, info.max_input_length)?; + + #[cfg(all(feature = "grpc", feature = "http"))] + compile_error!("Features `http` and `grpc` cannot be enabled at the same time."); +@@ -363,7 +365,7 @@ fn get_backend_model_type( + continue; + } + +- if Some(text_embeddings_backend::Pool::Splade) == pooling && arch.ends_with("MaskedLM") { ++ if Some(text_embeddings_backend::Pool::Splade) == pooling && (arch.ends_with("MaskedLM") || arch.ends_with("RobertaModel")) { + return Ok(text_embeddings_backend::ModelType::Embedding( + text_embeddings_backend::Pool::Splade, + )); +diff --git a/router/src/main.rs b/router/src/main.rs +index e4a902d..7b152e4 100644 +--- a/router/src/main.rs ++++ b/router/src/main.rs +@@ -48,7 +48,7 @@ struct Args { + /// The maximum amount of concurrent requests for this particular deployment. + /// Having a low limit will refuse clients requests instead of having them + /// wait for too long and is usually good to handle backpressure correctly. +- #[clap(default_value = "512", long, env)] ++ #[clap(default_value = "64", long, env)] + max_concurrent_requests: usize, + + /// **IMPORTANT** This is one critical control to allow maximum usage +@@ -164,6 +164,10 @@ struct Args { + #[clap(default_value = "text-embeddings-inference.server", long, env)] + otlp_service_name: String, + ++ /// The Prometheus port to listen on. ++ #[clap(default_value = "9000", long, short, env)] ++ prometheus_port: u16, ++ + /// Unused for gRPC servers + #[clap(long, env)] + cors_allow_origin: Option>, +@@ -227,6 +231,7 @@ async fn main() -> Result<()> { + args.api_key, + args.otlp_endpoint, + args.otlp_service_name, ++ args.prometheus_port, + args.cors_allow_origin, + ) + .await?; +diff --git a/router/src/prometheus.rs b/router/src/prometheus.rs +index bded390..4c5fb38 100644 +--- a/router/src/prometheus.rs ++++ b/router/src/prometheus.rs +@@ -1,6 +1,13 @@ ++use std::net::SocketAddr; + use metrics_exporter_prometheus::{BuildError, Matcher, PrometheusBuilder}; + +-pub(crate) fn prometheus_builer(max_input_length: usize) -> Result { ++pub(crate) fn prometheus_builer( ++ addr: SocketAddr, ++ port: u16, ++ max_input_length: usize, ++) -> Result { ++ let mut addr = addr; ++ addr.set_port(port); + // Duration buckets + let duration_matcher = Matcher::Suffix(String::from("duration")); + let n_duration_buckets = 35; +@@ -30,6 +37,7 @@ pub(crate) fn prometheus_builer(max_input_length: usize) -> Result Date: Tue, 15 Jul 2025 08:51:48 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=B7=BB=E5=8A=A0tei=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAGSDK/opensource/tei/Dockerfile | 4 ++-- RAGSDK/opensource/tei/README.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/RAGSDK/opensource/tei/Dockerfile b/RAGSDK/opensource/tei/Dockerfile index 08388183a..1aa5b9916 100644 --- a/RAGSDK/opensource/tei/Dockerfile +++ b/RAGSDK/opensource/tei/Dockerfile @@ -85,7 +85,7 @@ RUN bash -c "source $HOME/.cargo/env && cargo install --path router -F python - # 以HwHiAiUser用户安装RAG SDK RUN wget -q http://172.17.0.1:3000/Ascend-mindxsdk-mxrag_${RAG_SDK_VERSION}_linux-${ARCH}.run -P /tmp -RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install --install-path=/home/HwHiAiUser/Ascend --quiet --platform=310P --whitelist=operator +RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install 
--install-path=/home/HwHiAiUser/Ascend --quiet --platform=${PLATFORM} --whitelist=operator USER root @@ -97,7 +97,7 @@ RUN pip3 install einops # 以root用户安装RAG SDK COPY ./package/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run /tmp -RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install --install-path=/usr/local/Ascend --quiet --platform=310P --whitelist=operator +RUN bash /tmp/Ascend-mindxsdk-mxrag_*_linux-${ARCH}.run --install --install-path=/usr/local/Ascend --quiet --platform=${PLATFORM} --whitelist=operator # 安装cann-toolkit和kernel RUN wget -q http://172.17.0.1:3000/Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run -P /tmp && \ diff --git a/RAGSDK/opensource/tei/README.md b/RAGSDK/opensource/tei/README.md index df277d8e3..9e6bc4c9e 100644 --- a/RAGSDK/opensource/tei/README.md +++ b/RAGSDK/opensource/tei/README.md @@ -6,18 +6,18 @@ 从网站上下载:https://github.com/protocolbuffers/protobuf/releases/tag/v29.3 如下载29.3版本:protoc-29.3-linux-aarch_64.zip -4 package目录存放Ascend-cann-kernels Ascend-cann-toolkit Ascend-cann-nnal, Ascend-mindxsdk-mxrag相关软件包,确保正确配套的卡和系统架构 +4 package目录存放Ascend-cann-kernels Ascend-cann-toolkit Ascend-cann-nnal, Ascend-mindxsdk-mxrag相关软件包,确保正确配套系统架构 5 构建环境上提前准备好ubuntu:20.04基础镜像 6 在Dockerfile同级目录下执行构建命令 -docker build -t 镜像tag --network host --build-arg ARCH=$(uname -m) --build-arg PLATFORM= -f Dockerfile . +docker build -t 镜像tag --network host --build-arg ARCH=$(uname -m) --build-arg CANN_VERSION= --build-arg RAG_SDK_VERSION= --build-arg PLATFORM= -f Dockerfile . chip-type取值请根据在服务器上执行npu-smi info 命令进行查询,将查询到的"Name"字段最后一位数字删除后值修改PLATFORM字段 安装rust相关 依赖依赖网络,可能比较慢 -7 在package目录下执行 nodejs server.js & +7 在package目录下执行 后台运行本地sftp服务器 nodejs server.js & ``` 准备server.js文件,和上述软件包放置于相同目录 const http = require('http'); -- Gitee From 7b5aec559511b9ca509b416289e2c7c00ab2edac Mon Sep 17 00:00:00 2001 From: Jeaniowang <1104133197@qq.com> Date: Tue, 15 Jul 2025 11:44:07 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E5=9B=BE=E6=96=87=E5=B9=B6=E8=8C=82?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAGSDK/PocValidation/dify/README.md | 4 ++-- RAGSDK/PocValidation/dify/dify_demo.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/RAGSDK/PocValidation/dify/README.md b/RAGSDK/PocValidation/dify/README.md index ecea536ac..0ca102691 100644 --- a/RAGSDK/PocValidation/dify/README.md +++ b/RAGSDK/PocValidation/dify/README.md @@ -11,7 +11,7 @@ streamlit 2 部署mis-tei emb,reranker服务([部署参考链接](https://www.hiascend.com/developer/ascendhub/detail/07a016975cc341f3a5ae131f2b52399d)) -3 如果需要解析docx、pdf文件中的图片进行图文并茂回答,需部署VLM模型服务([部署参考链接](https://www.hiascend.com/developer/ascendhub/detail/9eedc82e0c0644b2a2a9d0821ed5e7ad)), LLM服务([部署参考链接](https://www.hiascend.com/developer/ascendhub/detail/125b5fb4e7184b8dabc3ae4b18c6ff99)) +3 如果需要解析docx、pdf文件中的图片进行图文并茂回答,启动demo时请配置 --parse_image 使能图片解析功能,需部署VLM模型服务([部署参考链接](https://www.hiascend.com/developer/ascendhub/detail/9eedc82e0c0644b2a2a9d0821ed5e7ad)), LLM服务([部署参考链接](https://www.hiascend.com/developer/ascendhub/detail/125b5fb4e7184b8dabc3ae4b18c6ff99)),注意如果图片尺寸长或宽小于256,由于信息小,将被丢弃处理。 4 执行dify_demo.py运行服务,具体参数可执行 --help查看 ``` @@ -21,7 +21,7 @@ python3 dify_demo.py 6 支持在dify界面配置外接知识库,[部署参考参考链接](https://docs.dify.ai/zh-hans/guides/knowledge-base/connect-external-knowledge-base) -7 可调用/query接口问答测试,如需图片并茂内容,启动demo时请配置 --parse_image 使能图片解析功能,代码执行路径下存放了LLM回答文件response.md,可通过如下代码启动web网页可直观展示答复内容,复制如下代码在dify_demo.py同级目录下创建st.py +7 
可调用/query接口问答测试,代码执行路径下存放了LLM回答文件response.md,可通过如下代码启动web网页可直观展示答复内容,复制如下代码在dify_demo.py同级目录下创建st.py ``` import streamlit as st diff --git a/RAGSDK/PocValidation/dify/dify_demo.py b/RAGSDK/PocValidation/dify/dify_demo.py index 9e5313dd3..b2edaf3fc 100644 --- a/RAGSDK/PocValidation/dify/dify_demo.py +++ b/RAGSDK/PocValidation/dify/dify_demo.py @@ -426,8 +426,9 @@ def extract_images_info_by_vlm(image_out_dir, file_name): if res: info.append({"image_path": image_file, "image_description": res}) - with open(os.path.join(image_dir, "image_info.json"), "w", encoding='utf-8') as f: - f.write(json.dumps(info, indent=4, ensure_ascii=False)) + if len(info) > 0: + with open(os.path.join(image_dir, "image_info.json"), "w", encoding='utf-8') as f: + f.write(json.dumps(info, indent=4, ensure_ascii=False)) logger.info(f"extract images info successfully") @@ -615,6 +616,7 @@ async def deleteallfiles(): # 删除从文件解析出来的图片 try: shutil.rmtree(upload_file_dir) + shutil.rmtree(images_store_dir) except Exception as e: logger.info(f"-------- delete {upload_file_dir} failed: {e}") -- Gitee