From 945d5140a7b274b2f155d6b2eb203590bee48f1d Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Fri, 11 Apr 2025 10:33:03 +0800
Subject: [PATCH] support int4 & adjust start_ds command

---
 ...50\347\275\262\346\214\207\345\215\227.md" | 12 ++---
 .../roles/prepare/files/lib/ray_start.sh      |  4 ++
 .../roles/prepare/files/lib/set_env.sh        | 26 ++++++----
 .../roles/prepare/files/lib/start_docker.sh   |  5 +-
 .../roles/prepare/files/lib/start_ds.sh       | 51 +++++++++++--------
 .../workspace/roles/prepare/files/prepare.sh  | 45 +++++-----------
 6 files changed, 71 insertions(+), 72 deletions(-)

diff --git "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md" "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
index e70be46..1726edc 100644
--- "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
+++ "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
@@ -99,14 +99,14 @@ sh mindspore-deepseek/workspace/roles/prepare/files/lib/ascend_prepare.sh
 **Step1: Download the oedeploy tool (download it to the control node)**
 
 ```shell
-# Download and extract the plugin package
-wget https://repo.oepkgs.net/openEuler/rpm/openEuler-24.03-LTS/contrib/oedp/plugins/mindspore-deepseek.tar.gz
-
-tar zxvf mindspore-deepseek.tar.gz
 # Download and install the oedp tool, for example:
 wget https://repo.oepkgs.net/openEuler/rpm/openEuler-24.03-LTS/contrib/oedp/aarch64/Packages/oedp-1.0.0-2.oe2503.aarch64.rpm
 yum localinstall oedp-1.0.0-2.oe2503.aarch64.rpm
 
+# Download the plugin package
+git clone https://gitee.com/openeuler/llm_solution.git
+
+cd llm_solution/script/mindspore-deepseek
 ```
 
 **Step2: Adjust the oedeploy configuration file**
@@ -353,11 +353,11 @@ npu-smi set -t reset -i $id -c $chip_id
 
 This step is performed on the host and must be executed on all nodes
 
-**Step1:** You can use the even-iso.py core-binding script for fine-grained core binding to improve performance
+**Step1:** You can use the core-binding script for fine-grained core binding to improve performance
 
 ```shell
 # Run on all nodes
-python ./lib/even-iso.py
+python ./lib/fine-grained-bind-cann.py
 ```
 
 
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index 826eb12..c4d243a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -8,6 +8,10 @@ source $current_path/config.cfg
 source $ENV_FILE
 ray_start() {
     ps -ef | grep "python" | grep -v grep | awk '{print $2}' | xargs kill
+    if [ $NODE_NUM -eq 1 ]; then
+        echo "Single-node deployment does not need to start ray"
+        return
+    fi
     ray stop
 
     if [ "$1" ]; then
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index e43b877..076198a 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -20,6 +20,7 @@ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=7200
 export MS_COMPILER_CACHE_ENABLE=1
+export CPU_AFFINITY=0
 '
 
 NET_ENV="
@@ -28,18 +29,24 @@ export TP_SOCKET_IFNAME=$RAY_DEVICE
 export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
-if [ $NODE_NUM -eq 2 ]; then
-    YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+if [ $NODE_NUM -eq 1 ]; then
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    cat $YAML_FILE | grep gptq-pergroup
+    if [ $? -ne 0 ]; then
+        sed -e 's/model_parallel:.*/model_parallel: 8/' -i $YAML_FILE
+        sed -e "s/quant_method:.*/quant_method: 'gptq-pergroup'/" -i $YAML_FILE
+        sed -e 's/weight_dtype/#weight_dtype/' -i $YAML_FILE
+        sed -e 's/activation_dtype/#activation_dtype/' -i $YAML_FILE
+    fi
+elif [ $NODE_NUM -eq 2 ]; then
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
 elif [ $NODE_NUM -eq 4 ]; then
-    YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
+    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
 fi
-
 
 # Modify the weight type
 sed -e 's/^load_ckpt_format.*/load_ckpt_format: "'$MODEL_TYPE'"/' -i $YAML_FILE
-if [ "$MODEL_TYPE" = "ckpt" ]; then
-    sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE
-fi
+sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE
 
 YAML_ENV="export MINDFORMERS_MODEL_CONFIG=$YAML_FILE"
 
@@ -51,7 +58,8 @@ if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then
 fi
 
 echo "$ENV_ARG" >> $ENV_FILE
-echo "$NET_ENV" >> $ENV_FILE
 echo "$YAML_ENV" >> $ENV_FILE
+if [ $NODE_NUM -ne 1 ]; then
+    echo "$NET_ENV" >> $ENV_FILE
+fi
 source $ENV_FILE
-
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
index 5e97f7f..cd1421c 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
@@ -8,6 +8,7 @@ current_path=$(
 source $current_path/config.cfg
 # Install docker
 yum install docker -y
+systemctl restart docker
 
 # Check whether the image has already been pulled
 docker images | grep $IMAGE_NAME | grep $IMAGE_TAG
@@ -25,7 +26,7 @@ if [ $IS_STOP_OTHER_CONTAINER -ne 0 ]; then
 fi
 
 # If a container with the same name exists, use it directly
-docker ps -a | grep $IMAGE_NAME:$IMAGE_TAG | grep $CONTAINER_NAME
+docker ps -a | grep $IMAGE_NAME:$IMAGE_TAG | grep -w $CONTAINER_NAME
 if [ $? -eq 0 ]; then
     echo "Container $CONTAINER_NAME already exists, using it directly"
     docker start $CONTAINER_NAME
@@ -33,7 +34,7 @@ if [ $? -eq 0 ]; then
 fi
 
 # If a container with the same name but a different image exists, report an error
-docker ps -a | grep $CONTAINER_NAME
+docker ps -a | grep -w $CONTAINER_NAME
 if [ $? -eq 0 ]; then
     echo "Container name $CONTAINER_NAME is already in use, please investigate"
     exit 1
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 0a637ae..49a7bd2 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -8,33 +8,40 @@ source $current_path/config.cfg
 source $ENV_FILE
 
 # Run on the master node only
-if [ $NODE_NUM -eq 2 ]; then
-    NPU_NUM=16.0
-    PARALLEL=16
-elif [ $NODE_NUM -eq 4 ]; then
-    NPU_NUM=32.0
-    PARALLEL=32
-fi
-
-ray_status=0
-for i in {1..10}; do
-    ray status | grep "$NPU_NUM NPU"
-    if [ $? -eq 0 ]; then
-        echo "The ray cluster is fully up"
-        ray_status=1
-        break
+if [ $NODE_NUM -ne 1 ]; then
+    if [ $NODE_NUM -eq 2 ]; then
+        NPU_NUM=16.0
+        PARALLEL=16
+    elif [ $NODE_NUM -eq 4 ]; then
+        NPU_NUM=32.0
+        PARALLEL=32
     fi
-    sleep 3
-done
 
-if [ $ray_status -eq 0 ]; then
-    echo "ray cluster timed out"
-    exit 1
+    ray_status=0
+    for i in {1..10}; do
+        ray status | grep "$NPU_NUM NPU"
+        if [ $? -eq 0 ]; then
+            echo "The ray cluster is fully up"
+            ray_status=1
+            break
+        fi
+        sleep 3
+    done
+
+    if [ $ray_status -eq 0 ]; then
+        echo "ray cluster timed out"
+        exit 1
+    fi
 fi
 
 # Start the service
 rm -rf ds.log
-nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log &
+if [ $NODE_NUM -ne 1 ]; then
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
+else
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
+fi
+
 # Check whether the inference service has started
 llm_status=0
 for i in {1..7200}; do
@@ -50,4 +57,4 @@ done
 if [ $llm_status -eq 0 ]; then
     echo "Inference service startup timed out, please check manually"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index c000112..0ba0d2f 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -16,49 +16,28 @@ main() {
     systemctl stop firewalld
     systemctl stop iptables
 
-    # Check whether the firewall is running; if it is, check whether the ports are in the firewall whitelist and add them if they are not
-    status=$(systemctl status firewalld | grep -E "Active" | awk -F":" '{print $2}' | awk -F" " '{print $1}')
-    if [[ "${status}" == "active" ]]; then
-        # Firewall check for the ray port
-        port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp)
-        if [[ "${port_ray}" == "no" ]]; then
-            port_ray=$(firewall-cmd --zone=public --add-port=$RAY_PORT/tcp --permanent)
-            firewall-cmd --reload
-        fi
-        port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp)
-        if [[ "${port_ray}" != "yes" ]]; then
-            echo -e "Failed to open port $RAY_PORT in the firewall"
-            exit 1
-        fi
-        port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp)
-        if [[ "${port_llm}" == "no" ]]; then
-            port_llm=$(firewall-cmd --zone=public --add-port=$LLM_PORT/tcp --permanent)
-            firewall-cmd --reload
-        fi
-        port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp)
-        if [[ "${port_llm}" != "yes" ]]; then
-            echo -e "Failed to open port $LLM_PORT in the firewall"
-            exit 1
-        fi
-    fi
-
-    # Check the number of node IPs to be deployed
-    if [ [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ] ]; then
-        echo "Currently only two- or four-node deployment is supported; the current count is $NODE_NUM"
-        exit 1
-    fi
-
     # 1. Start the Docker container and copy files into it
     $current_path/lib/start_docker.sh
     cp_into_container
 
     # 2. Run the network check
-    $current_path/lib/net_check.sh
+    if [ $NODE_NUM -ne 1 ]; then
+        $current_path/lib/net_check.sh
+    fi
 
     # Run inside the container
     # 3. Set environment variables inside the container
     docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
 
+    # 4. Perform core binding
+    echo 3 > /proc/sys/vm/drop_caches
+    pip install psutil
+    python3 $current_path/lib/fine-grained-bind-cann.py
+    if [ $? -ne 0 ]; then
+        echo "Fine-grained thread core binding failed; make sure the driver version is >= 24.1.0"
+        exit 1
+    fi
+
 }
 
 # Run the main function
-- 
Gitee
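
Once start_ds.sh reports that the inference service is up, the OpenAI-compatible endpoint started by the vllm_mindspore api_server can be spot-checked from the master node. The sketch below is not part of the patch: the port and model values are placeholders, assumed to match the LLM_PORT and MODEL_PATH settings in config.cfg, and the "model" field must equal the served model name (by default the path passed via --model).

```shell
# Assumed values; substitute the real LLM_PORT and MODEL_PATH from config.cfg
LLM_PORT=8000
MODEL_PATH=/path/to/DeepSeek-R1-W8A8

# List the models served by the OpenAI-compatible API
curl -s http://127.0.0.1:${LLM_PORT}/v1/models

# Send a minimal chat completion request to the started service
curl -s http://127.0.0.1:${LLM_PORT}/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"${MODEL_PATH}\", \"messages\": [{\"role\": \"user\", \"content\": \"hello\"}], \"max_tokens\": 32}"
```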