From 42db1fbd6f66f863e7eff020d0a1e5fce0d2a6d6 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Mon, 21 Jul 2025 09:45:46 +0800
Subject: [PATCH] Speed up deployment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 script/mindspore-deepseek/config.yaml         |  6 ++-
 .../roles/prepare/files/lib/check_ds.sh       |  2 +-
 .../roles/prepare/files/lib/net_check.sh      | 43 +++++++++----------
 .../roles/prepare/files/lib/ray_start.sh      | 24 +++++++++++
 .../roles/prepare/files/lib/set_env.sh        | 14 ++++--
 .../roles/prepare/files/lib/start_ds.sh       | 37 +++-------------
 .../workspace/roles/prepare/files/prepare.sh  |  9 ++--
 .../roles/prepare/templates/config.cfg.j2     |  4 +-
 .../roles/start/check-ds/tasks/main.yml       |  6 +--
 9 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/script/mindspore-deepseek/config.yaml b/script/mindspore-deepseek/config.yaml
index a8384e7..59c9c3e 100644
--- a/script/mindspore-deepseek/config.yaml
+++ b/script/mindspore-deepseek/config.yaml
@@ -18,8 +18,8 @@
 
   vars:
     # 容器镜像
-    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/deepseek_hyperinfer
-    image_tag: openeuler22.03-py3.11
+    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore
+    image_tag: 20250717
     # 推理服务所在容器的名称
     container_name: openeuler_ds
     # 模型路径
@@ -38,6 +38,8 @@
     ray_device: enp67s0f0np0
     # 模型权重类型
     model_type: safetensors
+    # 后端(如果启动qwen2.5_vl,请设置为Native)
+    backend_type: MindFormers
     # 跳过 ssh 校验(如需禁用此功能,请注释以下配置项)
     ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
     # SSH连接复用参数,ControlMaster=auto启用连接复用,ControlPersist=60s保持连接60秒,ConnectTimeout=30设置连接超时30秒
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
index b8fcaed..b567ebd 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
@@ -23,4 +23,4 @@ done
 if [ $llm_status -eq 0 ]; then
     echo "推理服务拉起超时,请手动确认"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
index cca835c..6780585 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
@@ -2,38 +2,37 @@
 
 # 设置TLS
 for i in {0..7}; do
-    hccn_tool -i $i -tls -s enable 0
+    hccn_tool -i $i -tls -s enable 0 &
 done
+wait
 
 # 检查TLS状态
-for i in {0..7}; do
-    hccn_tool -i $i -tls -g | grep switch
-done
+# for i in {0..7}; do
+#     hccn_tool -i $i -tls -g | grep switch
+# done
 
 # 检查链路状态
 for i in {0..7}; do
-    hccn_tool -i $i -link -g | grep -i 'link status: UP'
-    if [ $? -ne 0 ]; then
-        echo "节点npu设备 $i 检测link status不为UP"
-        exit 1
-    fi
-done
-
-# 检查网络健康状态
-for i in {0..7}; do
-    hccn_tool -i $i -net_health -g | grep -i 'Success'
-    if [ $? -ne 0 ]; then
-        echo "节点npu设备 $i 检测net_health不为Success"
-    fi
-done
-
-# 检查IP信息
-for i in {0..7}; do
-    hccn_tool -i $i -ip -g
+    {
+        output=$(hccn_tool -i $i -link -g 2>&1)
+        if grep -qi 'link status: UP' <<< "$output"; then
+            echo "link status: UP"
+        else
+            echo "节点npu设备 $i 检测link status不为UP" >&2
+            kill 0 # 终止整个脚本进程组
+        fi
+    } &
 done
+wait
 
 # 添加机器卡间互联检查
 check_inter_device_connection() {
+
+    # 检查IP信息
+    for i in {0..7}; do
+        hccn_tool -i $i -ip -g
+    done
+
     echo -e "${BLUE}请输入目标NPU卡的IP地址 (输入q退出检查):${NC}"
     while true; do
         read -p "IP地址: " target_ip
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index c4d243a..2a60293 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -6,7 +6,9 @@ current_path=$(
 ENV_FILE=/root/.bashrc
 source $current_path/config.cfg
 source $ENV_FILE
+
 ray_start() {
+    ps -ef | grep "python" | grep -v grep | awk '{print $2}' | xargs kill
     if [ $NODE_NUM -eq 1 ]; then
         echo "单机部署无需启动ray"
 
@@ -17,6 +19,28 @@ ray_start() {
     if [ "$1" ]; then
         # 从节点
         nohup ray start --address=$1:$RAY_PORT &
+        sleep 5
+        if [ $NODE_NUM -eq 2 ]; then
+            NPU_NUM=16.0
+        elif [ $NODE_NUM -eq 4 ]; then
+            NPU_NUM=32.0
+        fi
+
+        ray_status=0
+        for i in {1..10}; do
+            ray status | grep "$NPU_NUM NPU"
+            if [ $? -eq 0 ]; then
+                echo "ray集群已全部拉起"
+                ray_status=1
+                break
+            fi
+            sleep 3
+        done
+
+        if [ $ray_status -eq 0 ]; then
+            echo "ray集群超时"
+            exit 1
+        fi
     else
         # 主节点
         nohup ray start --head --include-dashboard=False --port=$RAY_PORT &
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 7efbacd..c6cc5d8 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -12,16 +12,17 @@ ENV_ARG='
 export ASCEND_CUSTOM_PATH=$ASCEND_HOME_PATH/../
 export MS_ENABLE_LCCL=off
 export HCCL_OP_EXPANSION_MODE=AIV
-export vLLM_MODEL_BACKEND=MindFormers
 export vLLM_MODEL_MEMORY_USE_GB=53
 export MS_DEV_RUNTIME_CONF="parallel_dispatch_kernel:True"
 export MS_ALLOC_CONF="enable_vmm:True"
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=7200
-export MS_COMPILER_CACHE_ENABLE=1
 export CPU_AFFINITY=0
 export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:4,kernel_group_num:16"
+export MS_INTERNAL_ENABLE_NZ_OPS="QuantBatchMatmul,MlaPreprocess,GroupedMatmulV4"
+export PYTHONPATH=/workspace/mindformers:$PYTHONPATH
+export MS_DISABLE_INTERNAL_KERNELS_LIST="AddRmsNorm,Add,MatMul,Cast"
 '
 
 NET_ENV="
@@ -40,7 +41,8 @@ if [ $NODE_NUM -eq 1 ]; then
         sed -e 's/activation_dtype/#activation_dtype/' -i $YAML_FILE
     fi
 elif [ $NODE_NUM -eq 2 ]; then
-    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    #YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    YAML_FILE='/workspace/mindformers/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8_ep4tp4.yaml'
 elif [ $NODE_NUM -eq 4 ]; then
     YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
 fi
@@ -63,4 +65,10 @@ echo "$YAML_ENV" >> $ENV_FILE
 if [ $NODE_NUM -ne 1 ]; then
     echo "$NET_ENV" >> $ENV_FILE
 fi
+
+# 设置模型后端
+if [ "$BACKEND_TYPE" = "MindFormers" ]; then
+    echo 'export vLLM_MODEL_BACKEND=MindFormers' >> $ENV_FILE
+fi
+
 source $ENV_FILE
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 7e4eba0..0f5ba03 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -19,39 +19,16 @@
 else
     RANK_START=0
 fi
-if [ $NODE_NUM -ne 1 ]; then
-    if [ $NODE_NUM -eq 2 ]; then
-        NPU_NUM=16.0
-        PARALLEL=16
-    elif [ $NODE_NUM -eq 4 ]; then
-        NPU_NUM=32.0
-        PARALLEL=32
-    fi
-
-    ray_status=0
-    for i in {1..10}; do
-        ray status | grep "$NPU_NUM NPU"
-        if [ $? -eq 0 ]; then
-            echo "ray集群已全部拉起"
-            ray_status=1
-            break
-        fi
-        sleep 3
-    done
-
-    if [ $ray_status -eq 0 ]; then
-        echo "ray集群超时"
-        exit 1
-    fi
+if [ $NODE_NUM -eq 2 ]; then
+    PARALLEL=16
+elif [ $NODE_NUM -eq 4 ]; then
+    PARALLEL=32
 fi
 
 #拉起服务
 rm -rf ds.log
 if [ $NODE_NUM -ne 1 ]; then
-    # nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=256 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
-    nohup vllm-mindspore serve --model="$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 $headless --data-parallel-start-rank $RANK_START --data-parallel-address $master_ip --data-parallel-rpc-port $DP_PORT --enable-expert-parallel &> ds.log &
+    nohup vllm-mindspore serve --model="$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --max-num-seqs=512 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.93 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 $headless --data-parallel-start-rank $RANK_START --data-parallel-address $master_ip --data-parallel-rpc-port $DP_PORT --enable-expert-parallel &> ds.log &
 else
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
-fi
-
-
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 &> ds.log &
+fi
\ No newline at end of file
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index 0ba0d2f..fc282b0 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -16,6 +16,11 @@ main() {
     systemctl stop firewalld
     systemctl stop iptables
 
+    #0. 启动&预热权重服务
+    #$current_path/lib/mfs_tools.sh init || true
+    #$current_path/lib/mfs_tools.sh load || true
+
+
     # 1. 启动Docker容器并复制文件
     $current_path/lib/start_docker.sh
     cp_into_container
@@ -27,17 +32,15 @@
 
     #进入容器执行
     # 3. 设置容器内环境变量
-    docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
+    docker exec $CONTAINER_NAME /workspace/lib/set_env.sh
 
     # 4. 进行绑核
-    echo 3 > /proc/sys/vm/drop_caches
     pip install psutil
     python3 $current_path/lib/fine-grained-bind-cann.py
    if [ $? -ne 0 ]; then
        echo "细粒度线程绑核失败,请确保驱动版本>=24.1.0"
        exit 1
    fi
-
 }
 
 # 执行主函数
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
index d7691e3..6b41b7e 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
+++ b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
@@ -18,4 +18,6 @@ DP_PORT={{ dp_port }}
 #[ray集群使用的网卡]
 RAY_DEVICE={{ ray_device }}
 #[模型权重类型]
-MODEL_TYPE={{ model_type }}
\ No newline at end of file
+MODEL_TYPE={{ model_type }}
+#[模型后端类型]
+BACKEND_TYPE={{ backend_type }}
\ No newline at end of file
diff --git a/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
index 3fd1a63..3f48647 100644
--- a/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
+++ b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
@@ -1,6 +1,2 @@
-- name: Set dir_path as a fact
-  set_fact:
-    dir_path: "/home/mindspore-deepseek"
-
 - name: Check DeepSeek status on master
-  shell: sh {{ dir_path }}/lib/check_ds.sh
+  shell: sh /home/mindspore-deepseek/lib/check_ds.sh
-- 
Gitee
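
Reviewer note, not part of the patch: a minimal post-deployment smoke test, sketched under
stated assumptions: the container name openeuler_ds and the service port come from
config.yaml/config.cfg (8000 below is only a placeholder for LLM_PORT), the commands run on
the master node, and ray and curl are available on PATH. Adjust to your environment.

    # Confirm the ray cluster sees every NPU ("16.0 NPU" for 2 nodes, "32.0 NPU" for
    # 4 nodes), mirroring the check this patch moves into ray_start.sh.
    docker exec openeuler_ds ray status | grep "NPU"

    # Probe the OpenAI-compatible endpoint started by start_ds.sh.
    curl -s http://127.0.0.1:8000/v1/models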