From 8d8c0d74cad30f7b466da22551334ec76ead3a29 Mon Sep 17 00:00:00 2001 From: lijiaming <1228575330@qq.com> Date: Fri, 14 Mar 2025 19:19:11 +0800 Subject: [PATCH] =?UTF-8?q?opt:=20=E6=94=AF=E6=8C=81=E4=B8=8D=E5=90=8C?= =?UTF-8?q?=E6=9D=83=E9=87=8D=E7=B1=BB=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/mindspore-deepseek/config.yaml | 2 + .../mindspore-deepseek/workspace/install.yml | 7 -- .../workspace/roles/post/tasks/main.yml | 6 -- .../roles/prepare/files/lib/even-iso.py | 69 ------------------- .../roles/prepare/files/lib/example_config | 4 +- .../roles/prepare/files/lib/set_env.sh | 20 ++++-- .../roles/prepare/files/lib/start_ds.sh | 6 +- .../workspace/roles/prepare/files/prepare.sh | 34 ++++++++- .../roles/prepare/templates/config.cfg.j2 | 4 +- 9 files changed, 57 insertions(+), 95 deletions(-) delete mode 100644 plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml delete mode 100644 plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py diff --git a/plugins/mindspore-deepseek/config.yaml b/plugins/mindspore-deepseek/config.yaml index 2f18c2a..35db0fb 100644 --- a/plugins/mindspore-deepseek/config.yaml +++ b/plugins/mindspore-deepseek/config.yaml @@ -34,5 +34,7 @@ llm_port: 8000 # ray集群使用的网卡 ray_device: enp67s0f0np0 + # 模型权重类型 + model_type: safetensors # 跳过 ssh 校验(如需禁用此功能,请注释以下配置项) ansible_ssh_common_args: '-o StrictHostKeyChecking=no' diff --git a/plugins/mindspore-deepseek/workspace/install.yml b/plugins/mindspore-deepseek/workspace/install.yml index 4bbcc9c..2d1fec9 100644 --- a/plugins/mindspore-deepseek/workspace/install.yml +++ b/plugins/mindspore-deepseek/workspace/install.yml @@ -22,10 +22,3 @@ roles: - start/deepseek any_errors_fatal: true - -- hosts: - - masters - - workers - roles: - - post - any_errors_fatal: true diff --git a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml 
deleted file mode 100644 index c0c12e6..0000000 --- a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml +++ /dev/null @@ -1,6 +0,0 @@ -- name: Set dir_path as a fact - set_fact: - dir_path: "/home/mindspore-deepseek" - -- name: Binding core on host - shell: python3 {{ dir_path }}/lib/even-iso.py diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py deleted file mode 100644 index 4c23624..0000000 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py +++ /dev/null @@ -1,69 +0,0 @@ -import subprocess - -bound_cpus = {0:1,1:1} - - -def execute_command(command): - print(command) - result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split() - print(result) - - -def isolate_tids(tids, cpu_range): - command = f"taskset -cp {cpu_range[0]}-{cpu_range[-1]} {tids[0]}" - execute_command(command) - i = 0 - cpuSkippedCount = 0 - while i < len(tids): - cpu = cpu_range[i + cpuSkippedCount]+24 - if cpu in bound_cpus: - cpuSkippedCount += 1 - continue - bound_cpus[cpu] = 1 - command = f"taskset -cp {cpu} {tids[i]}" - execute_command(command) - i += 1 - - -def check_is_valid(info): - #return "release_thread" in info or "acl_thread" in infoi - #return "rayMsRayWorke" in info or "python3" in info or "Hccl_Notify" in info - return "batch_launch" in info or "frontend" in info or "backend" in info or "Actor" in info or "bprop" in info or "python3" in info or "MsRayWorke" in info - -def enforce_isolation(): - npu_len = 8 - command = "npu-smi info -t topo" - result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split() - cpu_lists = [val for i, val in enumerate(result) if i > npu_len + 1 and i % (npu_len + 2) == 9][:8] - cpu_lists = [range(int(cpu_st.split('-')[0]), int(cpu_st.split('-')[1])) for cpu_st in cpu_lists] - - command = "npu-smi info" - result = subprocess.run(command.split(), 
capture_output=True, text=True).stdout - arr = " ".join(result.split('\n')[-npu_len * 2 - 2:]).split('|') - - pids = [pid.strip() for i, pid in enumerate(arr) if i % 5 == 2] - tids = [] - - for pid in pids: - command = f"ps -T -p {pid}" - results = subprocess.run(command.split(), capture_output=True, text=True).stdout - results = results.split('\n') - key_sort = lambda l1: "0" if len(l1) < 4 else l1.split()[3] - result = [info for info in results if check_is_valid(info)] - result = sorted(result, key=key_sort) - len_top_results = 9 - top_results = result[-len_top_results-1:-1] - new_tids = [info.split()[1] for info in top_results] - tids += [[pid] + new_tids] - - print("cpu-sets: ", cpu_lists) - print("*" * 10) - print("pids on npu: ", pids) - print("*" * 10) - print("tids: ", tids) - - for i in range(len(pids)): - isolate_tids(tids[i], cpu_lists[i]) - -enforce_isolation() - diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config index 7649d1d..d96c038 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config @@ -14,4 +14,6 @@ IS_STOP_OTHER_CONTAINER=0 #[推理服务端口] LLM_PORT=8000 #[ray集群使用的网卡] -RAY_DEVICE=enp67s0f0np0 \ No newline at end of file +RAY_DEVICE=enp67s0f0np0 +#[模型权重类型] +MODEL_TYPE=safetensors \ No newline at end of file diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh index 654f0e3..e43b877 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh @@ -13,12 +13,13 @@ export ASCEND_CUSTOM_PATH=$ASCEND_HOME_PATH/../ export MS_ENABLE_LCCL=off export HCCL_OP_EXPANSION_MODE=AIV export vLLM_MODEL_BACKEND=MindFormers -export 
vLLM_MODEL_MEMORY_USE_GB=50 +export vLLM_MODEL_MEMORY_USE_GB=53 export MS_DEV_RUNTIME_CONF="parallel_dispatch_kernel:True" -export MS_ALLOC_CONF="enable_vmm:False" +export MS_ALLOC_CONF="enable_vmm:True" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export ASCEND_TOTAL_MEMORY_GB=64 -export HCCL_CONNECT_TIMEOUT=3600 +export HCCL_CONNECT_TIMEOUT=7200 +export MS_COMPILER_CACHE_ENABLE=1 ' NET_ENV=" @@ -28,11 +29,20 @@ export HCCL_SOCKET_IFNAME=$RAY_DEVICE " if [ $NODE_NUM -eq 2 ]; then - YAML_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml' + YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml' elif [ $NODE_NUM -eq 4 ]; then - YAML_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml' + YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml' fi + +# 修改权重类型 +sed -e 's/^load_ckpt_format.*/load_ckpt_format: "'$MODEL_TYPE'"/' -i $YAML_FILE +if [ "$MODEL_TYPE" = "ckpt" ]; then + sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE +fi + +YAML_ENV="export MINDFORMERS_MODEL_CONFIG=$YAML_FILE" + ENV_FILE=/root/.bashrc if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh index 716c318..0a637ae 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh @@ -34,7 +34,7 @@ fi #拉起服务 rm -rf ds.log -nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code 
--tensor_parallel_size=$PARALLEL --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray &> ds.log & +nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log & #检测推理服务是否拉起 llm_status=0 for i in {1..7200}; do @@ -50,6 +50,4 @@ done if [ $llm_status -eq 0 ]; then echo "推理服务拉起超时,请手动确认" exit 1 -fi - -sleep 60 \ No newline at end of file +fi \ No newline at end of file diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh index 23cbcfa..c000112 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh @@ -1,5 +1,8 @@ #!/bin/bash -current_path=$(cd $(dirname $0/); pwd) +current_path=$( + cd "$(dirname "$0")" + pwd + ) source $current_path/lib/config.cfg cp_into_container() { @@ -12,6 +15,33 @@ main() { systemctl stop firewalld systemctl stop iptables + + # 检查防火墙是否启动,如果启动则检查端口是否在防火墙白名单中,如果不存在则添加到白名单中 + status=$(systemctl status firewalld | grep -E "Active" | awk -F":" '{print $2}' | awk -F" " '{print $1}') + if [[ "${status}" == "active" ]]; then + # ray 端口防火墙检查 + port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp) + if [[ "${port_ray}" == "no" ]]; then + port_ray=$(firewall-cmd --zone=public --add-port=$RAY_PORT/tcp --permanent) + firewall-cmd --reload + fi + port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp) + if [[ "${port_ray}" != "yes" ]]; then + echo -e "防火墙开启 $RAY_PORT端口失败" + exit 1 + fi + port_llm=$(firewall-cmd
--zone=public --add-port=$LLM_PORT/tcp --permanent) + firewall-cmd --reload + fi + port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp) + if [[ "${port_llm}" != "yes" ]]; then + echo -e "防火墙开启 $LLM_PORT端口失败" + exit 1 + fi + fi + # 检测需要部署的节点ip数量 if [ [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ] ]; then echo "当前仅支持两/四节点部署,当前数量是$NODE_NUM" @@ -28,7 +58,7 @@ main() { #进入容器执行 # 3. 设置容器内环境变量 docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh - + } # 执行主函数 diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 b/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 index 4c36ef2..de9cb18 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 @@ -14,4 +14,6 @@ IS_STOP_OTHER_CONTAINER={{ is_stop_other_container }} #[推理服务端口] LLM_PORT={{ llm_port }} #[ray集群使用的网卡] -RAY_DEVICE={{ ray_device }} \ No newline at end of file +RAY_DEVICE={{ ray_device }} +#[模型权重类型] +MODEL_TYPE={{ model_type }} \ No newline at end of file -- Gitee