From c5349e6b14aaf489d7a49cbf25fa01158466f970 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Thu, 13 Mar 2025 20:18:58 +0800
Subject: [PATCH] =?UTF-8?q?opt:=E5=A2=9E=E5=8A=A0=E7=BB=91=E6=A0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../mindspore-deepseek/workspace/install.yml  |  8 +++
 .../workspace/roles/post/tasks/main.yml       |  6 ++
 .../roles/prepare/files/lib/even-iso.py       | 69 +++++++++++++++++++
 .../roles/prepare/files/lib/ray_start.sh      |  2 +-
 .../roles/prepare/files/lib/set_env.sh        |  5 +-
 .../roles/prepare/files/lib/start_ds.sh       | 14 +++-
 6 files changed, 99 insertions(+), 5 deletions(-)
 create mode 100644 plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
 create mode 100644 plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py

diff --git a/plugins/mindspore-deepseek/workspace/install.yml b/plugins/mindspore-deepseek/workspace/install.yml
index 37410b9..4bbcc9c 100644
--- a/plugins/mindspore-deepseek/workspace/install.yml
+++ b/plugins/mindspore-deepseek/workspace/install.yml
@@ -21,3 +21,11 @@
     - masters
   roles:
     - start/deepseek
+  any_errors_fatal: true
+
+- hosts:
+  - masters
+  - workers
+  roles:
+    - post
+  any_errors_fatal: true
diff --git a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
new file mode 100644
index 0000000..c0c12e6
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
@@ -0,0 +1,6 @@
+- name: Set dir_path as a fact
+  set_fact:
+    dir_path: "/home/mindspore-deepseek"
+
+- name: Binding core on host
+  shell: python3 {{ dir_path }}/lib/even-iso.py
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
new file mode 100644
index 0000000..4c23624
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
@@
-0,0 +1,69 @@
+import subprocess
+
+bound_cpus = {0:1,1:1}
+
+
+def execute_command(command):
+    print(command)
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    print(result)
+
+
+def isolate_tids(tids, cpu_range):
+    command = f"taskset -cp {cpu_range[0]}-{cpu_range[-1]} {tids[0]}"
+    execute_command(command)
+    i = 0
+    cpuSkippedCount = 0
+    while i < len(tids):
+        cpu = cpu_range[i + cpuSkippedCount]+24
+        if cpu in bound_cpus:
+            cpuSkippedCount += 1
+            continue
+        bound_cpus[cpu] = 1
+        command = f"taskset -cp {cpu} {tids[i]}"
+        execute_command(command)
+        i += 1
+
+
+def check_is_valid(info):
+    #return "release_thread" in info or "acl_thread" in info
+    #return "rayMsRayWorke" in info or "python3" in info or "Hccl_Notify" in info
+    return "batch_launch" in info or "frontend" in info or "backend" in info or "Actor" in info or "bprop" in info or "python3" in info or "MsRayWorke" in info
+
+def enforce_isolation():
+    npu_len = 8
+    command = "npu-smi info -t topo"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    cpu_lists = [val for i, val in enumerate(result) if i > npu_len + 1 and i % (npu_len + 2) == 9][:8]
+    cpu_lists = [range(int(cpu_st.split('-')[0]), int(cpu_st.split('-')[1])) for cpu_st in cpu_lists]
+
+    command = "npu-smi info"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout
+    arr = " ".join(result.split('\n')[-npu_len * 2 - 2:]).split('|')
+
+    pids = [pid.strip() for i, pid in enumerate(arr) if i % 5 == 2]
+    tids = []
+
+    for pid in pids:
+        command = f"ps -T -p {pid}"
+        results = subprocess.run(command.split(), capture_output=True, text=True).stdout
+        results = results.split('\n')
+        key_sort = lambda l1: "0" if len(l1) < 4 else l1.split()[3]
+        result = [info for info in results if check_is_valid(info)]
+        result = sorted(result, key=key_sort)
+        len_top_results = 9
+        top_results = result[-len_top_results-1:-1]
+        new_tids = [info.split()[1] for info
in top_results]
+        tids += [[pid] + new_tids]
+
+    print("cpu-sets: ", cpu_lists)
+    print("*" * 10)
+    print("pids on npu: ", pids)
+    print("*" * 10)
+    print("tids: ", tids)
+
+    for i in range(len(pids)):
+        isolate_tids(tids[i], cpu_lists[i])
+
+enforce_isolation()
+
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index e1dcaa7..826eb12 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -15,7 +15,7 @@ ray_start() {
         nohup ray start --address=$1:$RAY_PORT &
     else
         # 主节点
-        nohup ray start --head --port=$RAY_PORT &
+        nohup ray start --head --include-dashboard=False --port=$RAY_PORT &
         sleep 5
         for i in {1..10}; do
             ray status | grep '8.0 NPU'
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 7a40ea3..654f0e3 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -21,9 +21,10 @@ export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=3600
 '
 
-RAY_ENV="
+NET_ENV="
 export GLOO_SOCKET_IFNAME=$RAY_DEVICE
 export TP_SOCKET_IFNAME=$RAY_DEVICE
+export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
 if [ $NODE_NUM -eq 2 ]; then
@@ -40,7 +41,7 @@ if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then
 fi
 
 echo "$ENV_ARG" >> $ENV_FILE
-echo "$RAY_ENV" >> $ENV_FILE
+echo "$NET_ENV" >> $ENV_FILE
 echo "$YAML_ENV" >> $ENV_FILE
 
 source $ENV_FILE
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 569c68e..716c318 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++
b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -8,9 +8,17 @@ source $current_path/config.cfg
 source $ENV_FILE
 
 # 仅主节点运行
+if [ $NODE_NUM -eq 2 ]; then
+    NPU_NUM=16.0
+    PARALLEL=16
+elif [ $NODE_NUM -eq 4 ]; then
+    NPU_NUM=32.0
+    PARALLEL=32
+fi
+
 ray_status=0
 for i in {1..10}; do
-    ray status | grep '16.0 NPU'
+    ray status | grep "$NPU_NUM NPU"
     if [ $? -eq 0 ]; then
         echo "ray集群已全部拉起"
         ray_status=1
@@ -26,7 +34,7 @@ fi
 
 #拉起服务
 rm -rf ds.log
-nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=16 --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray > ds.log &
+nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray &> ds.log &
 
 #检测推理服务是否拉起
 llm_status=0
 for i in {1..7200}; do
@@ -43,3 +51,5 @@ if [ $llm_status -eq 0 ]; then
     echo "推理服务拉起超时,请手动确认"
     exit 1
 fi
+
+sleep 60
\ No newline at end of file
-- 
Gitee