From c5349e6b14aaf489d7a49cbf25fa01158466f970 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Thu, 13 Mar 2025 20:18:58 +0800
Subject: [PATCH] =?UTF-8?q?opt:=E5=A2=9E=E5=8A=A0=E7=BB=91=E6=A0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../mindspore-deepseek/workspace/install.yml  |  8 +++
 .../workspace/roles/post/tasks/main.yml       |  6 ++
 .../roles/prepare/files/lib/even-iso.py       | 69 +++++++++++++++++++
 .../roles/prepare/files/lib/ray_start.sh      |  2 +-
 .../roles/prepare/files/lib/set_env.sh        |  5 +-
 .../roles/prepare/files/lib/start_ds.sh       | 14 +++-
 6 files changed, 99 insertions(+), 5 deletions(-)
 create mode 100644 plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
 create mode 100644 plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py

diff --git a/plugins/mindspore-deepseek/workspace/install.yml b/plugins/mindspore-deepseek/workspace/install.yml
index 37410b9..4bbcc9c 100644
--- a/plugins/mindspore-deepseek/workspace/install.yml
+++ b/plugins/mindspore-deepseek/workspace/install.yml
@@ -21,3 +21,11 @@
     - masters
   roles:
     - start/deepseek
+  any_errors_fatal: true
+
+- hosts:
+  - masters
+  - workers
+  roles:
+    - post
+  any_errors_fatal: true
diff --git a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
new file mode 100644
index 0000000..c0c12e6
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
@@ -0,0 +1,6 @@
+- name: Set dir_path as a fact
+  set_fact:
+    dir_path: "/home/mindspore-deepseek"
+
+- name: Binding core on host
+  shell: python3 {{ dir_path }}/lib/even-iso.py
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
new file mode 100644
index 0000000..4c23624
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
@@
-0,0 +1,69 @@
+import subprocess
+
+bound_cpus = {0:1,1:1}
+
+
+def execute_command(command):
+    print(command)
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    print(result)
+
+
+def isolate_tids(tids, cpu_range):
+    command = f"taskset -cp {cpu_range[0]}-{cpu_range[-1]} {tids[0]}"
+    execute_command(command)
+    i = 0
+    cpuSkippedCount = 0
+    while i < len(tids):
+        cpu = cpu_range[i + cpuSkippedCount]+24
+        if cpu in bound_cpus:
+            cpuSkippedCount += 1
+            continue
+        bound_cpus[cpu] = 1
+        command = f"taskset -cp {cpu} {tids[i]}"
+        execute_command(command)
+        i += 1
+
+
+def check_is_valid(info):
+    #return "release_thread" in info or "acl_thread" in info
+    #return "rayMsRayWorke" in info or "python3" in info or "Hccl_Notify" in info
+    return "batch_launch" in info or "frontend" in info or "backend" in info or "Actor" in info or "bprop" in info or "python3" in info or "MsRayWorke" in info
+
+def enforce_isolation():
+    npu_len = 8
+    command = "npu-smi info -t topo"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    cpu_lists = [val for i, val in enumerate(result) if i > npu_len + 1 and i % (npu_len + 2) == 9][:8]
+    cpu_lists = [range(int(cpu_st.split('-')[0]), int(cpu_st.split('-')[1])) for cpu_st in cpu_lists]
+
+    command = "npu-smi info"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout
+    arr = " ".join(result.split('\n')[-npu_len * 2 - 2:]).split('|')
+
+    pids = [pid.strip() for i, pid in enumerate(arr) if i % 5 == 2]
+    tids = []
+
+    for pid in pids:
+        command = f"ps -T -p {pid}"
+        results = subprocess.run(command.split(), capture_output=True, text=True).stdout
+        results = results.split('\n')
+        key_sort = lambda l1: "0" if len(l1) < 4 else l1.split()[3]
+        result = [info for info in results if check_is_valid(info)]
+        result = sorted(result, key=key_sort)
+        len_top_results = 9
+        top_results = result[-len_top_results-1:-1]
+        new_tids = [info.split()[1] for info
in top_results]
+        tids += [[pid] + new_tids]
+
+    print("cpu-sets: ", cpu_lists)
+    print("*" * 10)
+    print("pids on npu: ", pids)
+    print("*" * 10)
+    print("tids: ", tids)
+
+    for i in range(len(pids)):
+        isolate_tids(tids[i], cpu_lists[i])
+
+enforce_isolation()
+
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index e1dcaa7..826eb12 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -15,7 +15,7 @@ ray_start() {
         nohup ray start --address=$1:$RAY_PORT &
     else
         # 主节点
-        nohup ray start --head --port=$RAY_PORT &
+        nohup ray start --head --include-dashboard=False --port=$RAY_PORT &
         sleep 5
         for i in {1..10}; do
             ray status | grep '8.0 NPU'
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 7a40ea3..654f0e3 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -21,9 +21,10 @@ export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=3600
 '
 
-RAY_ENV="
+NET_ENV="
 export GLOO_SOCKET_IFNAME=$RAY_DEVICE
 export TP_SOCKET_IFNAME=$RAY_DEVICE
+export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
 if [ $NODE_NUM -eq 2 ]; then
@@ -40,7 +41,7 @@ if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then
 fi
 
 echo "$ENV_ARG" >> $ENV_FILE
-echo "$RAY_ENV" >> $ENV_FILE
+echo "$NET_ENV" >> $ENV_FILE
 echo "$YAML_ENV" >> $ENV_FILE
 
 source $ENV_FILE
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 569c68e..716c318 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++
b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -8,9 +8,17 @@ source $current_path/config.cfg
 source $ENV_FILE
 
 # 仅主节点运行
+if [ $NODE_NUM -eq 2 ]; then
+    NPU_NUM=16.0
+    PARALLEL=16
+elif [ $NODE_NUM -eq 4 ]; then
+    NPU_NUM=32.0
+    PARALLEL=32
+fi
+
 ray_status=0
 for i in {1..10}; do
-    ray status | grep '16.0 NPU'
+    ray status | grep "$NPU_NUM NPU"
     if [ $? -eq 0 ]; then
         echo "ray集群已全部拉起"
         ray_status=1
@@ -26,7 +34,7 @@ fi
 
 #拉起服务
 rm -rf ds.log
-nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=16 --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray > ds.log &
+nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray &> ds.log &
 
 #检测推理服务是否拉起
 llm_status=0
 for i in {1..7200}; do
@@ -43,3 +51,5 @@ if [ $llm_status -eq 0 ]; then
     echo "推理服务拉起超时,请手动确认"
     exit 1
 fi
+
+sleep 60
\ No newline at end of file
-- 
Gitee