diff --git a/plugins/mindspore-deepseek/workspace/install.yml b/plugins/mindspore-deepseek/workspace/install.yml
index 37410b984bb05f01837f725d6aadf77f073bc473..4bbcc9cbbd5af75b1e42727671dcbe6e57cb08d0 100644
--- a/plugins/mindspore-deepseek/workspace/install.yml
+++ b/plugins/mindspore-deepseek/workspace/install.yml
@@ -21,3 +21,11 @@
   - masters
   roles:
     - start/deepseek
+  any_errors_fatal: true
+
+- hosts:
+  - masters
+  - workers
+  roles:
+    - post
+  any_errors_fatal: true
diff --git a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c0c12e61ade7666a8e3a22da55f07963bb5e265d
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml
@@ -0,0 +1,6 @@
+- name: Set dir_path as a fact
+  set_fact:
+    dir_path: "/home/mindspore-deepseek"
+
+- name: Bind cores on the host
+  shell: python3 {{ dir_path }}/lib/even-iso.py
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c23624349489fa1de9cc7553562074c1556b209
--- /dev/null
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py
@@ -0,0 +1,77 @@
+import subprocess
+
+# CPUs that are already reserved; worker threads are never pinned to these
+bound_cpus = {0: 1, 1: 1}
+
+
+def execute_command(command):
+    print(command)
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    print(result)
+
+
+def isolate_tids(tids, cpu_range):
+    # Bind the whole process (tids[0] is the pid) to its NPU's CPU range first
+    command = f"taskset -cp {cpu_range[0]}-{cpu_range[-1]} {tids[0]}"
+    execute_command(command)
+    i = 0
+    cpuSkippedCount = 0
+    while i < len(tids):
+        # Pin each thread to a single CPU 24 above the range start, skipping bound CPUs
+        cpu = cpu_range[i + cpuSkippedCount] + 24
+        if cpu in bound_cpus:
+            cpuSkippedCount += 1
+            continue
+        bound_cpus[cpu] = 1
+        command = f"taskset -cp {cpu} {tids[i]}"
+        execute_command(command)
+        i += 1
+
+
+def check_is_valid(info):
+    # Only threads whose names indicate inference work are pinned
+    #return "release_thread" in info or "acl_thread" in info
+    #return "rayMsRayWorke" in info or "python3" in info or "Hccl_Notify" in info
+    return "batch_launch" in info or "frontend" in info or "backend" in info or "Actor" in info or "bprop" in info or "python3" in info or "MsRayWorke" in info
+
+
+def enforce_isolation():
+    npu_len = 8
+    # Parse each NPU's CPU affinity range from the topology table
+    command = "npu-smi info -t topo"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split()
+    cpu_lists = [val for i, val in enumerate(result) if i > npu_len + 1 and i % (npu_len + 2) == 9][:8]
+    cpu_lists = [range(int(cpu_st.split('-')[0]), int(cpu_st.split('-')[1])) for cpu_st in cpu_lists]
+
+    # Parse the PID occupying each NPU from the process table
+    command = "npu-smi info"
+    result = subprocess.run(command.split(), capture_output=True, text=True).stdout
+    arr = " ".join(result.split('\n')[-npu_len * 2 - 2:]).split('|')
+
+    pids = [pid.strip() for i, pid in enumerate(arr) if i % 5 == 2]
+    tids = []
+
+    for pid in pids:
+        # Collect each process's busiest matching threads, sorted by CPU time
+        command = f"ps -T -p {pid}"
+        results = subprocess.run(command.split(), capture_output=True, text=True).stdout
+        results = results.split('\n')
+        key_sort = lambda l1: "0" if len(l1) < 4 else l1.split()[3]
+        result = [info for info in results if check_is_valid(info)]
+        result = sorted(result, key=key_sort)
+        len_top_results = 9
+        top_results = result[-len_top_results-1:-1]
+        new_tids = [info.split()[1] for info in top_results]
+        tids += [[pid] + new_tids]
+
+    print("cpu-sets: ", cpu_lists)
+    print("*" * 10)
+    print("pids on npu: ", pids)
+    print("*" * 10)
+    print("tids: ", tids)
+
+    for i in range(len(pids)):
+        isolate_tids(tids[i], cpu_lists[i])
+
+
+enforce_isolation()
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index e1dcaa78cdba6845a11ed553227c9bc4e0ea657c..826eb121d616e3983fa321b93e5b819c9a834ebe 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -15,7 +15,7 @@ ray_start() {
         nohup ray start --address=$1:$RAY_PORT &
     else
        # master node
-        nohup ray start --head --port=$RAY_PORT &
+        nohup ray start --head --include-dashboard=False --port=$RAY_PORT &
         sleep 5
         for i in {1..10}; do
             ray status | grep '8.0 NPU'
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 7a40ea3b5b3479919fe7ad282e9e8a548937a147..654f0e35dfd96f57fae0dcb2b386eb338780143e 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -21,9 +21,10 @@ export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=3600
 '
 
-RAY_ENV="
+NET_ENV="
 export GLOO_SOCKET_IFNAME=$RAY_DEVICE
 export TP_SOCKET_IFNAME=$RAY_DEVICE
+export HCCL_SOCKET_IFNAME=$RAY_DEVICE
 "
 
 if [ $NODE_NUM -eq 2 ]; then
@@ -40,7 +41,7 @@ if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then
 fi
 
 echo "$ENV_ARG" >> $ENV_FILE
-echo "$RAY_ENV" >> $ENV_FILE
+echo "$NET_ENV" >> $ENV_FILE
 echo "$YAML_ENV" >> $ENV_FILE
 
 source $ENV_FILE
diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 569c68e3d97d58f804dc7c987db40bf548c915d0..716c318dd1395de8d1dd38c9b14e04f85701c1ea 100644
--- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -8,9 +8,17 @@
 source $current_path/config.cfg
 source $ENV_FILE
 # Runs on the master node only
+if [ $NODE_NUM -eq 2 ]; then
+    NPU_NUM=16.0
+    PARALLEL=16
+elif [ $NODE_NUM -eq 4 ]; then
+    NPU_NUM=32.0
+    PARALLEL=32
+fi
+
 ray_status=0
 for i in {1..10}; do
-    ray status | grep '16.0 NPU'
+    ray status | grep "$NPU_NUM NPU"
     if [ $? -eq 0 ]; then
         echo "ray cluster is fully up"
         ray_status=1
@@ -26,7 +34,7 @@
 fi
 # Start the inference service
 rm -rf ds.log
-nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=16 --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray > ds.log &
+nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray &> ds.log &
 # Check whether the inference service is up
 llm_status=0
 for i in {1..7200}; do
@@ -43,3 +51,5 @@ if [ $llm_status -eq 0 ]; then
     echo "Inference service startup timed out, please check manually"
     exit 1
 fi
+
+sleep 60
\ No newline at end of file