diff --git a/script/mindspore-deepseek/config.yaml b/script/mindspore-deepseek/config.yaml
index 35db0fbf0f7dbbc1a258b45fd3a0c0d851d27a6c..3d4ea9cd9754a3fac050607d60ae32c0039d1431 100644
--- a/script/mindspore-deepseek/config.yaml
+++ b/script/mindspore-deepseek/config.yaml
@@ -32,6 +32,8 @@
     is_stop_other_container: 0
     # 推理服务端口
     llm_port: 8000
+    # Data-parallel (DP) communication port, rendered into config.cfg as DP_PORT
+    dp_port: 12570
     # ray集群使用的网卡
     ray_device: enp67s0f0np0
     # 模型权重类型
diff --git a/script/mindspore-deepseek/workspace/install.yml b/script/mindspore-deepseek/workspace/install.yml
index 2d1fec9dd0c5dce2be21f7af9ea6b6b91acd7821..0fec3d8a7d630077d18150534041ade07f7f3f6c 100644
--- a/script/mindspore-deepseek/workspace/install.yml
+++ b/script/mindspore-deepseek/workspace/install.yml
@@ -22,3 +22,15 @@
   roles:
     - start/deepseek
   any_errors_fatal: true
+
+- hosts:
+    - workers
+  roles:
+    - start/deepseek-worker
+  any_errors_fatal: true
+
+- hosts:
+    - masters
+  roles:
+    - start/check-ds
+  any_errors_fatal: true
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b8fcaedcbb31ba6fc8afce87d70549c5a99dfc9a
--- /dev/null
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Wait until the inference service listens on LLM_PORT (intended for the master node only).
+current_path=$(
+    cd "$(dirname "$0")"
+    pwd
+)
+ENV_FILE=/root/.bashrc
+source "$current_path/config.cfg"
+source "$ENV_FILE"
+
+# Poll once per second, for at most 7200 attempts, until the port shows up as listening.
+llm_status=0
+for i in {1..7200}; do
+    # Anchor the match as ":PORT<space>" so that 8000 does not match 18000 or a PID column.
+    if netstat -ntlp | grep -E ":${LLM_PORT}[[:space:]]"; then
+        echo "推理服务已拉起,端口$LLM_PORT已打开"
+        llm_status=1
+        break
+    fi
+    sleep 1
+done
+
+if [ "$llm_status" -eq 0 ]; then
+    echo "推理服务拉起超时,请手动确认"
+    exit 1
+fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 076198a54432255d559eada63f78f0ffc2321853..7efbacd1a6e0bee942195dabe124eca621a25525 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -21,6 +21,7 @@ export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=7200
 export MS_COMPILER_CACHE_ENABLE=1
 export CPU_AFFINITY=0
+export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:4,kernel_group_num:16"
 '
 
 NET_ENV="
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
index cd1421ce954aea8685c655b520d44c955d1906b7..0c7c508c05069d2c356e28264579bd1cd5a7576d 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_docker.sh
@@ -8,7 +8,8 @@ current_path=$(
 source $current_path/config.cfg
 # 安装docker
 yum install docker -y
-systemctl restart docker
+# Start the daemon only when it is not already running, so running containers are not restarted.
+systemctl is-active --quiet docker || systemctl start docker
 
 # 检测镜像是否已被拉取
 docker images | grep $IMAGE_NAME | grep $IMAGE_TAG
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 89209c4639029fe411f752f24a01cbf64b314a19..7e4eba03529f2f01ea77219555c0e7a386b92316 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -6,7 +6,21 @@ current_path=$(
 ENV_FILE=/root/.bashrc
 source $current_path/config.cfg
 source $ENV_FILE
-# 仅主节点运行
+
+# Usage: start_ds.sh <master_ip> [worker_flag]
+#   master_ip    value passed to --data-parallel-address
+#   worker_flag  any non-empty value selects the worker-node settings
+master_ip="$1"
+
+if [ -n "$2" ]; then
+    # Worker node: headless mode, data-parallel start rank 2
+    headless="--headless"
+    RANK_START=2
+else
+    # Master node: not headless, data-parallel start rank 0
+    headless=""
+    RANK_START=0
+fi
 
 if [ $NODE_NUM -ne 1 ]; then
     if [ $NODE_NUM -eq 2 ]; then
@@ -37,24 +51,11 @@ fi
 #拉起服务
 rm -rf ds.log
 if [ $NODE_NUM -ne 1 ]; then
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
+    # Multi-node launch; $headless is left unquoted on purpose so that an empty value adds no argument.
+    # NOTE(review): the parallel sizes and the worker start rank are fixed values - confirm for NODE_NUM other than 2.
+    nohup vllm-mindspore serve --model="$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 $headless --data-parallel-start-rank $RANK_START --data-parallel-address $master_ip --data-parallel-rpc-port $DP_PORT --enable-expert-parallel &> ds.log &
 else
     nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
 fi
 
-#检测推理服务是否拉起
-llm_status=0
-for i in {1..7200}; do
-    netstat -ntlp | grep $LLM_PORT
-    if [ $? -eq 0 ]; then
-        echo "推理服务已拉起,端口$LLM_PORT已打开"
-        llm_status=1
-        break
-    fi
-    sleep 1
-done
 
-if [ $llm_status -eq 0 ]; then
-    echo "推理服务拉起超时,请手动确认"
-    exit 1
-fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
index de9cb1881e44e4004e3874b72084a795a652bdd6..d7691e337c90c955296417c64318ac23c66c54dc 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
+++ b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
@@ -13,6 +13,8 @@ NODE_NUM={{ node_num }}
 IS_STOP_OTHER_CONTAINER={{ is_stop_other_container }}
 #[推理服务端口]
 LLM_PORT={{ llm_port }}
+#[DP communication port]
+DP_PORT={{ dp_port }}
 #[ray集群使用的网卡]
 RAY_DEVICE={{ ray_device }}
 #[模型权重类型]
diff --git a/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3fd1a63900ec41286e37eb21bc1c29400e1488a9
--- /dev/null
+++ b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
@@ -0,0 +1,6 @@
+- name: Set dir_path as a fact
+  set_fact:
+    dir_path: "/home/mindspore-deepseek"
+
+- name: Check DeepSeek status on master
+  shell: bash {{ dir_path }}/lib/check_ds.sh
diff --git a/script/mindspore-deepseek/workspace/roles/start/deepseek-worker/tasks/main.yml b/script/mindspore-deepseek/workspace/roles/start/deepseek-worker/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..52d9dc0deeea68868515ed05dfeaa0ef9a1fbb6d
--- /dev/null
+++ b/script/mindspore-deepseek/workspace/roles/start/deepseek-worker/tasks/main.yml
@@ -0,0 +1,2 @@
+- name: Start DeepSeek on worker
+  shell: docker exec {{ container_name }} /workspace/lib/start_ds.sh {{ hostvars[groups['masters'][0]].ansible_host }} 1
diff --git a/script/mindspore-deepseek/workspace/roles/start/deepseek/tasks/main.yml b/script/mindspore-deepseek/workspace/roles/start/deepseek/tasks/main.yml
index c1a5feeee13375b59e1369d64def193fb1aef4f7..7db8f2c8136d5dc402e6a0d2ca0a6966b38581b6 100644
--- a/script/mindspore-deepseek/workspace/roles/start/deepseek/tasks/main.yml
+++ b/script/mindspore-deepseek/workspace/roles/start/deepseek/tasks/main.yml
@@ -1,2 +1,2 @@
 - name: Start DeepSeek on master
-  shell: docker exec {{ container_name }} /workspace/lib/start_ds.sh
+  shell: docker exec {{ container_name }} /workspace/lib/start_ds.sh {{ hostvars[groups['masters'][0]].ansible_host }}