diff --git a/plugins/mindspore-deepseek/config.yaml b/plugins/mindspore-deepseek/config.yaml index f82a4b37e4ee46f952240cbd9507f32d6426fc40..2f18c2ab09e3ed24006d9dbe8113f220201051d4 100644 --- a/plugins/mindspore-deepseek/config.yaml +++ b/plugins/mindspore-deepseek/config.yaml @@ -18,8 +18,8 @@ vars: # 容器镜像 - image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore - image_tag: latest + image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/deepseek_hyperinfer + image_tag: openeuler22.03-py3.11 # 推理服务所在容器的名称 container_name: openeuler_ds # 模型路径 diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh index a68fa40a06c2caff95d7e2699d1247c485996271..cca835c4b7ab003a9f2bdd52b92c963e28824e32 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh @@ -24,7 +24,6 @@ for i in {0..7}; do hccn_tool -i $i -net_health -g | grep -i 'Success' if [ $? 
-ne 0 ]; then echo "节点npu设备 $i 检测net_health不为Success" - exit 1 fi done diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh index 42811d71b49d6b6efa9d06bf7961292bb8bad8e0..e1dcaa78cdba6845a11ed553227c9bc4e0ea657c 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh @@ -3,8 +3,9 @@ current_path=$( cd $(dirname $0/) pwd ) +ENV_FILE=/root/.bashrc source $current_path/config.cfg -source /root/.bashrc +source $ENV_FILE ray_start() { ps -ef | grep "python" | grep -v grep | awk '{print $2}' | xargs kill ray stop diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh index 8c8565d75528dc14215196b88ee4178d4c6897b8..7a40ea3b5b3479919fe7ad282e9e8a548937a147 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh @@ -18,6 +18,7 @@ export MS_DEV_RUNTIME_CONF="parallel_dispatch_kernel:True" export MS_ALLOC_CONF="enable_vmm:False" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export ASCEND_TOTAL_MEMORY_GB=64 +export HCCL_CONNECT_TIMEOUT=3600 ' RAY_ENV=" @@ -25,15 +26,21 @@ export GLOO_SOCKET_IFNAME=$RAY_DEVICE export TP_SOCKET_IFNAME=$RAY_DEVICE " -W8A8_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek3_671b/predict_deepseek3_671b_w8a8.yaml' +if [ $NODE_NUM -eq 2 ]; then + YAML_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml' +elif [ $NODE_NUM -eq 4 ]; then + YAML_ENV='export 
MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml' +fi + +ENV_FILE=/root/.bashrc if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then echo "存在已配置的环境变量,详见容器内/root/.bashrc" exit 0 fi -echo "$ENV_ARG" >> /root/.bashrc -echo "$RAY_ENV" >> /root/.bashrc -echo "$W8A8_ENV" >> /root/.bashrc -source /root/.bashrc +echo "$ENV_ARG" >> $ENV_FILE +echo "$RAY_ENV" >> $ENV_FILE +echo "$YAML_ENV" >> $ENV_FILE +source $ENV_FILE diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh index 83eb0bae33aaff0205b7ba5d6ee407ac2a58ff8b..569c68e3d97d58f804dc7c987db40bf548c915d0 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh @@ -3,8 +3,9 @@ current_path=$( cd $(dirname $0/) pwd ) +ENV_FILE=/root/.bashrc source $current_path/config.cfg -source /root/.bashrc +source $ENV_FILE # 仅主节点运行 ray_status=0 @@ -25,17 +26,17 @@ fi #拉起服务 rm -rf ds.log -nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=16 --max_model_len=1024 --max-num-batched-tokens=1024 --block-size=32 --gpu-memory-utilization=0.95 --distributed-executor-backend=ray > ds.log & +nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=16 --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray > ds.log & #检测推理服务是否拉起 llm_status=0 -for i in {1..60}; do +for i in {1..7200}; do netstat -ntlp | grep $LLM_PORT if [ $? 
-eq 0 ]; then echo "推理服务已拉起,端口$LLM_PORT已打开" llm_status=1 break fi - sleep 30 + sleep 1 done if [ $llm_status -eq 0 ]; then diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh index c88350c8f493588e38487f7fe9c3a45f7b25453a..23cbcfad058ba25b284c095927625cd13f253989 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh @@ -10,9 +10,11 @@ main() { set -e chmod -R +x $current_path/lib + systemctl stop firewalld + systemctl stop iptables # 检测需要部署的节点ip数量 - if [ $NODE_NUM -ne 2 ]; then - echo "当前仅支持两节点部署,当前数量是$NODE_NUM" + if [ "$NODE_NUM" -ne 2 ] && [ "$NODE_NUM" -ne 4 ]; then + echo "当前仅支持两/四节点部署,当前数量是$NODE_NUM" exit 1 fi