From 42db1fbd6f66f863e7eff020d0a1e5fce0d2a6d6 Mon Sep 17 00:00:00 2001
From: lijiaming <1228575330@qq.com>
Date: Mon, 21 Jul 2025 09:45:46 +0800
Subject: [PATCH] Speed up deployment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 script/mindspore-deepseek/config.yaml         |  6 ++-
 .../roles/prepare/files/lib/check_ds.sh       |  2 +-
 .../roles/prepare/files/lib/net_check.sh      | 43 +++++++++----------
 .../roles/prepare/files/lib/ray_start.sh      | 24 +++++++++++
 .../roles/prepare/files/lib/set_env.sh        | 14 ++++--
 .../roles/prepare/files/lib/start_ds.sh       | 37 +++-------------
 .../workspace/roles/prepare/files/prepare.sh  |  9 ++--
 .../roles/prepare/templates/config.cfg.j2     |  4 +-
 .../roles/start/check-ds/tasks/main.yml       |  6 +--
 9 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/script/mindspore-deepseek/config.yaml b/script/mindspore-deepseek/config.yaml
index a8384e7..59c9c3e 100644
--- a/script/mindspore-deepseek/config.yaml
+++ b/script/mindspore-deepseek/config.yaml
@@ -18,8 +18,8 @@
 
   vars:
     # 容器镜像
-    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/deepseek_hyperinfer
-    image_tag: openeuler22.03-py3.11
+    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore
+    image_tag: 20250717
     # 推理服务所在容器的名称
     container_name: openeuler_ds
     # 模型路径
@@ -38,6 +38,8 @@
     ray_device: enp67s0f0np0
     # 模型权重类型
     model_type: safetensors
+    # 后端(如果启动qwen2.5_vl,请设置为Native)
+    backend_type: MindFormers
     # 跳过 ssh 校验(如需禁用此功能,请注释以下配置项)
     ansible_ssh_common_args: '-o StrictHostKeyChecking=no'
     # SSH连接复用参数,ControlMaster=auto启用连接复用,ControlPersist=60s保持连接60秒,ConnectTimeout=30设置连接超时30秒
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
index b8fcaed..b567ebd 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/check_ds.sh
@@ -23,4 +23,4 @@ done
 if [ $llm_status -eq 0 ]; then
     echo "推理服务拉起超时,请手动确认"
     exit 1
-fi
\ No newline at end of file
+fi
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
index cca835c..6780585 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/net_check.sh
@@ -2,38 +2,37 @@
 
 # 设置TLS
 for i in {0..7}; do
-    hccn_tool -i $i -tls -s enable 0
+    hccn_tool -i $i -tls -s enable 0 &
 done
+wait
 
 # 检查TLS状态
-for i in {0..7}; do
-    hccn_tool -i $i -tls -g | grep switch
-done
+# for i in {0..7}; do
+#     hccn_tool -i $i -tls -g | grep switch
+# done
 
 # 检查链路状态
 for i in {0..7}; do
-    hccn_tool -i $i -link -g | grep -i 'link status: UP'
-    if [ $? -ne 0 ]; then
-        echo "节点npu设备 $i 检测link status不为UP"
-        exit 1
-    fi
-done
-
-# 检查网络健康状态
-for i in {0..7}; do
-    hccn_tool -i $i -net_health -g | grep -i 'Success'
-    if [ $? -ne 0 ]; then
-        echo "节点npu设备 $i 检测net_health不为Success"
-    fi
-done
-
-# 检查IP信息
-for i in {0..7}; do
-    hccn_tool -i $i -ip -g
+    {
+        output=$(hccn_tool -i $i -link -g 2>&1)
+        if grep -qi 'link status: UP' <<< "$output"; then
+            echo "link status: UP"
+        else
+            echo "节点npu设备 $i 检测link status不为UP" >&2
+            kill 0 # 终止整个脚本进程组
+        fi
+    } &
 done
+wait
 
 # 添加机器卡间互联检查
 check_inter_device_connection() {
+
+    # 检查IP信息
+    for i in {0..7}; do
+        hccn_tool -i $i -ip -g
+    done
+
     echo -e "${BLUE}请输入目标NPU卡的IP地址 (输入q退出检查):${NC}"
     while true; do
         read -p "IP地址: " target_ip
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
index c4d243a..2a60293 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/ray_start.sh
@@ -6,7 +6,9 @@ current_path=$(
 ENV_FILE=/root/.bashrc
 source $current_path/config.cfg
 source $ENV_FILE
+
 ray_start() {
+    ps -ef | grep "python" | grep -v grep | awk '{print $2}' | xargs kill
     if [ $NODE_NUM -eq 1 ]; then
         echo "单机部署无需启动ray"
 
@@ -17,6 +19,28 @@ ray_start() {
     if [ "$1" ]; then
         # 从节点
         nohup ray start --address=$1:$RAY_PORT &
+        sleep 5
+        if [ $NODE_NUM -eq 2 ]; then
+            NPU_NUM=16.0
+        elif [ $NODE_NUM -eq 4 ]; then
+            NPU_NUM=32.0
+        fi
+
+        ray_status=0
+        for i in {1..10}; do
+            ray status | grep "$NPU_NUM NPU"
+            if [ $? -eq 0 ]; then
+                echo "ray集群已全部拉起"
+                ray_status=1
+                break
+            fi
+            sleep 3
+        done
+
+        if [ $ray_status -eq 0 ]; then
+            echo "ray集群超时"
+            exit 1
+        fi
     else
         # 主节点
         nohup ray start --head --include-dashboard=False --port=$RAY_PORT &
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
index 7efbacd..c6cc5d8 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh
@@ -12,16 +12,17 @@ ENV_ARG='
 export ASCEND_CUSTOM_PATH=$ASCEND_HOME_PATH/../
 export MS_ENABLE_LCCL=off
 export HCCL_OP_EXPANSION_MODE=AIV
-export vLLM_MODEL_BACKEND=MindFormers
 export vLLM_MODEL_MEMORY_USE_GB=53
 export MS_DEV_RUNTIME_CONF="parallel_dispatch_kernel:True"
 export MS_ALLOC_CONF="enable_vmm:True"
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 export ASCEND_TOTAL_MEMORY_GB=64
 export HCCL_CONNECT_TIMEOUT=7200
-export MS_COMPILER_CACHE_ENABLE=1
 export CPU_AFFINITY=0
 export EXPERIMENTAL_KERNEL_LAUNCH_GROUP="thread_num:4,kernel_group_num:16"
+export MS_INTERNAL_ENABLE_NZ_OPS="QuantBatchMatmul,MlaPreprocess,GroupedMatmulV4"
+export PYTHONPATH=/workspace/mindformers:$PYTHONPATH
+export MS_DISABLE_INTERNAL_KERNELS_LIST="AddRmsNorm,Add,MatMul,Cast"
 '
 
 NET_ENV="
@@ -40,7 +41,8 @@ if [ $NODE_NUM -eq 1 ]; then
         sed -e 's/activation_dtype/#activation_dtype/' -i $YAML_FILE
     fi
 elif [ $NODE_NUM -eq 2 ]; then
-    YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    #YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml'
+    YAML_FILE='/workspace/mindformers/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8_ep4tp4.yaml'
 elif [ $NODE_NUM -eq 4 ]; then
     YAML_FILE='/usr/local/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml'
 fi
@@ -63,4 +65,10 @@ echo "$YAML_ENV" >> $ENV_FILE
 if [ $NODE_NUM -ne 1 ]; then
     echo "$NET_ENV" >> $ENV_FILE
 fi
+
+# 设置模型后端
+if [ "$BACKEND_TYPE" = "MindFormers" ]; then
+    echo 'export vLLM_MODEL_BACKEND=MindFormers' >> $ENV_FILE
+fi
+
 source $ENV_FILE
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 7e4eba0..0f5ba03 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -19,39 +19,16 @@
 else
     RANK_START=0
 fi
-if [ $NODE_NUM -ne 1 ]; then
-    if [ $NODE_NUM -eq 2 ]; then
-        NPU_NUM=16.0
-        PARALLEL=16
-    elif [ $NODE_NUM -eq 4 ]; then
-        NPU_NUM=32.0
-        PARALLEL=32
-    fi
-
-    ray_status=0
-    for i in {1..10}; do
-        ray status | grep "$NPU_NUM NPU"
-        if [ $? -eq 0 ]; then
-            echo "ray集群已全部拉起"
-            ray_status=1
-            break
-        fi
-        sleep 3
-    done
-
-    if [ $ray_status -eq 0 ]; then
-        echo "ray集群超时"
-        exit 1
-    fi
+if [ $NODE_NUM -eq 2 ]; then
+    PARALLEL=16
+elif [ $NODE_NUM -eq 4 ]; then
+    PARALLEL=32
 fi
 
 #拉起服务
 rm -rf ds.log
 if [ $NODE_NUM -ne 1 ]; then
-    # nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=256 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
-    nohup vllm-mindspore serve --model="$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 $headless --data-parallel-start-rank $RANK_START --data-parallel-address $master_ip --data-parallel-rpc-port $DP_PORT --enable-expert-parallel &> ds.log &
+    nohup vllm-mindspore serve --model="$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --max-num-seqs=512 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.93 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 $headless --data-parallel-start-rank $RANK_START --data-parallel-address $master_ip --data-parallel-rpc-port $DP_PORT --enable-expert-parallel &> ds.log &
 else
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
-fi
-
-
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 &> ds.log &
+fi
\ No newline at end of file
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
index 0ba0d2f..fc282b0 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh
@@ -16,6 +16,11 @@ main() {
     systemctl stop firewalld
     systemctl stop iptables
 
+    #0. 启动&预热权重服务
+    #$current_path/lib/mfs_tools.sh init || true
+    #$current_path/lib/mfs_tools.sh load || true
+
+
     # 1. 启动Docker容器并复制文件
     $current_path/lib/start_docker.sh
     cp_into_container
@@ -27,17 +32,15 @@
 
     #进入容器执行
     # 3. 设置容器内环境变量
-    docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh
+    docker exec $CONTAINER_NAME /workspace/lib/set_env.sh
 
     # 4. 进行绑核
-    echo 3 > /proc/sys/vm/drop_caches
     pip install psutil
     python3 $current_path/lib/fine-grained-bind-cann.py
    if [ $? -ne 0 ]; then
        echo "细粒度线程绑核失败,请确保驱动版本>=24.1.0"
        exit 1
    fi
-
 }
 
 # 执行主函数
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
index d7691e3..6b41b7e 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
+++ b/script/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2
@@ -18,4 +18,6 @@ DP_PORT={{ dp_port }}
 #[ray集群使用的网卡]
 RAY_DEVICE={{ ray_device }}
 #[模型权重类型]
-MODEL_TYPE={{ model_type }}
\ No newline at end of file
+MODEL_TYPE={{ model_type }}
+#[模型后端类型]
+BACKEND_TYPE={{ backend_type }}
\ No newline at end of file
diff --git a/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
index 3fd1a63..3f48647 100644
--- a/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
+++ b/script/mindspore-deepseek/workspace/roles/start/check-ds/tasks/main.yml
@@ -1,6 +1,2 @@
-- name: Set dir_path as a fact
-  set_fact:
-    dir_path: "/home/mindspore-deepseek"
-
 - name: Check DeepSeek status on master
-  shell: sh {{ dir_path }}/lib/check_ds.sh
+  shell: sh /home/mindspore-deepseek/lib/check_ds.sh
-- 
Gitee
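
Reviewer note, not part of the patch: a minimal post-deployment smoke test, sketched under
stated assumptions: the container name openeuler_ds and the service port come from
config.yaml/config.cfg (8000 below is only a placeholder for LLM_PORT), the commands run on
the master node, and ray and curl are available on PATH. Adjust to your environment.

    # Confirm the ray cluster sees every NPU ("16.0 NPU" for 2 nodes, "32.0 NPU" for
    # 4 nodes), mirroring the check this patch moves into ray_start.sh.
    docker exec openeuler_ds ray status | grep "NPU"

    # Probe the OpenAI-compatible endpoint started by start_ds.sh.
    curl -s http://127.0.0.1:8000/v1/models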