From 8d8c0d74cad30f7b466da22551334ec76ead3a29 Mon Sep 17 00:00:00 2001 From: lijiaming <1228575330@qq.com> Date: Fri, 14 Mar 2025 19:19:11 +0800 Subject: [PATCH] =?UTF-8?q?opt:=20=E6=94=AF=E6=8C=81=E4=B8=8D=E5=90=8C?= =?UTF-8?q?=E6=9D=83=E9=87=8D=E7=B1=BB=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- plugins/mindspore-deepseek/config.yaml | 2 + .../mindspore-deepseek/workspace/install.yml | 7 -- .../workspace/roles/post/tasks/main.yml | 6 -- .../roles/prepare/files/lib/even-iso.py | 69 ------------------- .../roles/prepare/files/lib/example_config | 4 +- .../roles/prepare/files/lib/set_env.sh | 20 ++++-- .../roles/prepare/files/lib/start_ds.sh | 6 +- .../workspace/roles/prepare/files/prepare.sh | 34 ++++++++- .../roles/prepare/templates/config.cfg.j2 | 4 +- 9 files changed, 57 insertions(+), 95 deletions(-) delete mode 100644 plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml delete mode 100644 plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py diff --git a/plugins/mindspore-deepseek/config.yaml b/plugins/mindspore-deepseek/config.yaml index 2f18c2a..35db0fb 100644 --- a/plugins/mindspore-deepseek/config.yaml +++ b/plugins/mindspore-deepseek/config.yaml @@ -34,5 +34,7 @@ llm_port: 8000 # ray集群使用的网卡 ray_device: enp67s0f0np0 + # 模型权重类型 + model_type: safetensors # 跳过 ssh 校验(如需禁用此功能,请注释以下配置项) ansible_ssh_common_args: '-o StrictHostKeyChecking=no' diff --git a/plugins/mindspore-deepseek/workspace/install.yml b/plugins/mindspore-deepseek/workspace/install.yml index 4bbcc9c..2d1fec9 100644 --- a/plugins/mindspore-deepseek/workspace/install.yml +++ b/plugins/mindspore-deepseek/workspace/install.yml @@ -22,10 +22,3 @@ roles: - start/deepseek any_errors_fatal: true - -- hosts: - - masters - - workers - roles: - - post - any_errors_fatal: true diff --git a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml b/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml 
deleted file mode 100644 index c0c12e6..0000000 --- a/plugins/mindspore-deepseek/workspace/roles/post/tasks/main.yml +++ /dev/null @@ -1,6 +0,0 @@ -- name: Set dir_path as a fact - set_fact: - dir_path: "/home/mindspore-deepseek" - -- name: Binding core on host - shell: python3 {{ dir_path }}/lib/even-iso.py diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py deleted file mode 100644 index 4c23624..0000000 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/even-iso.py +++ /dev/null @@ -1,69 +0,0 @@ -import subprocess - -bound_cpus = {0:1,1:1} - - -def execute_command(command): - print(command) - result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split() - print(result) - - -def isolate_tids(tids, cpu_range): - command = f"taskset -cp {cpu_range[0]}-{cpu_range[-1]} {tids[0]}" - execute_command(command) - i = 0 - cpuSkippedCount = 0 - while i < len(tids): - cpu = cpu_range[i + cpuSkippedCount]+24 - if cpu in bound_cpus: - cpuSkippedCount += 1 - continue - bound_cpus[cpu] = 1 - command = f"taskset -cp {cpu} {tids[i]}" - execute_command(command) - i += 1 - - -def check_is_valid(info): - #return "release_thread" in info or "acl_thread" in infoi - #return "rayMsRayWorke" in info or "python3" in info or "Hccl_Notify" in info - return "batch_launch" in info or "frontend" in info or "backend" in info or "Actor" in info or "bprop" in info or "python3" in info or "MsRayWorke" in info - -def enforce_isolation(): - npu_len = 8 - command = "npu-smi info -t topo" - result = subprocess.run(command.split(), capture_output=True, text=True).stdout.split() - cpu_lists = [val for i, val in enumerate(result) if i > npu_len + 1 and i % (npu_len + 2) == 9][:8] - cpu_lists = [range(int(cpu_st.split('-')[0]), int(cpu_st.split('-')[1])) for cpu_st in cpu_lists] - - command = "npu-smi info" - result = subprocess.run(command.split(), 
capture_output=True, text=True).stdout - arr = " ".join(result.split('\n')[-npu_len * 2 - 2:]).split('|') - - pids = [pid.strip() for i, pid in enumerate(arr) if i % 5 == 2] - tids = [] - - for pid in pids: - command = f"ps -T -p {pid}" - results = subprocess.run(command.split(), capture_output=True, text=True).stdout - results = results.split('\n') - key_sort = lambda l1: "0" if len(l1) < 4 else l1.split()[3] - result = [info for info in results if check_is_valid(info)] - result = sorted(result, key=key_sort) - len_top_results = 9 - top_results = result[-len_top_results-1:-1] - new_tids = [info.split()[1] for info in top_results] - tids += [[pid] + new_tids] - - print("cpu-sets: ", cpu_lists) - print("*" * 10) - print("pids on npu: ", pids) - print("*" * 10) - print("tids: ", tids) - - for i in range(len(pids)): - isolate_tids(tids[i], cpu_lists[i]) - -enforce_isolation() - diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config index 7649d1d..d96c038 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/example_config @@ -14,4 +14,6 @@ IS_STOP_OTHER_CONTAINER=0 #[推理服务端口] LLM_PORT=8000 #[ray集群使用的网卡] -RAY_DEVICE=enp67s0f0np0 \ No newline at end of file +RAY_DEVICE=enp67s0f0np0 +#[模型权重类型] +MODEL_TYPE=safetensors \ No newline at end of file diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh index 654f0e3..e43b877 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/set_env.sh @@ -13,12 +13,13 @@ export ASCEND_CUSTOM_PATH=$ASCEND_HOME_PATH/../ export MS_ENABLE_LCCL=off export HCCL_OP_EXPANSION_MODE=AIV export vLLM_MODEL_BACKEND=MindFormers -export 
vLLM_MODEL_MEMORY_USE_GB=50 +export vLLM_MODEL_MEMORY_USE_GB=53 export MS_DEV_RUNTIME_CONF="parallel_dispatch_kernel:True" -export MS_ALLOC_CONF="enable_vmm:False" +export MS_ALLOC_CONF="enable_vmm:True" export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export ASCEND_TOTAL_MEMORY_GB=64 -export HCCL_CONNECT_TIMEOUT=3600 +export HCCL_CONNECT_TIMEOUT=7200 +export MS_COMPILER_CACHE_ENABLE=1 ' NET_ENV=" @@ -28,11 +29,20 @@ export HCCL_SOCKET_IFNAME=$RAY_DEVICE " if [ $NODE_NUM -eq 2 ]; then - YAML_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml' + YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml' elif [ $NODE_NUM -eq 4 ]; then - YAML_ENV='export MINDFORMERS_MODEL_CONFIG=/root/miniconda3/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml' + YAML_FILE='/root/Python-3.11/lib/python3.11/site-packages/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b.yaml' fi + +# 修改权重类型 +sed -e 's/^load_ckpt_format.*/load_ckpt_format: "'$MODEL_TYPE'"/' -i $YAML_FILE +if [ "$MODEL_TYPE" = "ckpt" ]; then + sed -e 's/^auto_trans_ckpt.*/auto_trans_ckpt: False/' -i $YAML_FILE +fi + +YAML_ENV="export MINDFORMERS_MODEL_CONFIG=$YAML_FILE" + ENV_FILE=/root/.bashrc if grep -q "openeuler_deepseek_env_config" /root/.bashrc; then diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh index 716c318..0a637ae 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh @@ -34,7 +34,7 @@ fi #拉起服务 rm -rf ds.log -nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code 
--tensor_parallel_size=$PARALLEL --max_model_len=8192 --max-num-batched-tokens=8192 --block-size=32 --gpu-memory-utilization=0.89 --distributed-executor-backend=ray &> ds.log & +nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --distributed-executor-backend=ray &> ds.log & #检测推理服务是否拉起 llm_status=0 for i in {1..7200}; do @@ -50,6 +50,4 @@ done if [ $llm_status -eq 0 ]; then echo "推理服务拉起超时,请手动确认" exit 1 -fi - -sleep 60 \ No newline at end of file +fi \ No newline at end of file diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh index 23cbcfa..c000112 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/files/prepare.sh @@ -1,5 +1,8 @@ #!/bin/bash -current_path=$(cd $(dirname $0/); pwd) +current_path=$( + cd "$(dirname "$0")" + pwd + ) source $current_path/lib/config.cfg cp_into_container() { @@ -12,6 +15,33 @@ main() { systemctl stop firewalld systemctl stop iptables + + # 检查防火墙是否启动,如果启动则检查端口是否在防火墙白名单中,如果不存在则添加到白名单中 + status=$(systemctl status firewalld | grep -E "Active" | awk -F":" '{print $2}' | awk -F" " '{print $1}') + if [[ "${status}" == "active" ]]; then + # ray 端口防火墙检查 + port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp) + if [[ "${port_ray}" == "no" ]]; then + port_ray=$(firewall-cmd --zone=public --add-port=$RAY_PORT/tcp --permanent) + firewall-cmd --reload + fi + port_ray=$(firewall-cmd --query-port=$RAY_PORT/tcp) + if [[ "${port_ray}" != "yes" ]]; then + echo -e "防火墙开启 $RAY_PORT端口失败" + exit 1 + fi + port_llm=$(firewall-cmd
--zone=public --add-port=$LLM_PORT/tcp --permanent) + firewall-cmd --reload + fi + port_llm=$(firewall-cmd --query-port=$LLM_PORT/tcp) + if [[ "${port_llm}" != "yes" ]]; then + echo -e "防火墙开启 $LLM_PORT端口失败" + exit 1 + fi + fi + # 检测需要部署的节点ip数量 if [ [ $NODE_NUM -ne 2 ] && [ $NODE_NUM -ne 4 ] ]; then echo "当前仅支持两/四节点部署,当前数量是$NODE_NUM" @@ -28,7 +58,7 @@ main() { #进入容器执行 # 3. 设置容器内环境变量 docker exec -it $CONTAINER_NAME /workspace/lib/set_env.sh - + } # 执行主函数 diff --git a/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 b/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 index 4c36ef2..de9cb18 100644 --- a/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 +++ b/plugins/mindspore-deepseek/workspace/roles/prepare/templates/config.cfg.j2 @@ -14,4 +14,6 @@ IS_STOP_OTHER_CONTAINER={{ is_stop_other_container }} #[推理服务端口] LLM_PORT={{ llm_port }} #[ray集群使用的网卡] -RAY_DEVICE={{ ray_device }} \ No newline at end of file +RAY_DEVICE={{ ray_device }} +#[模型权重类型] +MODEL_TYPE={{ model_type }} \ No newline at end of file -- Gitee