diff --git a/README.md b/README.md
index a086c93fada95d65a07d1c31d4a8c15dcbd4fa18..75b3f09d2b82f441724ba8c7e4fd67089c2078cf 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,8 @@ dynamic variation of size; compared with whole-graph sinking for static shapes, dynamic-shape computation
 | OpenAI o1-mini | 68.9 |
 | DeepSeek R1 | 91.8 |
 | Deepseek R1 w8a8 | 89.52 |
+| DeepSeek R1 W4A16 | 88.78 |
+| DeepSeek V3 0324 W4A16 | 87.82 |
@@ -141,8 +143,8 @@ dynamic variation of size; compared with whole-graph sinking for static shapes, dynamic-shape computation
 | Concurrency | Throughput (tokens/s) |
 | ------ | ------------- |
-| 1 | 16.7 |
-| 192 | 1188 |
+| 1 | 22.4 |
+| 192 | 1600 |
diff --git "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md" "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
index dd60db33b5b28dd7fbfa98d3bdca08a864ab95e2..5515b29c83bbac5c040e72dd4c12708989aa1272 100644
--- "a/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
+++ "b/doc/deepseek/DeepSeek-V3&R1\351\203\250\347\275\262\346\214\207\345\215\227.md"
@@ -120,6 +120,7 @@ cd llm_solution/script/mindspore-deepseek
 ```shell
 # Adjust config.yaml in the mindspore-deepseek directory
+# Note: the w8a8 and int4 weight deployments use different image tags; adjust them as described below
 (base) [root@910b-3 mindspore-deepseek]# cat config.yaml
 all:
   children:
@@ -142,8 +143,8 @@ all:
   vars:
     # Container image
     # If the image is already loaded in local Docker, change these to that image's image_name and image_tag
-    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/deepseek_hyperinfer
-    image_tag: openeuler22.03-py3.11
+    image_name: hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore
+    image_tag: 20250415 # for single-node deployment, use the container image with tag 20250326
     # Name of the inference container to be launched
     container_name: openeuler_ds # Docker name after startup; must not clash with an existing name
     # Model path
diff --git a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
index 49a7bd29ecd31b330e1860887a706bc75bba674f..89209c4639029fe411f752f24a01cbf64b314a19 100644
--- a/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
+++ b/script/mindspore-deepseek/workspace/roles/prepare/files/lib/start_ds.sh
@@ -39,7 +39,7 @@ rm -rf ds.log
 if [ $NODE_NUM -ne 1 ]; then
     nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=$PARALLEL --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc --distributed-executor-backend=ray &> ds.log &
 else
-    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=128 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
+    nohup python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "$MODEL_PATH" --port=$LLM_PORT --trust_remote_code --tensor_parallel_size=8 --max-num-seqs=192 --max_model_len=32768 --max-num-batched-tokens=16384 --block-size=32 --gpu-memory-utilization=0.93 --num-scheduler-steps=8 --disable-async-output-proc &> ds.log &
 fi

 # Check whether the inference service is up
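
Once start_ds.sh reports the service as up, the OpenAI-compatible endpoint exposed by vllm.entrypoints.openai.api_server can be probed directly. The following is a minimal smoke-test sketch, not part of this change, assuming the script's `LLM_PORT` and `MODEL_PATH` variables are set in the current shell and that vLLM registers the served model under the `--model` path (its default):

```shell
# List the served models; a JSON reply confirms the api_server is listening.
curl -s "http://127.0.0.1:${LLM_PORT}/v1/models"

# Minimal chat completion against the OpenAI-compatible API.
# The model name is assumed to default to the --model path passed in start_ds.sh.
curl -s "http://127.0.0.1:${LLM_PORT}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "model": "'"${MODEL_PATH}"'",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 16
      }'
```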