From 6e3025e22214e63be7c8d1bddb02d9e52e78a8f1 Mon Sep 17 00:00:00 2001 From: moran Date: Tue, 4 Mar 2025 09:43:35 +0800 Subject: [PATCH 01/82] add st --- .jenkins/test/config/dependent_package.yaml | 8 --- .jenkins/test/config/dependent_packages.yaml | 8 +++ README.md | 18 ++++--- vllm_mindspore/tests/st/python/__init__.py | 0 vllm_mindspore/tests/st/python/test_demo.py | 53 ++++++++++++++++++++ 5 files changed, 71 insertions(+), 16 deletions(-) delete mode 100644 .jenkins/test/config/dependent_package.yaml create mode 100644 .jenkins/test/config/dependent_packages.yaml create mode 100644 vllm_mindspore/tests/st/python/__init__.py create mode 100644 vllm_mindspore/tests/st/python/test_demo.py diff --git a/.jenkins/test/config/dependent_package.yaml b/.jenkins/test/config/dependent_package.yaml deleted file mode 100644 index 19bed914b..000000000 --- a/.jenkins/test/config/dependent_package.yaml +++ /dev/null @@ -1,8 +0,0 @@ -mindspore: - https://repo.mindspore.cn/mindspore/mindspore/version/202502/20250227/master_20250227211723_94ac228bae9cd6d0f00b4ce8d5857773799c4f26_newest/ - -mindformers: - https://repo.mindspore.cn/mindspore/mindformers/version/202502/20250228/dev_20250228220021_4e90ca405720ea2f4a0abdf501d01078f28d724c_newest/ - -msadapter: - https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250301/master_20250301_newest/ diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml new file mode 100644 index 000000000..6c3962997 --- /dev/null +++ b/.jenkins/test/config/dependent_packages.yaml @@ -0,0 +1,8 @@ +mindspore: + 'https://repo.mindspore.cn/mindspore/mindspore/version/202502/20250227/master_20250227211723_94ac228bae9cd6d0f00b4ce8d5857773799c4f26_newest/' + +mindformers: + 'https://repo.mindspore.cn/mindspore/mindformers/version/202502/20250228/dev_20250228220021_4e90ca405720ea2f4a0abdf501d01078f28d724c_newest/' + +msadapter: + 'https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250301/master_20250301_newest/' diff --git a/README.md b/README.md index 284abf19e..db0305959 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,9 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer - Hardware: Atlas A2/A3 - Software: - - Python >= 3.9 - - CANN >= 8.0.0 - - MindSpore >=2.5.0 + - Python >= 3.9 + - CANN >= 8.0.0 + - MindSpore >=2.5.0 --- @@ -27,13 +27,15 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer Installation from source code ```shell + # 1. Uninstall torch-related packages due to msadapter limitations -pip3 uninstall torch torch-npu torchvision +pip3 uninstall torch torch-npu torchvision # 2.Install vllm_mindspore git clone https://gitee.com/mindspore/vllm_mindspore.git cd vllm_mindspore pip install . + ``` ### Inference and Serving @@ -43,6 +45,7 @@ pip install . You can run vllm_mindspore in your own code on a list of prompts. ```python + import vllm_mindspore # Add this line on the top of script. from vllm import LLM, SamplingParams @@ -66,6 +69,7 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` #### Serving(OpenAI-Compatible) @@ -77,6 +81,7 @@ You can start the server via the vllm_mindspore command: To call the server, you can use `curl` or any other HTTP client. 
```shell
+
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "meta-llama/Llama-2-7b-hf",
        "prompt": "Llama is",
        "max_tokens": 120,
        "temperature": 0
    }'
-```
-

+```

## Contributing

@@ -96,8 +100,6 @@ We welcome and value any contributions and collaborations:

- Please feel free comments about your usage of vllm_mindspore.
- Please let us know if you encounter a bug by filing an issue.

-
-

## License

Apache License 2.0, as found in the [LICENSE](https://gitee.com/mindspore/vllm_mindspore/blob/master/LICENSE) file.
diff --git a/vllm_mindspore/tests/st/python/__init__.py b/vllm_mindspore/tests/st/python/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/vllm_mindspore/tests/st/python/test_demo.py b/vllm_mindspore/tests/st/python/test_demo.py
new file mode 100644
index 000000000..f5b8fae04
--- /dev/null
+++ b/vllm_mindspore/tests/st/python/test_demo.py
@@ -0,0 +1,53 @@
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test demo for st."""
+import pytest
+
+
+class TestDemo:
+    """
+    Test Demo for ST.
+    """
+    @pytest.mark.level0
+    @pytest.mark.platform_arm_ascend910b_training
+    @pytest.mark.env_single
+    def test_aaa(self):
+        """
+        test case aaa
+        """
+        from vllm import LLM, SamplingParams
+
+        # Sample prompts.
+        prompts = [
+            "I am",
+            "Today is",
+            "Llama is"
+        ]
+
+        # Create a sampling params object.
+        sampling_params = SamplingParams(temperature=0.0, top_p=0.95)
+
+        # Create an LLM.
+        llm = LLM(model="/home/workspace/mindspore_dataset/weight/Llama-2-7b-hf")
+        # Generate texts from the prompts. The output is a list of RequestOutput objects
+        # that contain the prompt, generated text, and other information.
+        outputs = llm.generate(prompts, sampling_params)
+        # Print the outputs. 
+ for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert len(outputs) == 3 -- Gitee From babbb9e4e52b446de91c33e11591c92aea7d9f98 Mon Sep 17 00:00:00 2001 From: lijiakun Date: Sun, 2 Mar 2025 14:29:01 +0800 Subject: [PATCH 02/82] fix copyright --- setup.py | 1 + vllm_mindspore/__init__.py | 1 + vllm_mindspore/attention/backends/ms_attn.py | 1 + vllm_mindspore/attention/backends/utils.py | 1 + vllm_mindspore/attention/layer.py | 1 + vllm_mindspore/attention/ops/paged_attn.py | 1 + vllm_mindspore/attention/selector.py | 1 + vllm_mindspore/config.py | 1 + vllm_mindspore/distributed/communication_op.py | 17 +++++++++++++++++ vllm_mindspore/distributed/parallel_state.py | 1 + vllm_mindspore/entrypoints.py | 1 + .../executor/multiproc_worker_utils.py | 1 + vllm_mindspore/executor/ray_gpu_executor.py | 1 + vllm_mindspore/model_executor/custom_op.py | 1 + .../model_executor/layers/activation.py | 1 + .../model_executor/layers/layernorm.py | 1 + vllm_mindspore/model_executor/layers/linear.py | 1 + .../model_executor/layers/logits_processor.py | 1 + .../layers/quantization/base_config.py | 1 + .../model_executor/layers/rotary_embedding.py | 1 + .../model_executor/layers/sampler.py | 1 + vllm_mindspore/model_executor/layers/utils.py | 17 +++++++++++++++++ .../layers/vocab_parallel_embedding.py | 1 + .../model_executor/model_loader/utils.py | 1 + .../model_loader/weight_utils.py | 1 + .../model_executor/models/interfaces.py | 18 ++++++++++++++++++ vllm_mindspore/model_executor/models/llama.py | 1 + .../models/mf_models/deepseek_v3.py | 1 + .../model_executor/models/mf_models/qwen2.py | 1 + .../model_executor/models/model_base.py | 1 + vllm_mindspore/model_executor/models/qwen2.py | 17 +++++++++++++++++ .../model_executor/models/registry.py | 1 + vllm_mindspore/model_executor/models/utils.py | 1 + .../model_executor/sampling_metadata.py | 1 + vllm_mindspore/model_executor/utils.py | 1 + vllm_mindspore/platforms/ascend.py | 1 + vllm_mindspore/scripts.py | 1 + vllm_mindspore/sequence.py | 1 + vllm_mindspore/tests/test_sampler.py | 17 +++++++++++++++++ vllm_mindspore/utils.py | 1 + vllm_mindspore/worker/cache_engine.py | 1 + vllm_mindspore/worker/model_runner.py | 1 + vllm_mindspore/worker/worker.py | 1 + 43 files changed, 124 insertions(+) diff --git a/setup.py b/setup.py index e7189008b..a6c147505 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index b86b9127f..797425b4f 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index f01c1517f..37ad40379 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/attention/backends/utils.py b/vllm_mindspore/attention/backends/utils.py index 00970f8f5..88cf9e1e5 100644 --- a/vllm_mindspore/attention/backends/utils.py +++ b/vllm_mindspore/attention/backends/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 01eacca3a..84335349b 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/attention/ops/paged_attn.py b/vllm_mindspore/attention/ops/paged_attn.py index 57f58db6f..abfb37dca 100644 --- a/vllm_mindspore/attention/ops/paged_attn.py +++ b/vllm_mindspore/attention/ops/paged_attn.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/attention/selector.py b/vllm_mindspore/attention/selector.py index 1dd046661..34654ffc8 100644 --- a/vllm_mindspore/attention/selector.py +++ b/vllm_mindspore/attention/selector.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 18fed6fc5..b7c602322 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 58c8c1e8e..00447432e 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ # 该文件实现底层通信接口, 要求动静统一, 最后才可以在网络中入图。 diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index b669f82f9..71e2cdd2d 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/entrypoints.py b/vllm_mindspore/entrypoints.py index 208acb724..aa91f07ae 100644 --- a/vllm_mindspore/entrypoints.py +++ b/vllm_mindspore/entrypoints.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/executor/multiproc_worker_utils.py b/vllm_mindspore/executor/multiproc_worker_utils.py index e2dc5bab6..8b24cf014 100644 --- a/vllm_mindspore/executor/multiproc_worker_utils.py +++ b/vllm_mindspore/executor/multiproc_worker_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/executor/ray_gpu_executor.py b/vllm_mindspore/executor/ray_gpu_executor.py index 8b9cd11ab..3e6369462 100644 --- a/vllm_mindspore/executor/ray_gpu_executor.py +++ b/vllm_mindspore/executor/ray_gpu_executor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/custom_op.py b/vllm_mindspore/model_executor/custom_op.py index 7b913ef85..a8c273f5f 100644 --- a/vllm_mindspore/model_executor/custom_op.py +++ b/vllm_mindspore/model_executor/custom_op.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/activation.py b/vllm_mindspore/model_executor/layers/activation.py index afc2b7939..a1d94ecae 100644 --- a/vllm_mindspore/model_executor/layers/activation.py +++ b/vllm_mindspore/model_executor/layers/activation.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/layernorm.py b/vllm_mindspore/model_executor/layers/layernorm.py index dd497e08e..db156c0cc 100644 --- a/vllm_mindspore/model_executor/layers/layernorm.py +++ b/vllm_mindspore/model_executor/layers/layernorm.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 62142ac8c..339df9669 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 9399e518a..b9beb080b 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/quantization/base_config.py b/vllm_mindspore/model_executor/layers/quantization/base_config.py index ea259ee65..5d3b0acb0 100644 --- a/vllm_mindspore/model_executor/layers/quantization/base_config.py +++ b/vllm_mindspore/model_executor/layers/quantization/base_config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/rotary_embedding.py b/vllm_mindspore/model_executor/layers/rotary_embedding.py index 77827002f..257db72bb 100644 --- a/vllm_mindspore/model_executor/layers/rotary_embedding.py +++ b/vllm_mindspore/model_executor/layers/rotary_embedding.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index c51b526ad..d3e02ce9a 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index eedfaa12b..bbef8d9cb 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ """Utility methods for model layers.""" from typing import Tuple import torch diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index f45064e19..76fa68681 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index c94e3150b..7d0ef95d3 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index ead186957..45fe4bdd5 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/interfaces.py b/vllm_mindspore/model_executor/models/interfaces.py index 0b1510d97..f9b27a079 100644 --- a/vllm_mindspore/model_executor/models/interfaces.py +++ b/vllm_mindspore/model_executor/models/interfaces.py @@ -1,3 +1,21 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm_mindspore/model_executor/models/llama.py b/vllm_mindspore/model_executor/models/llama.py index c20f54fe8..90ea548b7 100644 --- a/vllm_mindspore/model_executor/models/llama.py +++ b/vllm_mindspore/model_executor/models/llama.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 0c5b4b109..080ee6205 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 1c222d58e..5df33626c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 86c12e252..9aa4e1c97 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 7a46c83a0..ca0e86ac8 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Iterable if TYPE_CHECKING: diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index ef38ee0b9..0fb037469 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/utils.py b/vllm_mindspore/model_executor/models/utils.py index c84b6dc31..0a115c2cc 100644 --- a/vllm_mindspore/model_executor/models/utils.py +++ b/vllm_mindspore/model_executor/models/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/sampling_metadata.py b/vllm_mindspore/model_executor/sampling_metadata.py index e6b60f579..c9d11a198 100644 --- a/vllm_mindspore/model_executor/sampling_metadata.py +++ b/vllm_mindspore/model_executor/sampling_metadata.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py index c6de292aa..e1f5ec779 100644 --- a/vllm_mindspore/model_executor/utils.py +++ b/vllm_mindspore/model_executor/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index de56d2680..516ba3a0e 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index b0f5a0b32..72a37fae5 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/sequence.py b/vllm_mindspore/sequence.py index 82b93f546..c1ca3c750 100644 --- a/vllm_mindspore/sequence.py +++ b/vllm_mindspore/sequence.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/tests/test_sampler.py b/vllm_mindspore/tests/test_sampler.py index e0d91147c..24db77803 100644 --- a/vllm_mindspore/tests/test_sampler.py +++ b/vllm_mindspore/tests/test_sampler.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ import vllm_mindspore import itertools import random diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index f729adcad..f2ee5a648 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 99fc7693c..9e9811a16 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index e8752d942..b8203a8b6 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index d1e52a410..986691dca 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-- Gitee From 977b7b6c74c2fcfadfbdc7ca637e28d2909105f8 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Tue, 4 Mar 2025 15:13:27 +0800 Subject: [PATCH 03/82] enable_vmm --- vllm_mindspore/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 6e2d6f03a..5059621db 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -323,7 +323,7 @@ def check_ready(): set_context(mode=0, device_target="Ascend", max_call_depth=10000) else: - env_setup({"MS_ALLOC_CONF": "enable_vmm:False", }) + env_setup({"MS_ALLOC_CONF": "enable_vmm:True", }) logger.info("Run with native model backend!") -- Gitee From 0077b1f9782246d36c1ba7b207eaa9a1747cb14d Mon Sep 17 00:00:00 2001 From: lijiakun Date: Sun, 2 Mar 2025 12:53:09 +0800 Subject: [PATCH 04/82] create openEuler docker image --- Dockerfile | 71 ++++++++++++------- README.md | 16 ++++- build_image.sh | 181 ------------------------------------------------- 3 files changed, 60 insertions(+), 208 deletions(-) delete mode 100644 build_image.sh diff --git a/Dockerfile b/Dockerfile index 059901e9c..d174da7c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ FROM hub.oepkgs.net/openeuler/openeuler:22.03-lts-sp4 +####################### os ####################### RUN yum clean all && \ yum makecache && \ yum install -y \ @@ -16,74 +17,92 @@ RUN yum clean all && \ ####################### python ####################### WORKDIR /root -RUN wget --no-check-certificate https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh && \ +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh && \ bash /root/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh -b && \ rm /root/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh ENV PATH="/root/miniconda3/bin:$PATH" ENV PYTHONPATH="/root/miniconda3/lib/python3.11/site-packages" -RUN pip config set global.index-url 'https://mirrors.tools.huawei.com/pypi/simple/' && \ - pip config set global.trusted-host mirrors.tools.huawei.com +RUN pip config set global.index-url 'https://pypi.tuna.tsinghua.edu.cn/simple' && \ + pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn ####################### CANN ####################### -COPY ascend_install.info /etc/ascend_install.info -RUN wget --no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-toolkit_8.0.0_linux-aarch64.run" -o Ascend-cann-toolkit_8.0.0_linux-aarch64.run && \ - wget --no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run" -o Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run && \ - wget --no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-nnrt_8.0.0_linux-aarch64.run" -o Ascend-cann-nnrt_8.0.0_linux-aarch64.run && \ +WORKDIR /root +RUN echo "UserName=HwHiAiUser" >> /etc/ascend_install.info && \ + echo "UserGroup=HwHiAiUser" >> /etc/ascend_install.info && \ + echo "Firmware_Install_Type=full" >> /etc/ascend_install.info && \ + echo "Firmware_Install_Path_Param=/usr/local/Ascend" >> /etc/ascend_install.info && \ + echo "Driver_Install_Type=full" >> /etc/ascend_install.info && \ + echo "Driver_Install_Path_Param=/usr/local/Ascend" >> /etc/ascend_install.info && \ + echo "Driver_Install_For_All=no" >> /etc/ascend_install.info && \ + echo "Driver_Install_Mode=normal" >> /etc/ascend_install.info && \ + echo 
"Driver_Install_Status=complete" >> /etc/ascend_install.info +RUN curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-toolkit_8.0.0_linux-aarch64.run" -o Ascend-cann-toolkit.run && \ + curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run" -o Ascend-cann-kernels-910b.run && \ + curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-nnrt_8.0.0_linux-aarch64.run" -o Ascend-cann-nnrt.run && \ chmod a+x *.run && \ - bash /root/Ascend-cann-toolkit_8.0.0_linux-aarch64.run --install -q && \ - bash /root/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run --install -q && \ - bash /Ascend-cann-nnrt_8.0.0_linux-aarch64.run --install -q && \ + bash /root/Ascend-cann-toolkit.run --install -q && \ + bash /root/Ascend-cann-kernels-910b.run --install -q && \ + bash /root/Ascend-cann-nnrt.run --install -q && \ rm /root/*.run - RUN echo "source /usr/local/Ascend/nnrt/set_env.sh" >> /root/.bashrc && \ echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> /root/.bashrc -####################### dev ####################### +####################### dev env ####################### RUN pip install --no-cache-dir \ - cmake \ + cmake>=3.26 \ decorator \ ray==2.42.1 \ + protobuf==3.20.0 \ + ml_dtypes \ wheel \ setuptools \ wrap \ - deprecated + deprecated \ + packaging \ + ninja \ + "setuptools-scm>=8" \ + numpy \ + build WORKDIR /workspace -ARG GITEE_USERNAME -ARG GITEE_PASSWORD -RUN git config --global credential.helper store && \ - echo "https://${GITEE_USERNAME}:${GITEE_PASSWORD}@gitee.com" > /root/.git-credentials - RUN git clone -b br_infer_deepseek_os https://gitee.com/mindspore/mindformers.git /workspace/mindformers && \ cd mindformers && \ + sed -i 's/-i https:\/\/pypi.tuna.tsinghua.edu.cn\/simple//g' build.sh && \ bash build.sh && \ PACKAGE_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])") && \ cp -a research "$PACKAGE_PATH" && \ rm -rf /workspace/mindformers -RUN git clone -b deepseek https://gitee.com/mindspore/golden-stick.git /workspace/golden-stick && \ +RUN git clone https://gitee.com/mindspore/golden-stick.git /workspace/golden-stick && \ cd golden-stick && \ bash build.sh && \ pip install --no-cache-dir /workspace/golden-stick/output/*.whl && \ rm -rf /workspace/golden-stick -RUN git clone https://gitee.com/mindspore/msadapter.git /workspace/msadapter && \ - cd /workspace/msadapter && \ - bash scripts/build_and_reinstall.sh && \ - rm -rf /workspace/msadapter - -# vllm_ms ENV USE_TORCH="FALSE" ENV USE_TF="FALSE" RUN git clone -b v0.6.6.post1 https://gitee.com/mirrors/vllm.git /workspace/vllm && \ cd vllm && \ VLLM_TARGET_DEVICE=empty pip install --no-cache-dir . && \ rm -rf /workspace/vllm + +RUN git clone https://openi.pcl.ac.cn/OpenI/MSAdapter.git /workspace/msadapter && \ + cd /workspace/msadapter && \ + bash scripts/build_and_reinstall.sh && \ + rm -rf /workspace/msadapter + ADD . /workspace/vllm_mindspore RUN cd /workspace/vllm_mindspore && \ pip install --no-cache-dir -r requirements.txt && \ pip install . 
&& \ rm -rf /workspace/vllm_mindspore +RUN wget -O mindspore-2.5.0-cp311-cp311-linux_aarch64.whl \ +https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250303/br_infer_deepseek_os_20250303004707_705727d59236c8c197b25ad0e464c4908434d42f_newest/unified/aarch64/mindspore-2.5.0-cp311-cp311-linux_aarch64.whl && \ +pip install --no-cache-dir mindspore-2.5.0-cp311-cp311-linux_aarch64.whl && \ +rm -f mindspore-2.5.0-cp311-cp311-linux_aarch64.whl + +RUN pip uninstall torch torch-npu torchvision -y + CMD ["bash"] \ No newline at end of file diff --git a/README.md b/README.md index 284abf19e..ed414aaa5 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer ### Installation -Installation from source code +#### Installation from source code ```shell # 1. Uninstall torch-related packages due to msadapter limitations @@ -36,6 +36,20 @@ cd vllm_mindspore pip install . ``` +#### Set up using Docker + +##### Pre-built images + +```shell +docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:v1.0 +``` + +##### Build image from source + +```shell +docker build --network=host . +``` + ### Inference and Serving #### Offline Inference diff --git a/build_image.sh b/build_image.sh deleted file mode 100644 index 801fdcf31..000000000 --- a/build_image.sh +++ /dev/null @@ -1,181 +0,0 @@ -#!/bin/bash - -validate_args() { - if [ $# -lt 2 ]; then - echo "Usage: $0 " - exit 1 - fi - MODEL=$1 - VERSION=$2 -} - -check_proxy() { - if [[ -z "$http_proxy" || -z "$https_proxy" ]]; then - echo "Error: http_proxy and https_proxy must be set." - exit 1 - fi -} - -init_variables() { - case $MODEL in - "300I") - DEVICE=310p - DEVICE_TAG=300I-Duo - ;; - "800I") - DEVICE=910b - DEVICE_TAG=800I-A2 - ;; - "A3") - DEVICE=A3 - DEVICE_TAG=800I-A3 - ;; - *) - echo "Unsupported architecture: $MODEL" - exit 1 - ;; - esac - - FILE_VERSION="${VERSION%.*}-${VERSION##*.}" - IMAGE_FILE_NAME="mindie:dev-${FILE_VERSION}-${DEVICE_TAG}-py311-ubuntu22.04-aarch64" - IMAGE_FILE="${IMAGE_FILE_NAME}.tar.gz" - IMAGE_URL="https://cmc-nkg-artifactory.cmc.tools.huawei.com/artifactory/cmc-nkg-inner/MindIE/ATB-Models/${VERSION}/MindIE-images/${IMAGE_FILE}" - IMAGE_MD5_URL="${IMAGE_URL}.md5" - DOCKER_TAG="ms_vllm_$(date +%Y%m%d)" -} - -print_summary() { - echo "Model: $MODEL" - echo "Version: $VERSION" - echo "Image url: $IMAGE_URL" -} - -update_msadapter() { - rm -rf vllm_mindspore/msadapter - git submodule update --init vllm_mindspore/msadapter || true - cd vllm_mindspore/msadapter || exit 1 - for patch in ../../patch/msadapter/*.patch; do - [ -e "$patch" ] || continue - git apply "$patch" - done - touch __init__.py - touch mindtorch/__init__.py - cd - >/dev/null -} - -function fetch_and_patch_vllm() { - local script_dir=$(cd "$(dirname $0)"; pwd) - local vllm_tag="v0.6.6.post1" - local vllm_source_dir="${script_dir}/vllm-${vllm_tag}" - local patch_dir="${script_dir}/patch/vllm" - - if [ -d "${vllm_source_dir}" ]; then - echo "The ${vllm_source_dir} already exists. Remove it if reinstallation is needed." - exit 1 - fi - - git clone https://github.com/vllm-project/vllm.git -b ${vllm_tag} --depth 1 ${vllm_source_dir} - cd ${vllm_source_dir} - - for patch in $(ls ${patch_dir}); do - sed -i 's/\r//g' ${patch_dir}/${patch} - git apply ${patch_dir}/${patch} - done - cd .. -} - -download_file() { - local url=$1 - local output=$2 - curl -k --noproxy 'cmc-nkg-artifactory.cmc.tools.huawei.com' "$url" -o "$output" - if [ $? 
-ne 0 ]; then - echo "Failed to download $output from $url" - exit 1 - fi -} - -verify_md5() { - local file=$1 - local md5_file=$2 - local downloaded_md5=$(awk '{print $1}' $md5_file) - local calculated_md5=$(md5sum $file | awk '{print $1}') - - if [ "$downloaded_md5" == "$calculated_md5" ]; then - echo "MD5 checksum for $file verified successfully." - return 0 - else - echo "MD5 checksum verification failed!" - echo "Expected: $downloaded_md5" - echo "Got: $calculated_md5" - return 1 - fi -} - -check_or_download() { - local file=$1 - local md5_file=$2 - local file_url=$3 - local md5_url=$4 - - if [ -f "$file" ] && [ -f "$md5_file" ]; then - verify_md5 "$file" "$md5_file" && return 0 - echo "Verification failed. Redownloading files..." - else - echo "Files not found. Downloading..." - fi - - download_file "$md5_url" "$md5_file" - download_file "$file_url" "$file" - verify_md5 "$file" "$md5_file" || { echo "Verification failed after re-downloading. Exiting."; exit 1; } -} - -load_docker_image() { - local file=$1 - docker load -i $file - if [ $? -eq 0 ]; then - echo "Docker image loaded successfully." - else - echo "Failed to load Docker image." - exit 1 - fi -} - -build_docker_image() { - local tag=$1 - docker build \ - --network=host \ - --build-arg http_proxy=$http_proxy \ - --build-arg https_proxy=$https_proxy \ - --build-arg no_proxy=127.0.0.1,*.huawei.com,localhost,local,.local,172.17.0.1,cmc-nkg-artifactory.cmc.tools.huawei.com,mirrors.tools.huawei.com \ - -f Dockerfile \ - -t $tag \ - --target ms_vllm \ - . - - if [ $? -eq 0 ]; then - echo "Docker image $tag built successfully." - else - echo "Failed to build Docker image." - exit 1 - fi -} - -main() { - validate_args "$@" - check_proxy - - init_variables - print_summary - - # update repo - update_msadapter - fetch_and_patch_vllm - - # docker build - check_or_download "mindie.tar.gz" "mindie.tar.gz.md5" "$IMAGE_URL" "$IMAGE_MD5_URL" - load_docker_image "mindie.tar.gz" - sed -i "1s|FROM .* AS base|FROM $IMAGE_FILE_NAME AS base|" Dockerfile - build_docker_image "$DOCKER_TAG" -} - -main "$@" \ No newline at end of file -- Gitee From 09666ca3682aca5f23c29da8c91cb18056c27ca1 Mon Sep 17 00:00:00 2001 From: one_east Date: Tue, 4 Mar 2025 20:32:48 +0800 Subject: [PATCH 05/82] rename vllm-mindspore --- README.md | 4 ++-- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a228c1aa6..8a9b96ae9 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer pip3 uninstall torch torch-npu torchvision # 2.Install vllm_mindspore -git clone https://gitee.com/mindspore/vllm_mindspore.git -cd vllm_mindspore +git clone https://gitee.com/mindspore/vllm-mindspore.git +cd vllm-mindspore pip install . 
``` diff --git a/setup.py b/setup.py index a6c147505..0a589407a 100644 --- a/setup.py +++ b/setup.py @@ -96,9 +96,9 @@ setup( ), long_description=read_readme(), long_description_content_type="text/markdown", - url="https://gitee.com/mindspore/vllm_mindspore", + url="https://gitee.com/mindspore/vllm-mindspore", project_urls={ - "Homepage": "https://gitee.com/mindspore/vllm_mindspore", + "Homepage": "https://gitee.com/mindspore/vllm-mindspore", "Documentation": "", }, classifiers=[ -- Gitee From 8c625cf5ffafa092f614157f264366e7cb179b06 Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Wed, 5 Mar 2025 00:27:44 +0800 Subject: [PATCH 06/82] update mf config load checkpoint path --- .../models/mf_models/deepseek_v3.py | 9 +++++++++ .../model_executor/models/mf_models/qwen2.py | 1 + .../model_executor/models/model_base.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 026988af6..ce3faddbd 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -97,6 +97,7 @@ class DeepseekV3ForCausalLM(MsModelBase): get_tensor_model_parallel_world_size() ) self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) self.mf_model_config.num_blocks = cal_block_num(self.cache_config, self.model_config, self.parallel_config) @@ -230,3 +231,11 @@ class DeepseekV3ForCausalLM(MsModelBase): ) self.network.set_dynamic_inputs() return None + + def get_model_path(self): + model_name_or_path = self.model_config.model + if os.path.isdir(model_name_or_path): + return model_name_or_path + else: + raise ValueError("The 'model' in LLM should be the local path of the MindSpore checkpoint file.") + diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index afde4466a..ee5a1dc8a 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -111,6 +111,7 @@ class Qwen2ForCausalLM(MsModelBase): jit_level="O0", infer_boost="on" ).jit_config_dict + self.mf_config.load_checkpoint = self.get_model_path() set_output_path(self.mf_config.output_dir) set_strategy_save_path(self.mf_config.parallel) # update safetensor path diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 86c12e252..4163b65ad 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -15,6 +15,7 @@ # limitations under the License. 
# ============================================================================ +import os from abc import abstractmethod from typing import Iterable, List, Optional, Set, Tuple, Union, Dict @@ -41,9 +42,26 @@ class MsModelBase(): self.lora_config = lora_config self.cache_config = vllm_config.cache_config self.parallel_config = vllm_config.parallel_config + self.load_config = vllm_config.load_config self.modules_dict = None + def get_model_path(self): + model_name_or_path = self.model_config.model + if os.path.isdir(model_name_or_path): + return model_name_or_path + else: + from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf + allow_patterns = ["*.safetensors"] + revision = self.model_config.revision + return download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + allow_patterns, + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + def set_modules(self, model_dicts: Dict[str, nn.Cell]): self.modules_dict = model_dicts -- Gitee From f83b74161496b63e84721c5ba92dbfe052d16766 Mon Sep 17 00:00:00 2001 From: w00521005 Date: Tue, 4 Mar 2025 19:20:09 +0800 Subject: [PATCH 07/82] add warmup --- vllm_mindspore/worker/worker.py | 43 ++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 986691dca..807f9e2ff 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -19,6 +19,7 @@ """Worker functions""" import gc import os +import math from typing import Tuple, Optional import torch @@ -34,18 +35,52 @@ from vllm.distributed import ( from vllm.logger import init_logger from vllm_mindspore.utils import is_mindformers_model_backend +from vllm.model_executor import set_random_seed +from vllm.sequence import SequenceGroupMetadata +from vllm.sampling_params import SamplingParams logger = init_logger(__name__) +def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefill): + bs = 1 + seq_len = model_config.max_seq_len_to_capture if is_prefill else 1 + dummy_data = model_runner.input_registry.dummy_data_for_profiling(model_config, seq_len, model_runner.mm_registry) + block_tables = [i for i in range(math.ceil(seq_len / cache_engine.block_size))] + seqs = [ + SequenceGroupMetadata( + request_id=str(idx), + is_prompt=is_prefill, + seq_data={idx: dummy_data.seq_data}, + sampling_params=SamplingParams(), + block_tables={idx: block_tables}, + lora_request=None, + multi_modal_data=None, + multi_modal_placeholders=None, + ) + for idx in range(bs) + ] + + model_input = model_runner.prepare_model_input(seqs) + return model_input + + def _warm_up_model(self) -> None: + # cache_engine is a list with length equal to the size of pipeline-parallel, and only pp=1 is supported. + kv_cache = self.cache_engine[0].gpu_cache + + # warmup for prefill + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True) + self.model_runner.execute_model(model_input, kv_cache, None) + torch.cuda.synchronize() + # warmup for decode + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False) + self.model_runner.execute_model(model_input, kv_cache, None) + torch.cuda.synchronize() + # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. - from vllm.model_executor import set_random_seed - - # TODO(tronzhang): model compile here. 
- set_random_seed(self.model_config.seed) -- Gitee From b53d9658f0cf9fbbdf30474a3e55320368befeb0 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Thu, 6 Mar 2025 21:51:20 +0800 Subject: [PATCH 08/82] Reduce device memory usage --- .../model_executor/layers/linear.py | 23 ++++++++----------- .../layers/vocab_parallel_embedding.py | 2 +- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 339df9669..09e1b84a5 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -273,20 +273,19 @@ class ColumnParallelLinear(LinearBase): use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here if output_dim is not None and not use_bitsandbytes_4bit: - shard_size = param_data.shape[output_dim] + shard_size = param.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() # Special case for loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert param.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # param.set_data(loaded_weight) # param[:, start_idx:start_idx + shard_size] = loaded_weight @@ -377,7 +376,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): # bitsandbytes loads the weights of the specific portion # no need to narrow here if not use_bitsandbytes_4bit: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # param_data.set_data(loaded_weight) @@ -460,7 +459,7 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx = shard_id * shard_size if not use_bitsandbytes_4bit: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape if param.name.endswith("weight"): self.weight[shard_offset: shard_offset + shard_size, :] = loaded_weight @@ -528,7 +527,7 @@ class RowParallelLinear(LinearBase): ) if bias: - self.bias = Parameter(mint.zeros(self.output_size), dtype=params_dtype) + self.bias = Parameter(mint.zeros(self.output_size, dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -569,24 +568,22 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here if input_dim is not None and not use_bitsandbytes_4bit: - shard_size = param_data.shape[input_dim] + shard_size = param.shape[input_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size).contiguous() # Special case for 
loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert param.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # self.weight[:, start_idx : start_idx + shard_size] = loaded_weight - param.set_data(loaded_weight) + param.set_data(loaded_weight.contiguous()) diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index 76fa68681..d2e00d199 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -359,7 +359,7 @@ class VocabParallelEmbedding(nn.Cell): f" but got {loaded_weight.shape[output_dim]} and {self.org_vocab_size}") # Copy the data. - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() param[: loaded_weight.shape[0]] = loaded_weight param[loaded_weight.shape[0]:] = 0 -- Gitee From 284fe0ca0ca8bfbc0b853d1d2c8b2423857d15ae Mon Sep 17 00:00:00 2001 From: TronZhang Date: Sat, 8 Mar 2025 09:58:17 +0000 Subject: [PATCH 09/82] update README.md. Signed-off-by: TronZhang --- README.md | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 8a9b96ae9..5ea56601b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# vllm_mindspore +# vllm-mindspore ## Overview @@ -6,7 +6,7 @@ The `vllm-mindspore`is a integration for running vLLM on the MindSpore framework This is the recommended solution for supporting the MindSpore within the vLLM community. It provides deep integration with the MindSpore framework, offering efficient computation and optimization support for vLLM, enabling seamless operation on MindSpore. -By using the `vllm-mindspore`, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, and Multi-modal LLMs, can run seamlessly for training and inference on the MindSpore framework. +By using the `vllm-mindspore`, popular open-source models, can run seamlessly for training and inference on the MindSpore framework. --- @@ -26,17 +26,7 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer #### Installation from source code -```shell - -# 1. Uninstall torch-related packages due to msadapter limitations -pip3 uninstall torch torch-npu torchvision - -# 2.Install vllm_mindspore -git clone https://gitee.com/mindspore/vllm-mindspore.git -cd vllm-mindspore -pip install . - -``` +Install from source code. [Wiki Installation.](https://gitee.com/mindspore/vllm-mindspore/wikis/Getting%20Started/Installation) #### Set up using Docker @@ -58,23 +48,28 @@ docker build --network=host . You can run vllm_mindspore in your own code on a list of prompts. +```bash +export ASCEND_TOTAL_MEMORY_GB=64 # Based on the ascend device. +``` + ```python import vllm_mindspore # Add this line on the top of script. + from vllm import LLM, SamplingParams # Sample prompts. prompts = [ "I am", "Today is", - "Llama is" + "What is" ] # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0, top_p=0.95) # Create an LLM. -llm = LLM(model="meta-llama/Llama-2-7b-hf") +llm = LLM(model="Qwen/Qwen2.5-32B-Instruct", tensor_parallel_size=8) # Generate texts from the prompts. 
The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) @@ -90,7 +85,7 @@ for output in outputs: You can start the server via the vllm_mindspore command: -`python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "meta-llama/Llama-2-7b-hf"` +`python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --tensor_parallel_size=8` To call the server, you can use `curl` or any other HTTP client. @@ -99,8 +94,8 @@ To call the server, you can use `curl` or any other HTTP client. curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-2-7b-hf", - "prompt": "Llama is", + "model": "Qwen/Qwen2.5-32B-Instruct", + "prompt": "MindSpore is", "max_tokens": 120, "temperature": 0 }' -- Gitee From 239e6aabfc1899c6be0f79588d44e5aaee24e7ac Mon Sep 17 00:00:00 2001 From: moran Date: Tue, 4 Mar 2025 15:52:31 +0800 Subject: [PATCH 10/82] move st to root --- .jenkins/test/config/dependent_packages.yaml | 4 ++-- {vllm_mindspore/tests => tests}/st/python/__init__.py | 0 {vllm_mindspore/tests => tests}/st/python/test_demo.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) rename {vllm_mindspore/tests => tests}/st/python/__init__.py (100%) rename {vllm_mindspore/tests => tests}/st/python/test_demo.py (96%) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 6c3962997..535cd9fd1 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,8 +1,8 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202502/20250227/master_20250227211723_94ac228bae9cd6d0f00b4ce8d5857773799c4f26_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250305/br_infer_deepseek_os_20250305001023_4011166933d7e6230601ac1ad07bfe1a8329541d/' mindformers: - 'https://repo.mindspore.cn/mindspore/mindformers/version/202502/20250228/dev_20250228220021_4e90ca405720ea2f4a0abdf501d01078f28d724c_newest/' + 'https://repo.mindspore.cn/mindspore/mindformers/version/202503/20250303/br_infer_deepseek_os_20250303142905_569a4261552abe2984651bd31d675d76c5f51fb0_newest/' msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250301/master_20250301_newest/' diff --git a/vllm_mindspore/tests/st/python/__init__.py b/tests/st/python/__init__.py similarity index 100% rename from vllm_mindspore/tests/st/python/__init__.py rename to tests/st/python/__init__.py diff --git a/vllm_mindspore/tests/st/python/test_demo.py b/tests/st/python/test_demo.py similarity index 96% rename from vllm_mindspore/tests/st/python/test_demo.py rename to tests/st/python/test_demo.py index f5b8fae04..d6e1fd0e8 100644 --- a/vllm_mindspore/tests/st/python/test_demo.py +++ b/tests/st/python/test_demo.py @@ -21,6 +21,7 @@ class TestDemo: """ Test Demo for ST. """ + @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single @@ -28,6 +29,8 @@ class TestDemo: """ test case aaa """ + # pylint: disable=W0611 + import vllm_mindspore from vllm import LLM, SamplingParams # Sample prompts. 
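The hunk above relocates the system test to `tests/st/python/test_demo.py` and keeps it gated behind the `level0`, `platform_arm_ascend910b_training` and `env_single` markers. As a rough illustration of how such a marker-gated case could be selected locally (assuming pytest is available, the command is run from the repository root, and unknown-marker warnings are acceptable unless the markers are registered):

```python
# Minimal sketch, not part of the patch: select the demo ST case by marker.
import pytest

# Equivalent to running: pytest -vs -m "level0" tests/st/python/test_demo.py
raise SystemExit(pytest.main(["-vs", "-m", "level0", "tests/st/python/test_demo.py"]))
```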
-- Gitee From da0c60124a545e5df7c3028df2e12ec2c57ac599 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Wed, 5 Mar 2025 02:53:11 +0800 Subject: [PATCH 11/82] =?UTF-8?q?sampler.py=E4=BB=A3=E7=A0=81=E5=AF=B9?= =?UTF-8?q?=E9=BD=90=E5=8E=9F=E7=94=9Fsampler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../model_executor/layers/sampler.py | 480 ++++++++---------- 1 file changed, 210 insertions(+), 270 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index cd4e3d370..ed864ace9 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -21,30 +21,23 @@ import itertools import warnings import mindspore as ms from mindspore.common.api import _pynative_executor +import numpy as np from dataclasses import dataclass from importlib.util import find_spec from math import inf from typing import Dict, Iterator, List, Optional, Tuple, Union -# TODO(tronzhang): for some ops, msadaptor cannnot support, latter use vllm's... - import msgspec import torch import torch.nn as nn import vllm.envs as envs -from vllm_mindspore.model_executor.layers.utils import apply_penalties from vllm.sampling_params import SamplingType -from vllm.sequence import ( - VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, - Logprob, - PromptLogprobs, - SampleLogprobs, - SequenceOutput, -) +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, + CompletionSequenceGroupOutput, Logprob, + PromptLogprobs, SampleLogprobs, SequenceOutput) from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - +from vllm_mindspore.model_executor.layers.utils import apply_penalties from vllm_mindspore.model_executor.sampling_metadata import ( SamplingMetadata, SamplingTensors, @@ -66,6 +59,7 @@ else: flashinfer_top_k_top_p_sampling = None + def get_sampler() -> torch.nn.Module: return Sampler() @@ -75,7 +69,8 @@ SampleResultType = List[Tuple[List[int], List[int]]] # Types of temporary data structures used for # computing sample_result -SampleMetadataType = Dict[SamplingType, Tuple[List[int], List[SequenceGroupToSample]]] +SampleMetadataType = Dict[SamplingType, Tuple[List[int], + List[SequenceGroupToSample]]] MultinomialSamplesType = Dict[SamplingType, torch.Tensor] SampleResultsDictType = Dict[int, Tuple[List[int], List[int]]] @@ -109,8 +104,9 @@ SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] class SamplerOutput( - msgspec.Struct, omit_defaults=True, array_like=True # type: ignore[call-arg] -): # type: ignore[call-arg] + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. @@ -168,24 +164,21 @@ class SamplerOutput( return len(self.outputs) def __eq__(self, other: object): - return isinstance(other, self.__class__) and self.outputs == other.outputs + return isinstance(other, + self.__class__) and self.outputs == other.outputs def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise.""" - sampled_token_probs_repr = ( - "None" - if self.sampled_token_probs is None - else self.sampled_token_probs.shape - ) - sampled_token_ids_repr = ( - "None" if self.sampled_token_ids is None else self.sampled_token_ids.shape - ) + """Show the shape of a tensor instead of its values to reduce noise. 
+ """ + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None + else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else + self.sampled_token_ids.shape) return ( f"SamplerOutput(outputs={self.outputs}, " f"sampled_token_probs={sampled_token_probs_repr}, " f"sampled_token_ids={sampled_token_ids_repr}, " - f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})" - ) + f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") class Sampler(nn.Module): @@ -235,11 +228,9 @@ class Sampler(nn.Module): self._sampling_tensors = None # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) = ( - SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype - ) - ) + (sampling_tensors, do_penalties, do_top_p_top_k, + do_min_p) = SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties @@ -292,14 +283,11 @@ class Sampler(nn.Module): # Apply presence and frequency penalties. if do_penalties: - logits = apply_penalties( - logits, - sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties, - ) + logits = apply_penalties(logits, sampling_tensors.prompt_tokens, + sampling_tensors.output_tokens, + sampling_tensors.presence_penalties, + sampling_tensors.frequency_penalties, + sampling_tensors.repetition_penalties) # Use float32 to apply temperature scaling. # Use in-place division to avoid creating a new tensor. @@ -307,9 +295,8 @@ class Sampler(nn.Module): logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p( - logits, sampling_tensors.top_ps, sampling_tensors.top_ks - ) + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -318,7 +305,6 @@ class Sampler(nn.Module): # Compute the probabilities. probs = torch.softmax(logits, dim=-1, dtype=torch.float) # Compute the log probabilities. - logprobs = ms.ops.log_softmax(logits, axis=-1).to(torch.float) # Sample the next tokens. @@ -347,10 +333,10 @@ class Sampler(nn.Module): sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: # Pythonize logprobs now (GPU -> CPU); do not defer. 
- assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results - ) + logprobs, sampling_metadata, maybe_deferred_sample_results) return _build_sampler_output( maybe_deferred_sample_results, @@ -358,8 +344,7 @@ class Sampler(nn.Module): prompt_logprobs, sample_logprobs, on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output, - ) + skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) def forward( self, @@ -389,7 +374,7 @@ def _apply_min_tokens_penalty( sampling_metadata: SamplingMetadata, ) -> torch.Tensor: """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet + have not been generated yet """ # list of indices in logits that will be set to -inf logits_to_penalize: List[Tuple[int, int]] = [] @@ -399,7 +384,8 @@ def _apply_min_tokens_penalty( sampling_params = seq_group.sampling_params sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len(seq_group.prompt_logprob_indices) + logits_applied += len(sample_indices) + len( + seq_group.prompt_logprob_indices) if not seq_group.do_sample: continue @@ -418,8 +404,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] # itertools.product pairs each seq index with every token id logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize) - ) + itertools.product(seqs_to_penalize, token_ids_to_penalize)) if logits_to_penalize: # use zip and * to group indices along each dimension @@ -436,14 +421,14 @@ def _apply_top_k_top_p( p: torch.Tensor, k: torch.Tensor, ) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(axis=-1, descending=False) + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) # Apply top-k. top_k_mask = logits_sort.size(1) - k.to(torch.long) # Get all the top_k values. top_k_mask = logits_sort.gather(top_k_mask, 0) top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill(top_k_mask, -float("inf")) + logits_sort.masked_fill_(top_k_mask, -float("inf")) # Apply top-p. probs_sort = logits_sort.softmax(axis=-1) @@ -451,12 +436,12 @@ def _apply_top_k_top_p( top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) # at least one top_p_mask[:, -1] = False - logits_sort.masked_fill(top_p_mask, -float("inf")) + logits_sort.masked_fill_(top_p_mask, -float("inf")) # Re-sort the probabilities. 
- logits = torch.empty_like(logits_sort).scatter( - axis=-1, index=logits_idx, src=logits_sort - ) + logits = torch.empty_like(logits_sort).scatter_(dim=-1, + index=logits_idx, + src=logits_sort) return logits @@ -468,11 +453,11 @@ def _apply_min_p( Adapted from https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 """ - probs = torch.softmax(logits, axis=-1) - top_probs, _ = probs.max(axis=-1, keepdims=True) - scaled_min_p = min_p.unsqueeze_(axis=1) * top_probs + probs = torch.softmax(logits, dim=-1) + top_probs, _ = probs.max(dim=-1, keepdim=True) + scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill(tokens_to_remove, -float("inf")) + logits = logits.masked_fill_(tokens_to_remove, -float("inf")) return logits @@ -503,7 +488,8 @@ def _greedy_sample( seq_ids = seq_group.seq_ids num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, "Greedy sampling should have only one seq." + assert num_parent_seqs == 1, ( + "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) @@ -528,6 +514,7 @@ def _random_sample( seq_group has do_sample=False, tuple contains ([], []) """ # Find the maximum n value of the prompt phase requests. + random_samples = random_samples.cpu() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: @@ -542,13 +529,13 @@ def _random_sample( if is_prompt: # Prompt phase. parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[sample_idx, : sampling_params.n].tolist() + next_token_ids = random_samples[ + sample_idx, :sampling_params.n].tolist() else: # Generation phase. parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[ - sample_idx : sample_idx + num_parent_seqs, 0 - ].tolist() + next_token_ids = random_samples[sample_idx:sample_idx + + num_parent_seqs, 0].tolist() results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -589,25 +576,29 @@ def _beam_search_sample( seq_ids, sampling_params = seq_group.seq_ids, seq_group.sampling_params num_parent_seqs = len(seq_ids) beam_width = sampling_params.n - seq_group_logprobs = logprobs[sample_idx : sample_idx + num_parent_seqs] + seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] if is_prompt: # Prompt phase. - assert num_parent_seqs == 1, "Prompt input should have only one seq." + assert num_parent_seqs == 1, ( + "Prompt input should have only one seq.") parent_ids = [0] * (2 * beam_width) - _, next_token_ids = torch.topk(seq_group_logprobs[0], 2 * beam_width) + _, next_token_ids = torch.topk(seq_group_logprobs[0], + 2 * beam_width) next_token_ids = next_token_ids.tolist() else: # Generation phase. 
cumulative_logprobs: List[float] = [ - seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids + seq_group.seq_data[seq_id].cumulative_logprob + for seq_id in seq_ids ] cumulative_logprobs_tensor = torch.tensor( - cumulative_logprobs, dtype=torch.float, device=seq_group_logprobs.device - ) - seq_group_logprobs = ( - seq_group_logprobs + cumulative_logprobs_tensor.unsqueeze(1) - ) - _, topk_ids = torch.topk(seq_group_logprobs.flatten(), 2 * beam_width) + cumulative_logprobs, + dtype=torch.float, + device=seq_group_logprobs.device) + seq_group_logprobs = (seq_group_logprobs + + cumulative_logprobs_tensor.unsqueeze(dim=1)) + _, topk_ids = torch.topk(seq_group_logprobs.flatten(), + 2 * beam_width) topk_ids = topk_ids.tolist() vocab_size = seq_group_logprobs.size(-1) parent_ids = [i // vocab_size for i in topk_ids] @@ -621,9 +612,6 @@ def _beam_search_sample( def exponential(x, lambd=1.0, *, generator=None): if generator is not None: raise ValueError("`generator` can not be supported.") - import numpy as np - import mindspore as ms - output = np.random.exponential(scale=lambd, size=x.shape) return ms.Tensor(output).astype(x.dtype) @@ -643,7 +631,6 @@ def _multinomial( q = torch.empty_like(probs) if seq_groups is None: q = exponential(q) - # q.exponential_() else: sample_idx = 0 for seq_group in seq_groups: @@ -653,26 +640,21 @@ def _multinomial( q[sample_idx : sample_idx + stride] = exponential( q[sample_idx : sample_idx + stride] ) - # q[sample_idx:sample_idx + - # stride].exponential_(generator=seq_group.generator) sample_idx += stride return probs.div(q).argmax(axis=1).view(-1, num_samples) def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, - top_ks: torch.Tensor, - top_ps: torch.Tensor, - num_samples: int, - seq_groups: Optional[List[SequenceGroupToSample]], -): + probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, + num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): max_top_k_round = 32 if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) top_ks = top_ks.repeat_interleave(num_samples) top_ps = top_ps.repeat_interleave(num_samples) batch_size = probs.shape[0] - uniform_samples = torch.empty((max_top_k_round, batch_size), device=probs.device) + uniform_samples = torch.empty((max_top_k_round, batch_size), + device=probs.device) if seq_groups is None: uniform_samples.uniform_() else: @@ -681,9 +663,8 @@ def _top_k_top_p_multinomial_with_flashinfer( seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - uniform_samples[:, sample_idx : sample_idx + stride].uniform_( - generator=seq_group.generator - ) + uniform_samples[:, sample_idx:sample_idx + + stride].uniform_(generator=seq_group.generator) sample_idx += stride batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, @@ -692,19 +673,18 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("FlashInfer rejection sampling failed, fallback.", stacklevel=1) + warnings.warn("FlashInfer rejection sampling failed, fallback.", + stacklevel=1) probs = flashinfer.sampling.top_k_renorm_prob(probs, top_ks) probs = flashinfer.sampling.top_p_renorm_prob(probs, top_ps) batch_next_token_ids = flashinfer.sampling.sampling_from_probs( - probs, uniform_samples[0] - ) + probs, uniform_samples[0]) return batch_next_token_ids.view(-1, num_samples) def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType, -) -> SampleResultType: - """This function consumes 
GPU-side sampler results and computes + sample_result_args: SampleResultArgsType) -> SampleResultType: + '''This function consumes GPU-side sampler results and computes Pythonized CPU-side sampler results (GPU -> CPU sync.) Single-step scheduling: this function is invoked at sampling-time @@ -718,7 +698,7 @@ def get_pythonized_sample_results( Returns: Pythonized sampler results - """ + ''' ( sample_metadata, @@ -743,11 +723,11 @@ def get_pythonized_sample_results( if sampling_type == SamplingType.GREEDY: sample_results = _greedy_sample(seq_groups, greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample( - seq_groups, multinomial_samples[sampling_type] - ) + sample_results = _random_sample(seq_groups, + multinomial_samples[sampling_type]) elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, beam_search_logprobs) + sample_results = _beam_search_sample(seq_groups, + beam_search_logprobs) sample_results_dict.update(zip(seq_group_id, sample_results)) return [ @@ -764,7 +744,7 @@ def _sample_with_torch( include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> SampleReturnType: - """Torch-oriented _sample() implementation. + '''Torch-oriented _sample() implementation. Single-step scheduling: * Perform GPU-side sampling computation @@ -774,11 +754,11 @@ def _sample_with_torch( * Perform GPU-side sampling computation * Defer Pythonization & preserve GPU-side tensors required for Pythonization - """ + ''' - categorized_seq_group_ids: Dict[SamplingType, List[int]] = { - t: [] for t in SamplingType - } + categorized_seq_group_ids: Dict[SamplingType, + List[int]] = {t: [] + for t in SamplingType} categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params @@ -793,12 +773,10 @@ def _sample_with_torch( # Create output tensor for sampled token ids. if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full( - (logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device, - ) + sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), + VLLM_INVALID_TOKEN_ID, + dtype=torch.long, + device=logprobs.device) else: sampled_token_ids_tensor = None @@ -815,21 +793,21 @@ def _sample_with_torch( sample_metadata[sampling_type] = (seq_group_id, seq_groups) long_sample_indices = sample_indices.long() if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) + greedy_samples = torch.argmax(logprobs[long_sample_indices], + dim=-1) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = ( - greedy_samples.unsqueeze(-1) - ) + sampled_token_ids_tensor[ + long_sample_indices] = greedy_samples.unsqueeze(-1) if modify_greedy_probs: # If required, modify the probabilities such that sampling from # the modified distribution would always sample the argmax # token id. 
- _modify_greedy_probs_inplace( - logprobs, probs, long_sample_indices, greedy_samples - ) + _modify_greedy_probs_inplace(logprobs, probs, + long_sample_indices, + greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_n_in_batch = 1 @@ -837,32 +815,28 @@ def _sample_with_torch( if seq_group.is_prompt: sampling_params = seq_group.sampling_params max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = ( - None if sampling_type == SamplingType.RANDOM else seq_groups - ) + seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else + seq_groups) if flashinfer_top_k_top_p_sampling is not None: - multinomial_samples[sampling_type] = ( - _top_k_top_p_multinomial_with_flashinfer( + multinomial_samples[ + sampling_type] = _top_k_top_p_multinomial_with_flashinfer( probs[long_sample_indices], sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], max_n_in_batch, seq_groups_arg, ) - ) else: multinomial_samples[sampling_type] = _multinomial( probs[long_sample_indices], max_n_in_batch, - seq_groups=seq_groups_arg, - ) + seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = multinomial_samples[ - sampling_type - ].to(torch.long) + sampled_token_ids_tensor[long_sample_indices] = \ + multinomial_samples[sampling_type].to(torch.long) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] @@ -877,17 +851,14 @@ def _sample_with_torch( multinomial_samples=multinomial_samples, greedy_samples=greedy_samples, beam_search_logprobs=beam_search_logprobs, - sample_results_dict=sample_results_dict, - ) + sample_results_dict=sample_results_dict) if not sampling_metadata.skip_sampler_cpu_output: # GPU<->CPU sync happens here. # This also converts the sampler output to a Python object. # Return Pythonized sampler result & sampled token ids - return ( - get_pythonized_sample_results(maybe_deferred_args), - sampled_token_ids_tensor, - ) + return get_pythonized_sample_results( + maybe_deferred_args), sampled_token_ids_tensor else: # Defer sampler result Pythonization; return deferred # Pythonization args & sampled token ids @@ -941,8 +912,9 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: Each element in the returned tensor represents the rank of the chosen token in the input logprob tensor. """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), indices] - result = x > vals[:, None] + vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), + indices] + result = (x > vals[:, None]) del vals return result.sum(1).add_(1) @@ -990,14 +962,15 @@ def get_logprobs( # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): + for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, + sample_results): sampling_params = seq_group.sampling_params # Update indices and tokens for prompt logprobs. 
- if seq_group.is_prompt and sampling_params.prompt_logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.prompt_logprobs - ) + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) next_token_ids.extend(next_prompt_tokens) @@ -1011,14 +984,12 @@ def get_logprobs( # we can obtain it from `sample_result[1]`. query_idx = seq_group.sample_indices[0] query_indices.extend( - [query_idx + parent_id for parent_id in parent_seq_ids] - ) + [query_idx + parent_id for parent_id in parent_seq_ids]) next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.logprobs - ) + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.logprobs) assert len(next_token_ids) == len(query_indices) @@ -1034,16 +1005,15 @@ def get_logprobs( # skip the whole logprob calculation. if largest_num_logprobs >= 0: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) + next_token_ids_gpu = torch.tensor(next_token_ids, + device=logprobs.device) # (num_selected_query_tokens, num_logprobs). Note that query_indices can # contain duplicates if beam search is enabled. - selected_logprobs = logprobs[ - [ - query_indices_gpu, - next_token_ids_gpu, - ] - ] + selected_logprobs = logprobs[[ + query_indices_gpu, + next_token_ids_gpu, + ]] ranks = _get_ranks( logprobs[query_indices_gpu], next_token_ids_gpu, @@ -1054,14 +1024,14 @@ def get_logprobs( if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). - top_logprobs, top_token_ids = torch.topk( - logprobs, largest_num_logprobs, dim=-1 - ) - top_logprobs = top_logprobs.to("cpu") - top_token_ids = top_token_ids.to("cpu") + top_logprobs, top_token_ids = torch.topk(logprobs, + largest_num_logprobs, + dim=-1) + top_logprobs = top_logprobs.to('cpu') + top_token_ids = top_token_ids.to('cpu') - selected_logprobs = selected_logprobs.to("cpu") - ranks = ranks.to("cpu") + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') # Find prompt/sample logprobs. 
prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] @@ -1069,32 +1039,18 @@ def get_logprobs( top_logprob_idx = 0 selected_logprobs_idx = 0 - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): - (prompt_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_prompt_logprob_if_needed( - seq_group, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + for seq_group, sample_result in zip(sampling_metadata.seq_groups, + sample_results): + (prompt_logprobs, top_logprob_idx, + selected_logprobs_idx) = _get_prompt_logprob_if_needed( + seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, + selected_logprobs_idx, top_logprob_idx) prompt_logprobs_per_seq_group.append(prompt_logprobs) - (sampled_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_sampled_logprob_if_needed( - seq_group, - sample_result, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + (sampled_logprobs, top_logprob_idx, + selected_logprobs_idx) = _get_sampled_logprob_if_needed( + seq_group, sample_result, selected_logprobs, ranks, top_token_ids, + top_logprobs, selected_logprobs_idx, top_logprob_idx) sample_logprobs_per_seq_group.append(sampled_logprobs) return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group @@ -1122,11 +1078,10 @@ def _get_prompt_logprob_if_needed( # Pre-select indexes and create a list. It is faster than calling .item # repetitively. selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() for idx, token_id in enumerate(next_prompt_tokens): # Calculate the prompt logprob of the real prompt tokens. @@ -1137,23 +1092,22 @@ def _get_prompt_logprob_if_needed( # Add top K prompt logprobs along with its rank. if num_logprobs > 0: - top_ids = top_token_ids[top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[top_logprob_idx, :num_logprobs].tolist() + top_ids = top_token_ids[ + top_logprob_idx, :num_logprobs].tolist() + top_probs = top_logprobs[ + top_logprob_idx, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, top_ranks) - } - ) - prompt_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - } - ) + prompt_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip(top_ids, top_probs, + top_ranks) + }) + prompt_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in prompt_logprobs_dict.items() + }) # + 1 to go to the next prompt token. top_logprob_idx += 1 @@ -1188,44 +1142,37 @@ def _get_sampled_logprob_if_needed( # Pre-select items from tensor. tolist() is faster than repetitive # `.item()` calls. 
selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids) - ): + zip(next_token_ids, parent_seq_ids)): # Get the logprob of a sampled token. sampled_logprobs_dict = { - next_token_id: (selected_logprob_items[idx], rank_items[idx]) + next_token_id: + (selected_logprob_items[idx], rank_items[idx]) } if num_logprobs is not None and num_logprobs > 0: # Get top K logprobs. - top_ids = top_token_ids[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_ids = top_token_ids[top_logprob_idx + + parent_id, :num_logprobs].tolist() top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_logprob_idx + parent_id, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks - ) - } - ) + sampled_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip( + top_ids, top_probs, top_ranks) + }) - sampled_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in sampled_logprobs_dict.items() - } - ) + sampled_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in + sampled_logprobs_dict.items() + }) # NOTE: This part of code is not intuitive. `selected_logprobs` include # logprobs for the current step, which has len(next_token_ids) tokens @@ -1239,12 +1186,9 @@ def _get_sampled_logprob_if_needed( return sampled_logprobs, top_logprob_idx, selected_logprobs_idx -def _modify_greedy_probs_inplace( - logprobs: torch.Tensor, - probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor, -) -> None: +def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, + sample_indices: torch.Tensor, + greedy_samples: torch.Tensor) -> None: """Modify the probability distributions of the greedily-sampled tokens such that each sampled token has a "probability" of 1.0. This is required by speculative decoding, which depends on the sampling method being encoded @@ -1297,7 +1241,8 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: Optional[List[Optional[PromptLogprobs]]], sample_logprobs: Optional[List[SampleLogprobs]], - on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], + on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, + torch.Tensor]], skip_sampler_cpu_output: bool = False, ) -> SamplerOutput: """Construct Python objects with the output of sampling. 
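A detail of this sampler rewrite that is easy to miss is the `_multinomial` helper earlier in the patch: rather than calling a multinomial kernel, it divides the probabilities by Exponential(1) noise and takes the argmax, which selects index `i` with probability `p_i / sum(p)`. A self-contained numeric check of that trick (NumPy only; the function name below is illustrative and not part of the patch):

```python
import numpy as np

def sample_one(probs: np.ndarray) -> int:
    # argmax of p_i / E_i with E_i ~ Exp(1) equals the argmin of E_i / p_i,
    # i.e. the minimum of independent Exp(p_i) variables, which lands on
    # index i with probability p_i / sum(p).
    noise = np.random.exponential(scale=1.0, size=probs.shape)
    return int(np.argmax(probs / noise))

probs = np.array([0.1, 0.2, 0.7])
draws = np.array([sample_one(probs) for _ in range(20000)])
print(np.bincount(draws, minlength=3) / len(draws))  # roughly [0.1, 0.2, 0.7]
```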
@@ -1316,46 +1261,40 @@ def _build_sampler_output( else: assert prompt_logprobs is not None assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) deferred_sample_results_args = None - for ( - seq_group, - sample_result, - group_prompt_logprobs, - group_sample_logprobs, - ) in zip( - sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, - sample_logprobs, - ): + for (seq_group, sample_result, group_prompt_logprobs, + group_sample_logprobs) in zip(sampling_metadata.seq_groups, + maybe_deferred_sample_results, + prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs - ): + parent_ids, next_token_ids, group_sample_logprobs): seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, logprobs) - ) + SequenceOutput(seq_ids[parent_id], next_token_id, + logprobs)) sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs) - ) + CompletionSequenceGroupOutput(seq_outputs, + group_prompt_logprobs)) # If not specified, store None values in SamplerOutput. if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, sampled_token_ids) = on_device_tensors + (sampled_token_probs, logprobs_tensor, + sampled_token_ids) = on_device_tensors else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, None) + sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, + None) return SamplerOutput( outputs=sampler_output, sampled_token_probs=sampled_token_probs, sampled_token_ids=sampled_token_ids, logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args, - ) + deferred_sample_results_args=deferred_sample_results_args) def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: @@ -1372,9 +1311,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: Returns: A list of next prompt tokens to compute logprob. """ - assert ( - seq_group.is_prompt - ), "Caller should ensure the sequence group is in a prefill stage." + assert seq_group.is_prompt, ( + "Caller should ensure the sequence group is in a prefill stage.") seq_ids = seq_group.seq_ids query_len = seq_group.query_len assert query_len is not None @@ -1385,6 +1323,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: prompt_tokens = seq_data.prompt_token_ids # +1 because we are looking for a next prompt token. 
next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[next_token_index_start:next_token_index_end] + next_token_index_end = min(computed_len + query_len + 1, + len(prompt_tokens)) + next_prompt_tokens = prompt_tokens[ + next_token_index_start:next_token_index_end] return next_prompt_tokens -- Gitee From ab7d53769deac369778882035947c5fb56370ff0 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Wed, 5 Mar 2025 16:58:40 +0800 Subject: [PATCH 12/82] =?UTF-8?q?=E8=A7=84=E9=81=BFmf=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E9=9D=99=E6=80=81=E5=9B=BE=E5=90=8E=E5=A4=84=E7=90=86=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/layers/sampler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index ed864ace9..65ce69125 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -237,7 +237,7 @@ class Sampler(nn.Module): self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p - def run_forward( + def forward( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, @@ -346,13 +346,13 @@ class Sampler(nn.Module): on_device_tensors=on_device_tensors, skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) - def forward( + def __call__( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: with AsyncContext() as ctx: - return self.run_forward(logits, sampling_metadata) + return self.forward(logits, sampling_metadata) @property def _should_modify_greedy_probs_inplace(self) -> bool: @@ -514,7 +514,6 @@ def _random_sample( seq_group has do_sample=False, tuple contains ([], []) """ # Find the maximum n value of the prompt phase requests. 
- random_samples = random_samples.cpu() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: -- Gitee From 3a2730b7f9472ac78a78c7eeaf23da38482b3f32 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Wed, 5 Mar 2025 11:00:10 +0800 Subject: [PATCH 13/82] fix quant correct fix total gpu memory fix msg --- .../models/mf_models/deepseek_v3.py | 14 +++----------- .../model_executor/models/mf_models/qwen2.py | 4 ++-- vllm_mindspore/platforms/ascend.py | 15 +++++++++++++++ vllm_mindspore/utils.py | 6 ++++-- vllm_mindspore/worker/worker.py | 4 +++- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index da909c118..042a11af9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -45,7 +45,7 @@ from research.deepseek3.deepseek3 import ( from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.utils import cal_block_num +from vllm_mindspore.utils import calc_block_num import mindspore as ms from mindspore import Tensor, JitConfig, Model @@ -101,7 +101,7 @@ class DeepseekV3ForCausalLM(MsModelBase): self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) - self.mf_model_config.num_blocks = cal_block_num(self.cache_config, self.model_config, self.parallel_config) + self.mf_model_config.num_blocks = calc_block_num(self.cache_config, self.model_config, self.parallel_config) self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config @@ -125,14 +125,6 @@ class DeepseekV3ForCausalLM(MsModelBase): precision_recovery=PrecisionRecovery.NONE, act_quant_granularity=QuantGranularity.PER_TENSOR, weight_quant_granularity=QuantGranularity.PER_CHANNEL) - wo_config = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TENSOR, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) ffn_config = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, @@ -142,7 +134,7 @@ class DeepseekV3ForCausalLM(MsModelBase): act_quant_granularity=QuantGranularity.PER_TOKEN, weight_quant_granularity=QuantGranularity.PER_CHANNEL) ptq = PTQ(config=cfg, - layer_policies=OrderedDict({r'.*\.wo.*':wo_config, r'.*\.feed_forward\..*':ffn_config})) + layer_policies=OrderedDict({r'.*\.feed_forward\..*':ffn_config})) ptq.apply(self.network) ptq.convert(self.network) diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index d699ad2cd..197d562d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -44,7 +44,7 @@ from research.qwen2_5.infer.qwen2_5 import ( from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.utils import cal_block_num +from vllm_mindspore.utils import calc_block_num import mindspore as ms 
from mindspore import Tensor, JitConfig, Model @@ -99,7 +99,7 @@ class Qwen2ForCausalLM(MsModelBase): self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) # Cannot get num_gpu_blocks from cache config now, calculate one first. - self.mf_model_config.num_blocks = cal_block_num( + self.mf_model_config.num_blocks = calc_block_num( self.cache_config, self.model_config, self.parallel_config ) self.mf_model_config.block_size = self.cache_config.block_size diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index 516ba3a0e..65d31766e 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -20,6 +20,8 @@ from typing import TYPE_CHECKING, Optional import torch +import os +import mindspore as ms from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend from vllm.logger import init_logger @@ -130,6 +132,19 @@ class AscendPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + if os.getenv("ASCEND_TOTAL_MEMORY_GB"): + total_device_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) + else: + total_device_memory = 64 + logger.warning( + "Total device memory should be set by environ 'ASCEND_TOTAL_MEMORY_GB', " + "please check size by cmd(npu-smi info). " + "For now, we will try default size(64GB) which might not be correct exactly." + ) + max_device_memory_for_ms = str(total_device_memory * cache_config.gpu_memory_utilization) + 'GB' + ms.set_context(max_device_memory=max_device_memory_for_ms) + logger.info("max_device_memory for mindspore is: ", max_device_memory_for_ms) + @classmethod def verify_quantization(cls, quant: str) -> None: """ diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index c7b247b9d..2ed6fd9ef 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -328,12 +328,14 @@ def check_ready(): logger.info("Run with native model backend!") -def cal_block_num(cache_config, model_config, parallel_config): +def calc_block_num(cache_config, model_config, parallel_config): from vllm.worker.cache_engine import CacheEngine torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() - _, total_gpu_memory = torch.cuda.mem_get_info() + + total_gpu_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) if os.getenv("ASCEND_TOTAL_MEMORY_GB") else 64 + total_gpu_memory = total_gpu_memory * 1024 * 1024 * 1024 memory_can_use = total_gpu_memory * cache_config.gpu_memory_utilization model_use_memory_b = int(os.getenv("vLLM_MODEL_MEMORY_USE_GB")) * 1024 * 1024 * 1024 diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 807f9e2ff..684ee7f52 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -103,13 +103,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() - _, total_gpu_memory = torch.cuda.mem_get_info() + total_gpu_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) if os.getenv("ASCEND_TOTAL_MEMORY_GB") else 64 + total_gpu_memory = total_gpu_memory * 1024 * 1024 * 1024 if os.getenv("vLLM_MODEL_MEMORY_USE_GB"): memory_use_for_model_run = int(os.environ["vLLM_MODEL_MEMORY_USE_GB"]) * 1024 * 1024 * 1024 else: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. 
+ _, total_gpu_memory = torch.cuda.mem_get_info() with memory_profiling( baseline_memory_in_bytes=total_gpu_memory - self.init_gpu_memory, weights_memory_in_bytes=self.model_runner.model_memory_usage, -- Gitee From d72815e0b1393b363664726b0945f238de31fd14 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Mon, 3 Mar 2025 08:13:00 +0800 Subject: [PATCH 14/82] fix mla bug --- vllm_mindspore/__init__.py | 8 +++-- vllm_mindspore/config.py | 15 +++++++-- vllm_mindspore/utils.py | 9 ++++++ vllm_mindspore/worker/cache_engine.py | 45 +++++++++++++++++---------- 4 files changed, 56 insertions(+), 21 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 797425b4f..a4a1f4763 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -112,7 +112,8 @@ from vllm_mindspore.worker.cache_engine import ( ms_allocate_kv_cache, ms_swap_in, ms_swap_out, - cache_engine_init + cache_engine_init, + get_cache_block_size, ) import vllm.worker.cache_engine @@ -121,6 +122,7 @@ vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache vllm.worker.cache_engine.CacheEngine.__init__ = cache_engine_init vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out +vllm.worker.cache_engine.CacheEngine.get_cache_block_size = get_cache_block_size from vllm_mindspore.model_executor.model_loader.weight_utils import ( safetensors_weights_iterator, @@ -181,9 +183,11 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import get_head_size, _verify_quantization +from .config import get_head_size, _verify_quantization, get_num_kv_heads + vllm.config.ModelConfig.get_head_size = get_head_size vllm.config.ModelConfig._verify_quantization = _verify_quantization +vllm.config.ModelConfig.get_num_kv_heads = get_num_kv_heads from .utils import check_ready diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index b7c602322..02f79d41f 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -16,7 +16,7 @@ # limitations under the License. # ============================================================================ -from vllm_mindspore.utils import is_mindformers_model_backend +from vllm_mindspore.utils import is_mindformers_model_backend, is_use_mla def get_head_size(self) -> int: @@ -40,6 +40,17 @@ def get_head_size(self) -> int: # FIXME(woosuk): This may not be true for all models. return self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads + def _verify_quantization(self) -> None: # Donnot verify now. 
- return \ No newline at end of file + return + + +def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: + """Returns the number of KV heads per Device.""" + + if is_use_mla(self): + return 1 + + total_num_kv_heads = self.get_total_num_kv_heads() + return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index c7b247b9d..4262e2ede 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -343,3 +343,12 @@ def cal_block_num(cache_config, model_config, parallel_config): ) num_gpu_blocks = int(available_cache_memory // cache_block_size) return num_gpu_blocks + + +def is_use_mla(model_config): + if not is_mindformers_model_backend(): + return False + + return hasattr(model_config.hf_text_config, "model_type") and ( + model_config.hf_text_config.model_type in ("deepseek_v3",) + ) diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 9e9811a16..a8a16ad3b 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -21,17 +21,13 @@ from typing import List from vllm.logger import init_logger -logger = init_logger(__name__) - -from vllm_mindspore.utils import ( - MsKVCache, - get_valid_dtype, - is_mindformers_model_backend, -) +from vllm_mindspore.utils import MsKVCache, get_valid_dtype, is_use_mla, get_dtype_size import mindspore as ms from mindspore import mutable +logger = init_logger(__name__) + def create_block(shape, dtype, name=None, device=None): from mindspore.ops.function.array_func import empty as empty_tensor @@ -116,15 +112,6 @@ def cache_engine_init( else: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - if ( - is_mindformers_model_backend() - and hasattr(model_config.hf_text_config, "model_type") - and (model_config.hf_text_config.model_type in ("deepseek_v3",)) - ): - is_mla = True - else: - is_mla = False - # Get attention backend. self.attn_backend = get_attn_backend( self.head_size, @@ -132,7 +119,7 @@ def cache_engine_init( cache_config.cache_dtype, self.block_size, model_config.is_attention_free, - use_mla=is_mla, + use_mla=is_use_mla(model_config), ) # Initialize the cache. 
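The `get_cache_block_size` override added in the next hunk makes the MLA sizing explicit: `get_num_kv_heads` above returns 1 for the MLA case and the value-cache term is dropped, so a cache block only needs room for the latent key cache. A back-of-the-envelope sketch of that formula (every number below is an illustrative placeholder, not a value taken from a real config):

```python
block_size = 16            # tokens per KV-cache block
num_kv_heads = 1           # MLA branch of get_num_kv_heads
head_size = 576            # assumed latent width (e.g. kv_lora_rank + rope dim)
num_attention_layers = 61  # assumed layer count
dtype_size = 2             # bytes per element for bf16

key_cache_block = block_size * num_kv_heads * head_size
value_cache_block = 0      # is_use_mla(...) -> no separate value cache
bytes_per_block = num_attention_layers * (key_cache_block + value_cache_block) * dtype_size
print(bytes_per_block)     # 1124352 bytes for these made-up numbers
```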
@@ -140,3 +127,27 @@ def cache_engine_init( self.num_gpu_blocks, self.device_config.device_type ) self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") + + +def get_cache_block_size( + cache_config: "CacheConfig", + model_config: "ModelConfig", + parallel_config: "ParallelConfig", +) -> int: + from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType + + head_size = model_config.get_head_size() + num_heads = model_config.get_num_kv_heads(parallel_config) + num_attention_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention + ) + + key_cache_block = cache_config.block_size * num_heads * head_size + value_cache_block = key_cache_block if not is_use_mla(model_config) else 0 + total = num_attention_layers * (key_cache_block + value_cache_block) + if cache_config.cache_dtype == "auto": + dtype = model_config.dtype + else: + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + dtype_size = get_dtype_size(dtype) + return dtype_size * total -- Gitee From 399ef5da9999b9847f123375ed3489365be64f6a Mon Sep 17 00:00:00 2001 From: tronzhang Date: Mon, 10 Mar 2025 22:47:41 +0800 Subject: [PATCH 15/82] add OWNERS file --- OWNERS | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000..c49617069 --- /dev/null +++ b/OWNERS @@ -0,0 +1,11 @@ +reviewers: +- wang_shaocong +- erpim +- zhang_xue_tong + +approvers: +- tronzhang +- zichun_ye +- zlq2020 +- panshaowu +- zhaizhiqiang \ No newline at end of file -- Gitee From ce4e4594f8987b686b943bf2d842c6c27b305ffb Mon Sep 17 00:00:00 2001 From: tronzhang Date: Tue, 11 Mar 2025 11:17:23 +0800 Subject: [PATCH 16/82] add more reviewers --- OWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/OWNERS b/OWNERS index c49617069..90ee21dac 100644 --- a/OWNERS +++ b/OWNERS @@ -2,6 +2,7 @@ reviewers: - wang_shaocong - erpim - zhang_xue_tong +- tan-wei-cheng approvers: - tronzhang -- Gitee From fe97839845c9cb374a933965c12577e3858a6d40 Mon Sep 17 00:00:00 2001 From: moran Date: Wed, 12 Mar 2025 10:56:16 +0800 Subject: [PATCH 17/82] update ms package for ci --- .jenkins/test/config/dependent_packages.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 535cd9fd1..e632e9fd8 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,5 +1,5 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250305/br_infer_deepseek_os_20250305001023_4011166933d7e6230601ac1ad07bfe1a8329541d/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250307/br_infer_deepseek_os_20250307004508_4011166933d7e6230601ac1ad07bfe1a8329541d/' mindformers: 'https://repo.mindspore.cn/mindspore/mindformers/version/202503/20250303/br_infer_deepseek_os_20250303142905_569a4261552abe2984651bd31d675d76c5f51fb0_newest/' -- Gitee From b4a73004b90a49f6efe10dee3ff5d2e753cbe347 Mon Sep 17 00:00:00 2001 From: twc Date: Thu, 6 Mar 2025 17:52:02 +0800 Subject: [PATCH 18/82] deepseekv3 infer support customize parameter Parallel segmentation --- .../models/mf_models/deepseek_v3.py | 26 +- .../mf_models/deepseekv3_infer_parallelism.py | 919 ++++++++++++++++++ .../mf_models/deepseekv3_infer_save_ckpt.py | 104 ++ .../models/mf_models/model_parallelism.py | 82 ++ 4 files changed, 1123 insertions(+), 8 deletions(-) create mode 100644 
vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py create mode 100644 vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py create mode 100644 vllm_mindspore/model_executor/models/mf_models/model_parallelism.py diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 042a11af9..5327351a0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -49,6 +49,9 @@ from vllm_mindspore.utils import calc_block_num import mindspore as ms from mindspore import Tensor, JitConfig, Model +from mindspore.communication.comm_func import barrier + +from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism logger = init_logger(__name__) @@ -106,6 +109,8 @@ class DeepseekV3ForCausalLM(MsModelBase): if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + self.is_quant = hasattr(self.mf_config.model.model_config, "quantization_config") + # Initital network self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) @@ -214,14 +219,19 @@ class DeepseekV3ForCausalLM(MsModelBase): return next_tokens def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: - model = Model(self.network) - batch_size = self.mf_config.model.model_config.batch_size - seq_length = self.mf_config.model.model_config.seq_length - input_ids = np.ones(shape=tuple([batch_size, seq_length])) - infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) - transform_and_load_checkpoint( - self.mf_config, model, self.network, infer_data, do_predict=True - ) + if self.mf_config.load_ckpt_format == "ckpt": + model = Model(self.network) + batch_size = self.mf_config.model.model_config.batch_size + seq_length = self.mf_config.model.model_config.seq_length + input_ids = np.ones(shape=tuple([batch_size, seq_length])) + infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) + transform_and_load_checkpoint( + self.mf_config, model, self.network, infer_data, do_predict=True + ) + else: + model_parallelism = DeepseekInferParallelism(self.mf_config, self.network, self.is_quant) + model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) + barrier() self.network.set_dynamic_inputs() return None diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py new file mode 100644 index 000000000..671ab171c --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py @@ -0,0 +1,919 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +transform huggingface model to mindspore safetensor. 
+""" +import os +import time +import json +import numpy as np + +import mindspore as ms +from vllm_mindspore.model_executor.models.mf_models.model_parallelism import BaseModelParallelism + + +class DeepseekInferParallelism(BaseModelParallelism): + r""" + Provide DeepseekV3/R1 Quant Model infer parameter convert and parallelism. + Args: + config (DeepseekV3/R1Config): The config of DeepseekV3/R1 model. + network (InferenceDeepseekV3ForCausalLM): The network of DeepseekV3/R1. + + """ + + def __init__(self, config, network, is_quant): + super().__init__(config, network, is_quant) + + def quant_convert_weight_name(self, weight_name: str): + """replace quant net weight name""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + + weight_name = weight_name.replace('.self_attn.q_a_proj.weight', '.attention.q2l_proj._layer.weight') + weight_name = weight_name.replace('.self_attn.q_a_proj.input_scale', '.attention.q2l_proj.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.q_a_proj.input_offset', '.attention.q2l_proj.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.q_a_proj.quant_bias', + '.attention.q2l_proj._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.q_a_proj.deq_scale', + '.attention.q2l_proj._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.q_a_layernorm.weight', '.attention.lq_norm.weight') + weight_name = weight_name.replace('.self_attn.kv_a_layernorm.weight', '.attention.lkv_norm.weight') + weight_name = weight_name.replace('.self_attn.kv_b_proj.', '.attention.lkv2kv.') + + weight_name = weight_name.replace('.self_attn.q_b_proj.weight', '.attention.l2q_proj._layer.weight') + weight_name = weight_name.replace('.self_attn.q_b_proj.input_scale', '.attention.l2q_proj.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.q_b_proj.input_offset', '.attention.l2q_proj.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.q_b_proj.quant_bias', + '.attention.l2q_proj._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.q_b_proj.deq_scale', + '.attention.l2q_proj._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.weight', '.attention.kv2l._layer.weight') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.input_scale', + '.attention.kv2l.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.input_offset', + '.attention.kv2l.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.quant_bias', + '.attention.kv2l._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.deq_scale', + '.attention.kv2l._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.o_proj.weight', '.attention.wo._layer.weight') + weight_name = weight_name.replace('.self_attn.o_proj.input_scale', '.attention.wo.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.o_proj.input_offset', '.attention.wo.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.o_proj.quant_bias', '.attention.wo._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.o_proj.deq_scale', '.attention.wo._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.q_a_layernorm.bias', '.attention.l2q_proj.quant_op.beta') + weight_name = weight_name.replace('.input_layernorm.bias', 
'.attention.q2l_proj.quant_op.beta') + + # mlp is pertoken quant + weight_name = weight_name.replace('.weight_scale', '.matmul.weight_scale') + weight_name = weight_name.replace('.weight_offset', '.matmul.weight_offset') + + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1._layer.') + weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2._layer.') + weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3._layer.') + weight_name = weight_name.replace('mlp.experts.', 'feed_forward.routed_experts.ffn.') + weight_name = weight_name.replace('mlp.shared_experts.gate_proj.', 'feed_forward.shared_experts.w1._layer.') + weight_name = weight_name.replace('mlp.shared_experts.down_proj.', 'feed_forward.shared_experts.w2._layer.') + weight_name = weight_name.replace('mlp.shared_experts.up_proj.', 'feed_forward.shared_experts.w3._layer.') + weight_name = weight_name.replace('mlp.gate.weight', 'feed_forward.routed_experts.router.dense.weight') + weight_name = weight_name.replace('mlp.gate.e_score_correction_bias', + 'feed_forward.routed_experts.router.e_score_correction_bias') + weight_name = weight_name.replace('.input_layernorm.weight', '.attention_norm.weight') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + return weight_name + + def infer_trans_rope_weight(self, weight, qk_rope_head_dim): + """process rope router weight""" + w1 = weight[..., -qk_rope_head_dim::2, :] + w2 = weight[..., -qk_rope_head_dim + 1::2, :] + weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) + return weight + + def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + + parameter_dict = {} + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) + router_dense_ms_param = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.Tensor(e_score_correction_bias_ms_param, ms.bfloat16), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_list = [] + w2_list = [] + w3_list = [] + + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + + w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" + w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" + w3_scale_ms_name 
= f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + + for index in range(0, num_router_experts): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + # is_split_param=True, split_axis=0) + + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + + w1_ms_stack_param = np.stack(w1_list, axis=0).transpose(0, 2, 1) + w2_ms_stack_param = np.stack(w2_list, axis=0).transpose(0, 2, 1) + w3_ms_stack_param = np.stack(w3_list, axis=0).transpose(0, 2, 1) + + w1_scale_ms_stack_param = np.stack(w1_scale_list, axis=0) + w2_scale_ms_stack_param = np.stack(w2_scale_list, axis=0) + w3_scale_ms_stack_param = np.stack(w3_scale_list, axis=0) + + if ffn_concat: + # w_gate_hidden + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2), + dtype=ms.int8) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + # w_scale_gate_hidden + w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_param = ms.Tensor( + np.concatenate([w1_scale_ms_stack_param, w3_scale_ms_stack_param], axis=1), dtype=ms.bfloat16) + parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_stack_param, ms.int8), name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_stack_param, ms.int8), name=w3_ms_name, + requires_grad=False) + + # w1_scale w3_scale + parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_stack_param, ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_stack_param, ms.bfloat16), + 
name=w3_ms_name, + requires_grad=False) + + parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_stack_param, ms.int8), name=w2_ms_name, + requires_grad=False) + + parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_stack_param, ms.bfloat16), + name=w2_scale_ms_name, + requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant process moe shared expert ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + parameter_dict = {} + w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + + w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) + w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w3_ms_name = self.quant_convert_weight_name(w3_hf_name) + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" + w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) + w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.int8) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + + w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_param = ms.Tensor( + np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0), + dtype=ms.bfloat16) + + parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.int8), + name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.int8), + name=w3_ms_name, + requires_grad=False) + + parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_param, ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_param, ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + parameter_dict[w2_ms_name] = 
ms.Parameter(ms.Tensor(w2_ms_param, ms.int8), + name=w2_ms_name, + requires_grad=False) + + parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_param, ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_quant_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + parameter_dict = {} + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) + # shape:[7168,1] + w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + # is_split_param=True, + # split_axis=0) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.quant_convert_weight_name(w3_hf_name) + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" + w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) + w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), + dtype=ms.int8) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + + w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_param = ms.Tensor( + np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0), + dtype=ms.bfloat16) + parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.int8), + name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.int8), + name=w3_ms_name, + requires_grad=False) + + parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_param, ms.bfloat16), + name=w1_scale_ms_name, + requires_grad=False) + parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_param, ms.bfloat16), + name=w3_scale_ms_name, + requires_grad=False) + + 
parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.int8), + name=w2_ms_name, + requires_grad=False) + + parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_param, ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): + """convert weight not in model""" + parameter_dict = {} + embed_tokens_hf_name = "model.embed_tokens.weight" + embed_tokens_ms_name = self.quant_convert_weight_name(embed_tokens_hf_name) + np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) + + norm_hf_name = "model.norm.weight" + norm_ms_name = self.quant_convert_weight_name(norm_hf_name) + np_data = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, + requires_grad=False) + + lm_head_hf_name = "lm_head.weight" + lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) + if not self.config.parallel_config.vocab_emb_dp: + np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, + requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def quant_special_attention_weight(self, layer_id, src_hf_dir, hf_weight_map, name, is_trans_rope_weigh=False, + is_split_param=False): + # q_a_proj->q2l_proj + # kv_a_proj_with_mqa->kv2l + # q_a_layernorm->lq_norm + # o_proj->wo + + # input_scale, input_zp no split + parameter_dict = {} + input_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_scale" + input_scale_ms_name = self.quant_convert_weight_name(input_scale_hf_name) + input_scale_ms_param = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[input_scale_ms_name] = ms.Parameter(ms.Tensor(input_scale_ms_param, ms.bfloat16), + name=input_scale_ms_name, requires_grad=False) + + input_zp_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_offset" + input_zp_ms_name = self.quant_convert_weight_name(input_zp_hf_name) + input_zp_ms_param = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[input_zp_ms_name] = ms.Parameter(ms.Tensor(input_zp_ms_param, ms.int8), + name=input_zp_ms_name, + requires_grad=False) + + if not is_trans_rope_weigh: + quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + + dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".deq_scale" + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + else: + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + num_heads = self.config.model.model_config.num_heads + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + + dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + + if name == "q_b_proj": + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads, rope_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim) + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) + + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads, rope_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim) + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) + + elif name == "kv_a_proj_with_mqa": + quant_bias_ms_param = quant_bias_ms_param.reshape(kv_head_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim).reshape(-1) + + dequant_scale_ms_param = dequant_scale_ms_param.reshape(kv_head_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim).reshape( + -1) + + if is_split_param: + quant_bias_ms_param = self.split_weight_by_rank(quant_bias_ms_param, split_axis=0) + dequant_scale_ms_param = self.split_weight_by_rank(dequant_scale_ms_param, split_axis=0) + + parameter_dict[quant_bias_ms_name] = ms.Parameter(ms.Tensor(quant_bias_ms_param, ms.int32), + name=quant_bias_ms_name, requires_grad=False) + parameter_dict[dequant_scale_ms_name] = ms.Parameter(ms.Tensor(dequant_scale_ms_param, ms.float32), + name=dequant_scale_ms_name, requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_quant_bias_weight(self, src_hf_dir, layer_id, hf_weight_map): + # quant_op.beta + parameter_dict = {} + q2l_proj_bias_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" + q2l_proj_bias_ms_name = self.quant_convert_weight_name(q2l_proj_bias_hf_name) + q2l_proj_bias_ms_param = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) + + kv2l_bias_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" + kv2l_bias_ms_param = q2l_proj_bias_ms_param.copy() + + l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" + l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) + l2q_proj_bias_ms_param = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) + + parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_bias_ms_param, ms.bfloat16), + 
name=q2l_proj_bias_ms_name, requires_grad=False) + parameter_dict[kv2l_bias_ms_name] = ms.Parameter(ms.Tensor(kv2l_bias_ms_param, ms.bfloat16), + name=kv2l_bias_ms_name, + requires_grad=False) + parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_bias_ms_param, ms.bfloat16), + name=l2q_proj_bias_ms_name, + requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant process attention weight""" + start_time = time.time() + parameter_dict = {} + num_heads = self.config.model.model_config.num_heads + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + v_head_dim = self.config.model.model_config.v_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + # q_a_proj->q2l_proj + q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) + q2l_proj_ms_param = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_ms_param, ms.int8), + name=q2l_proj_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") + + # kv_a_proj_with_mqa->kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) + kv2l_ms_param = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.int8), name=kv2l_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", + is_trans_rope_weigh=True) + + # q_a_layernorm->lq_norm + lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" + lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) + lq_norm_ms_param = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), + name=lq_norm_ms_name, + requires_grad=False) + + # q_b_proj->l2q_proj + l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" + l2q_proj_ms_name = self.quant_convert_weight_name(l2q_proj_hf_name) + l2q_proj_ms_param = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) + l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) + l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) + parameter_dict[l2q_proj_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_ms_param, ms.int8), + name=l2q_proj_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_b_proj", is_trans_rope_weigh=True, + is_split_param=True) + + # kv_a_layernorm->lkv_norm + lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" + 
lkv_norm_ms_name = self.quant_convert_weight_name(lkv_norm_hf_name) + lkv_norm_ms_param = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) + + # kv_b_proj->lkv2kv + lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" + lkv2kv_ms_name = self.quant_convert_weight_name(lkv2kv_hf_name) + lkv2kv_ms_param = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_head = qk_nope_head_dim + v_head_dim + lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) + value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] + + # value_k_nope + value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) + value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) + name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") + parameter_dict[name_k_nope] = ms.Parameter(ms.Tensor(value_k_nope, ms.bfloat16), name=name_k_nope, + requires_grad=False) + # value_v + value_v = value_v.reshape(-1, value_v.shape[-1]) + value_v = self.split_weight_by_rank(value_v, split_axis=0) + name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") + parameter_dict[name_v] = ms.Parameter(ms.Tensor(value_v, ms.bfloat16), name=name_v, + requires_grad=False) + + # o_proj->wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.quant_convert_weight_name(wo_hf_name) + wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) + parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.int8), name=wo_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") + + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant net convert layer weight""" + print(f"..... start convert layer {layer_id} .......", flush=True) + + if layer_id >= 3: + self.infer_quant_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_quant_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + + self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + print(f"..... 
end convert layer {layer_id} .......", flush=True) + + def convert_weight_name(self, weight_name: str): + """replace weight name""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + weight_name = weight_name.replace('.self_attn.q_a_proj.', '.attention.q2l_proj.') + weight_name = weight_name.replace('.self_attn.q_a_layernorm.', '.attention.lq_norm.') + weight_name = weight_name.replace('.self_attn.q_b_proj.', '.attention.l2q_proj.') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.', '.attention.kv2l.') + weight_name = weight_name.replace('.self_attn.kv_a_layernorm.', '.attention.lkv_norm.') + weight_name = weight_name.replace('.self_attn.kv_b_proj.', '.attention.lkv2kv.') + weight_name = weight_name.replace('.self_attn.o_proj.', '.attention.wo.') + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') + weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') + weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') + weight_name = weight_name.replace('mlp.experts.', 'feed_forward.routed_experts.ffn.') + weight_name = weight_name.replace('mlp.shared_experts.gate_proj.', 'feed_forward.shared_experts.w1.') + weight_name = weight_name.replace('mlp.shared_experts.down_proj.', 'feed_forward.shared_experts.w2.') + weight_name = weight_name.replace('mlp.shared_experts.up_proj.', 'feed_forward.shared_experts.w3.') + weight_name = weight_name.replace('mlp.gate.weight', 'feed_forward.routed_experts.router.dense.weight') + weight_name = weight_name.replace('mlp.gate.e_score_correction_bias', + 'feed_forward.routed_experts.router.e_score_correction_bias') + weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + return weight_name + + def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + parameter_dict = {} + + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.convert_weight_name(router_dense_hf_name) + router_dense_ms_param = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.Tensor(e_score_correction_bias_ms_param, ms.bfloat16), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_list = [] + w2_list = [] + w3_list = [] + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1.weight" + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2.weight" + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" + for index in range(0, num_router_experts): + w1_hf_name = 
f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_ms_stack_param = np.stack(w1_list, axis=0).transpose(0, 2, 1) + w2_ms_stack_param = np.stack(w2_list, axis=0).transpose(0, 2, 1) + w3_ms_stack_param = np.stack(w3_list, axis=0).transpose(0, 2, 1) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2), + dtype=ms.bfloat16) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_stack_param, ms.bfloat16), name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_stack_param, ms.bfloat16), name=w3_ms_name, + requires_grad=False) + + parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_stack_param, ms.bfloat16), name=w2_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process moe shared expert ffn weight""" + ffn_concat = self.config.model.model_config.ffn_concat + parameter_dict = {} + w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" + w1_ms_name = self.convert_weight_name(w1_hf_name) + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + + w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.bfloat16) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, + requires_grad=False) + parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat 
+ parameter_dict = {} + + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name = self.convert_weight_name(w1_hf_name) + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" + w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.bfloat16) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, + requires_grad=False) + + parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + num_heads = self.config.model.model_config.num_heads + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + v_head_dim = self.config.model.model_config.v_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + parameter_dict = {} + # q2l_proj + q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) + q_a_proj_ms_param = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q_a_proj_ms_param, ms.bfloat16), + name=q2l_proj_ms_name, + requires_grad=False) + + # kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.convert_weight_name(kv2l_hf_name) + kv2l_ms_param = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.bfloat16), name=kv2l_ms_name, + requires_grad=False) + + # lq_norm + lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" + lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) + lq_norm_ms_param = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), name=lq_norm_ms_name, + requires_grad=False) + + # l2q_proj + l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" + l2q_proj_ms_name = self.convert_weight_name(l2q_proj_hf_name) + l2q_proj_ms_param = 
self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) + l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) + l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) + parameter_dict[l2q_proj_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_ms_param, ms.bfloat16), + name=l2q_proj_ms_name, + requires_grad=False) + + # lkv_norm + lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" + lkv_norm_ms_name = self.convert_weight_name(lkv_norm_hf_name) + lkv_norm_ms_param = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) + + # lkv2kv + lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" + lkv2kv_ms_name = self.convert_weight_name(lkv2kv_hf_name) + lkv2kv_ms_param = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_head = qk_nope_head_dim + v_head_dim + lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) + value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] + + # value_k_nope + value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) + value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) + name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") + parameter_dict[name_k_nope] = ms.Parameter(ms.Tensor(value_k_nope, ms.bfloat16), name=name_k_nope, + requires_grad=False) + # value_v + value_v = value_v.reshape(-1, value_v.shape[-1]) + value_v = self.split_weight_by_rank(value_v, split_axis=0) + name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") + parameter_dict[name_v] = ms.Parameter(ms.Tensor(value_v, ms.bfloat16), name=name_v, + requires_grad=False) + + # wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.convert_weight_name(wo_hf_name) + wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) + parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + parameter_dict = {} + # attention_norm + attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" + attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) + attention_norm_ms_param = self.get_safetensor_from_file(attention_norm_hf_name, + src_hf_dir, + hf_weight_map) + parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) + + # ffn_norm + ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" + ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) + ffn_norm_ms_param = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[ffn_norm_ms_name] = ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) + + _, ckpt_not_load 
= ms.load_param_into_net(self.network, parameter_dict) + + def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer convert layer weight""" + print(f"..... start convert layer {layer_id} .......", flush=True) + + if layer_id >= 3: + self.infer_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + + self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + print(f"..... end convert layer {layer_id} .......", flush=True) + + def infer_convert_and_parallelism(self, src_hf_dir): + """convert inference model weight """ + param_json_path = "" + for file in os.listdir(src_hf_dir): + if file.endswith('index.json'): + param_json_path = os.path.join(src_hf_dir, file) + break + if not param_json_path: + raise ValueError("param_json_path:{} is error.".format(param_json_path)) + print("param_json_path is {}".format(param_json_path)) + + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + + self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + num_layers = self.config.model.model_config.num_layers + for layer_id in range(num_layers): + if self.is_quant: + self.infer_quant_net_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py new file mode 100644 index 000000000..c5e72c781 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py @@ -0,0 +1,104 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""Infer save ckpt by safetensor.""" +import argparse +import os +from collections import OrderedDict + +import mindspore as ms +from mindspore import dtype as msdtype +from mindspore.communication.management import get_rank +from mindformers.core.parallel_config import build_parallel_config +from mindformers.tools.logger import logger +from mindformers import MindFormerConfig +from mindformers import build_context +from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParallelism + +from research.deepseek3.deepseek3_config import DeepseekV3Config +from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM + +# for example +# bash scripts/msrun_launcher.sh "python ./infer_save_ckpt_from_safetensor.py +# --config /path/to/predict_deepseek_r1_671b.yaml +# --save_ckpt_path /path/to/save_ckpt_path +# --load_checkpoint /path/to/safetensor_path " 4 8555 "output/deepseek_msrun_log" "False" 7200 + +def create_ptq(): + '''create_ptq''' + from research.deepseek3.deepseek3_model_infer import DeepseekV3DecodeLayer + from mindspore_gs.ptq import PTQ + from mindspore_gs.common import BackendTarget + from mindspore_gs.ptq import PTQConfig, PTQMode, OutliersSuppressionType, PrecisionRecovery, QuantGranularity + cfg = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, + opname_blacklist=['lkv2kv', 'lm_head'], precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TENSOR, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + ffn_config = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + ptq = PTQ(config=cfg, layer_policies=OrderedDict({r'.*\.feed_forward\..*': ffn_config})) + ptq.decoder_layers.append(DeepseekV3DecodeLayer) + return ptq + + +def main(config_path, load_checkpoint, save_ckpt_dir): + # set model config + config = MindFormerConfig(config_path) + config.load_checkpoint = load_checkpoint + + build_context(config) + build_parallel_config(config) + model_config = config.model.model_config + model_config.parallel_config = config.parallel_config + model_config.moe_config = config.moe_config + model_config = DeepseekV3Config(**model_config) + + # build model from config + network = InferenceDeepseekV3ForCausalLM(model_config) + + is_quant = hasattr(config.model.model_config, "quantization_config") + + if is_quant: + ptq = create_ptq() + ptq.apply(network) + ptq.convert(network) + ptq.summary(network) + # load checkpoint + if config.load_checkpoint: + logger.info("----------------Transform and load checkpoint----------------") + model_parallelism = DeepseekInferParallelism(config, network, is_quant) + model_parallelism.infer_convert_and_parallelism(config.load_checkpoint) + + rank_id = get_rank() + os.makedirs(os.path.join(save_ckpt_dir, "rank_" + str(rank_id)), exist_ok=True) + + save_ckpt_path = os.path.join(save_ckpt_dir, "rank_" + str(rank_id), "checkpoint_" + str(rank_id)) + ms.save_checkpoint(network.parameters_dict(), save_ckpt_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument('--config_path', default='predict_llama2_7b.yaml', type=str, + help='model config file path.') + parser.add_argument('--load_checkpoint', type=str, + help='load model checkpoint path or directory.') + parser.add_argument('--save_ckpt_dir', type=str, + help='save ckpt path.') + args = parser.parse_args() + main(args.config_path, args.load_checkpoint, args.save_ckpt_dir) diff --git a/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py new file mode 100644 index 000000000..a063cab96 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py @@ -0,0 +1,82 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +transform huggingface safetensor. +""" +from safetensors import safe_open +from mindspore.communication.management import get_rank, get_group_size + + +class BaseModelParallelism: + r""" + Provide Infer model parameter convert and parallelism. + Args: + config (MF Config): The config of Infer model. + network (InferenceModelForCausalLM): The network of infer model. + + """ + + def __init__(self, config, network, is_quant): + self.config = config + self.network = network + self.is_quant = is_quant + + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + tp_group_size = get_group_size() + rank_id = get_rank() + safetensor_file = hf_weight_map[hf_param_name] + with safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: + if not is_split_param: + np_data = sf_file.get_tensor(hf_param_name) + return np_data + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // tp_group_size + start = rank_id * split_size + stop = (rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // tp_group_size + start = rank_id * split_size + stop = (rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data + + def split_weight_by_rank(self, weight, split_axis=0): + tp_group_size = get_group_size() + rank_id = get_rank() + shape = weight.shape + if split_axis == 0: + split_size = shape[0] // tp_group_size + start = rank_id * split_size + stop = (rank_id + 1) * split_size + split_data = weight[start:stop] + elif split_axis == 1: + split_size = shape[1] // tp_group_size + start = rank_id * split_size + stop = (rank_id + 1) * split_size + split_data = weight[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data + + def infer_convert_and_parallelism(self, src_hf_dir): + """ infer convert and parallelism """ + raise NotImplementedError("infer_convert_and_parallelism method is not implemented.") -- 
Gitee From 8ff075c1ec58e0332aa4751303e9937a130bdfa7 Mon Sep 17 00:00:00 2001 From: twc Date: Sat, 8 Mar 2025 17:22:35 +0800 Subject: [PATCH 19/82] fix bug --- .../model_executor/models/mf_models/deepseek_v3.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 5327351a0..d4a492b92 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -109,13 +109,13 @@ class DeepseekV3ForCausalLM(MsModelBase): if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config - self.is_quant = hasattr(self.mf_config.model.model_config, "quantization_config") - + self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and + self.mf_model_config.quantization_config) # Initital network self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config: + if self.is_quant: from mindspore_gs.ptq import PTQ from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity from mindspore_gs.common import BackendTarget @@ -231,7 +231,6 @@ class DeepseekV3ForCausalLM(MsModelBase): else: model_parallelism = DeepseekInferParallelism(self.mf_config, self.network, self.is_quant) model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) - barrier() self.network.set_dynamic_inputs() return None -- Gitee From 2dd5af6cc6edd24ad1d466e9e8fb8761eb2af3f3 Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 10 Mar 2025 15:25:42 +0800 Subject: [PATCH 20/82] deepseek bug fix e_score_correction_bias dtype fp32 --- .../models/mf_models/deepseekv3_infer_parallelism.py | 4 ++-- .../models/mf_models/deepseekv3_infer_save_ckpt.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py index 671ab171c..6a713f836 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py @@ -125,7 +125,7 @@ class DeepseekInferParallelism(BaseModelParallelism): e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, hf_weight_map) parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.Tensor(e_score_correction_bias_ms_param, ms.bfloat16), + ms.Tensor(e_score_correction_bias_ms_param, ms.float32), name=e_score_correction_bias_ms_name, requires_grad=False) w1_list = [] @@ -659,7 +659,7 @@ class DeepseekInferParallelism(BaseModelParallelism): e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, hf_weight_map) parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.Tensor(e_score_correction_bias_ms_param, ms.bfloat16), + ms.Tensor(e_score_correction_bias_ms_param, ms.float32), name=e_score_correction_bias_ms_name, requires_grad=False) w1_list = [] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py index c5e72c781..4b781a8c1 100644 --- 
a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py @@ -85,10 +85,10 @@ def main(config_path, load_checkpoint, save_ckpt_dir): model_parallelism = DeepseekInferParallelism(config, network, is_quant) model_parallelism.infer_convert_and_parallelism(config.load_checkpoint) - rank_id = get_rank() - os.makedirs(os.path.join(save_ckpt_dir, "rank_" + str(rank_id)), exist_ok=True) + rank_id = str(get_rank()) + os.makedirs(os.path.join(save_ckpt_dir, "rank_" + rank_id), exist_ok=True) - save_ckpt_path = os.path.join(save_ckpt_dir, "rank_" + str(rank_id), "checkpoint_" + str(rank_id)) + save_ckpt_path = os.path.join(save_ckpt_dir, "rank_" + rank_id, "checkpoint_" + rank_id + ".ckpt") ms.save_checkpoint(network.parameters_dict(), save_ckpt_path) -- Gitee From a46087169f7636441758815160b73805c83d08ad Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 10 Mar 2025 15:25:42 +0800 Subject: [PATCH 21/82] qwen infer support customize parameter Parallel segmentation --- .../model_executor/models/mf_models/qwen2.py | 23 +- .../mf_models/qwen2_infer_parallelism.py | 259 ++++++++++++++++++ 2 files changed, 265 insertions(+), 17 deletions(-) create mode 100644 vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 197d562d0..c0a959d0d 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -37,7 +37,6 @@ from mindformers.core.parallel_config import build_parallel_config from mindformers.models.llama import LlamaConfig as LlamaConfig_MF from mindformers.trainer import BaseTrainer -from mindformers.tools.utils import set_output_path, set_strategy_save_path from research.qwen2_5.infer.qwen2_5 import ( ParallelQwenForCausalLM as ParallelQwenForCausalLM_MF, ) @@ -48,7 +47,7 @@ from vllm_mindspore.utils import calc_block_num import mindspore as ms from mindspore import Tensor, JitConfig, Model -from mindformers.trainer.utils import transform_and_load_checkpoint +from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism logger = init_logger(__name__) @@ -106,6 +105,9 @@ class Qwen2ForCausalLM(MsModelBase): if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + # qwen qkv concat will support in next version + self.mf_model_config.qkv_concat = False + self.mf_config.model.model_config.qkv_concat = False # Initial network self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) self.network._jit_config_dict = JitConfig( @@ -113,13 +115,6 @@ class Qwen2ForCausalLM(MsModelBase): ).jit_config_dict self.mf_config.load_checkpoint = self.get_model_path() - set_output_path(self.mf_config.output_dir) - set_strategy_save_path(self.mf_config.parallel) - # update safetensor path - ms_safetensors_path = BaseTrainer._get_load_path_after_hf_convert( - self.mf_config, self.network - ) - self.mf_config.load_checkpoint = ms_safetensors_path self.mf_kvcaches_init = False self.logits = None @@ -196,14 +191,8 @@ class Qwen2ForCausalLM(MsModelBase): return next_tokens def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: - model = Model(self.network) - batch_size = self.mf_config.model.model_config.batch_size - seq_length = self.mf_config.model.model_config.seq_length - input_ids = 
np.ones(shape=tuple([batch_size, seq_length])) - infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) - transform_and_load_checkpoint( - self.mf_config, model, self.network, infer_data, do_predict=True - ) + model_parallelism = Qwen2InferParallelism(self.mf_config, self.network, False) + model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py new file mode 100644 index 000000000..05bd499bd --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py @@ -0,0 +1,259 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +transform huggingface model to mindspore safetensor. +""" +import os +import json +import numpy as np +from safetensors import safe_open + +import mindspore as ms +from vllm_mindspore.model_executor.models.mf_models.model_parallelism import BaseModelParallelism + + +class Qwen2InferParallelism(BaseModelParallelism): + r""" + Provide Qwen2 Model infer parameter convert and parallelism. + Args: + config (Qwen2Config): The config of Qwen2 model. + network (InferenceQwen2ForCausalLM): The network of Qwen2. 
+ + """ + + def __init__(self, config, network, is_quant): + super().__init__(config, network, is_quant) + + def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): + """convert weight not in model""" + parameter_dict = {} + embed_tokens_hf_name = "model.embed_tokens.weight" + embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) + if self.config.parallel_config.vocab_emb_dp: + np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + else: + np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) + + norm_hf_name = "model.norm.weight" + norm_ms_name = self.convert_weight_name(norm_hf_name) + np_data = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, + requires_grad=False) + + lm_head_hf_name = "lm_head.weight" + lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) + if not self.config.model.model_config.tie_word_embeddings: + if not self.config.parallel_config.vocab_emb_dp: + np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, + requires_grad=False) + + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def convert_weight_name(self, weight_name: str): + """replace weight name""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + weight_name = weight_name.replace('self_attn.q_proj.', 'attention.wq.') + weight_name = weight_name.replace('self_attn.k_proj.', 'attention.wk.') + weight_name = weight_name.replace('self_attn.v_proj.', 'attention.wv.') + weight_name = weight_name.replace('self_attn.o_proj.', 'attention.wo.') + + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') + weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') + weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') + weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + return weight_name + + def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.qkv_concat + parameter_dict = {} + + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name = self.convert_weight_name(w1_hf_name) + w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, 
is_split_param=True, + split_axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" + w_gate_hidden_param = np.concatenate((w1_ms_param, w3_ms_param), axis=0) + parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + else: + parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, + requires_grad=False) + parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, + requires_grad=False) + + parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + qkv_concat = self.config.model.model_config.qkv_concat + parameter_dict = {} + # wq + wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" + wq_ms_name = self.convert_weight_name(wq_hf_name) + wq_ms_param = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wq bias + wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" + wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) + wq_bias_ms_param = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + # wk + wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" + wk_ms_name = self.convert_weight_name(wk_hf_name) + wk_ms_param = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wk bias + wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" + wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) + wk_bias_ms_param = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + # wv + wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" + wv_ms_name = self.convert_weight_name(wv_hf_name) + wv_ms_param = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wv bias + wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" + wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) + wv_bias_ms_param = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + if qkv_concat: + w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" + w_qkv_param = np.concatenate((wq_ms_param, wk_ms_param, wv_ms_param), axis=0) + w_qkv_param = ms.Tensor(w_qkv_param, dtype=ms.bfloat16) + parameter_dict[w_qkv_name] = ms.Parameter(w_qkv_param, name=w_qkv_name, requires_grad=False) + + w_qkv_bias_name = f"model.layers.{layer_id}.attention.w_qkv.bias" + w_qkv_bias_param = np.concatenate((wq_bias_ms_param, wk_bias_ms_param, wv_bias_ms_param), axis=0) + w_qkv_bias_param = ms.Tensor(w_qkv_bias_param, dtype=ms.bfloat16) + parameter_dict[w_qkv_bias_name] = ms.Parameter(w_qkv_bias_param, name=w_qkv_bias_name, requires_grad=False) + else: + parameter_dict[wq_ms_name] = ms.Parameter(ms.Tensor(wq_ms_param, ms.bfloat16), name=wq_ms_name, + requires_grad=False) + parameter_dict[wk_ms_name] = ms.Parameter(ms.Tensor(wk_ms_param, ms.bfloat16), name=wk_ms_name, + requires_grad=False) + parameter_dict[wv_ms_name] = ms.Parameter(ms.Tensor(wv_ms_param, 
ms.bfloat16), name=wv_ms_name, + requires_grad=False) + + parameter_dict[wq_bias_ms_name] = ms.Parameter(ms.Tensor(wq_bias_ms_param, ms.bfloat16), + name=wq_bias_ms_name, + requires_grad=False) + parameter_dict[wk_bias_ms_name] = ms.Parameter(ms.Tensor(wk_bias_ms_param, ms.bfloat16), + name=wk_bias_ms_name, + requires_grad=False) + parameter_dict[wv_bias_ms_name] = ms.Parameter(ms.Tensor(wv_bias_ms_param, ms.bfloat16), + name=wv_bias_ms_name, + requires_grad=False) + + # wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.convert_weight_name(wo_hf_name) + wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, + requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + parameter_dict = {} + # attention_norm + attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" + attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) + attention_norm_ms_param = self.get_safetensor_from_file(attention_norm_hf_name, + src_hf_dir, + hf_weight_map) + parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) + + # ffn_norm + ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" + ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) + ffn_norm_ms_param = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) + parameter_dict[ffn_norm_ms_name] = ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) + + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer convert layer weight""" + print(f"..... start convert layer {layer_id} .......", flush=True) + + self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + print(f"..... 
end convert layer {layer_id} .......", flush=True) + + def infer_convert_and_parallelism(self, src_hf_dir): + """convert inference model weight """ + param_json_path = "" + for file in os.listdir(src_hf_dir): + if file.endswith('index.json'): + param_json_path = os.path.join(src_hf_dir, file) + break + print("param_json_path is {}".format(param_json_path)) + + hf_weight_map = {} + if os.path.exists(param_json_path): + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + else: + # only one safetensor, create a hf_weight_map + safetensor_file = "model.safetensors" + with safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: + all_keys = sf_file.keys() + for key in all_keys: + hf_weight_map[str(key).strip()] = safetensor_file + + self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + num_layers = self.config.model.model_config.num_layers + for layer_id in range(num_layers): + self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) -- Gitee From 2f2078081840acd06192933dc8bff1e9399554a9 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Mon, 10 Mar 2025 20:29:28 +0800 Subject: [PATCH 22/82] =?UTF-8?q?master=E6=B7=BB=E5=8A=A0sampler=20st?= =?UTF-8?q?=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/st/python/test_sampler.py | 771 +++++++++++++++++++++++++++ vllm_mindspore/tests/test_sampler.py | 185 ------- 2 files changed, 771 insertions(+), 185 deletions(-) create mode 100644 tests/st/python/test_sampler.py delete mode 100644 vllm_mindspore/tests/test_sampler.py diff --git a/tests/st/python/test_sampler.py b/tests/st/python/test_sampler.py new file mode 100644 index 000000000..d31c10161 --- /dev/null +++ b/tests/st/python/test_sampler.py @@ -0,0 +1,771 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import vllm_mindspore +import itertools +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from unittest.mock import Mock, patch +from mindspore import mint + +import pytest +import torch +from transformers import GenerationConfig, GenerationMixin + +import vllm.envs as envs + +from vllm_mindspore.model_executor.layers.sampler import Sampler +from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_random_seed +from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.utils import Counter, is_pin_memory_available + +class MockLogitsSampler(Sampler): + + def __init__(self, fake_logits: torch.Tensor): + super().__init__() + self.fake_logits = fake_logits + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + +def _prepare_test( + batch_size: int +) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: + input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) + fake_logits = torch.full((batch_size, VOCAB_SIZE), + 1e-2, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + return input_tensor, fake_logits, sampler + + +VOCAB_SIZE = 32000 +RANDOM_SEEDS = list(range(128)) +CUDA_DEVICES = 'cuda' + + +def _do_sample( + batch_size: int, + input_tensor: torch.Tensor, + sampler: MockLogitsSampler, + sampling_params: SamplingParams, + device: str, +): + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) + + +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_greedy(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams(temperature=0) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + expected = torch.argmax(fake_logits, dim=-1) + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == expected[i].item() + + +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") 
+@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed_deterministic(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + first_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + second_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert first_sampler_output == second_sampler_output + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_min_tokens_penalty(seed: int, device: str): + seq_id_counter = Counter(start=random.randint(0, 100)) + set_random_seed(seed) + torch.set_default_device(device) + + def create_sampling_params(min_tokens, + eos_token_id=0, + *, + stop_token_ids: Optional[List[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams( + min_tokens=min_tokens, + max_tokens=9999, # keep higher than max of min_tokens + stop_token_ids=stop_token_ids, + # requesting prompt_logprobs changes the structure of `logits` + prompt_logprobs=prompt_logprobs, + ) + sampling_params.all_stop_token_ids.add(eos_token_id) + return sampling_params + + def create_sequence_data(num_input=3, num_generated=0): + seq_data = SequenceData.from_seqs( + random.choices(range(0, VOCAB_SIZE), k=num_input)) + if num_generated > 0: + seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), + k=num_generated) + return seq_data + + def generate_test_case(): + # generate multiple seq groups but limit total batch size + batch_size = random.randint(1, 128) + + expected_penalization = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] + # 20% chance to generate seq group metadata list with all prompts + is_prompt = random.random() < 0.2 + while batch_size > 0: + num_seqs = 1 if is_prompt else random.randint(1, batch_size) + + eos_token_id = random.randint(0, VOCAB_SIZE - 1) + min_tokens = random.randint(0, 50) + num_stop_tokens = random.randint(0, 8) + if num_stop_tokens > 0: + stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), + k=num_stop_tokens) + else: + stop_token_ids = None + + sampling_params = create_sampling_params( + min_tokens=min_tokens, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids) + + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] + for _ in range(num_seqs): + num_input = random.randint(1, 100) + num_generated = 0 if is_prompt 
else random.randint(1, 100) + seq_data[next(seq_id_counter)] = create_sequence_data( + num_input=num_input, num_generated=num_generated) + seq_group_penalization.append(num_generated < min_tokens) + + expected_penalization.extend(seq_group_penalization) + sequence_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{batch_size}", + is_prompt=is_prompt, + seq_data=seq_data, + sampling_params=sampling_params, + block_tables={}, + )) + batch_size -= num_seqs + + return { + "expected_penalization": expected_penalization, + "seq_group_metadata_list": sequence_metadata_list, + } + + # define some explicit test cases for edge case behavior + prompt_without_penalization = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(0), + block_tables={}, + ), + ] + } + + prompt_with_penalization = { + "expected_penalization": [True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(1), + block_tables={}, + ), + ] + } + + prompt_with_penalization_and_prompt_logprobs = { + "expected_penalization": [False, False, True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=3), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + ] + } + + stop_penalizing_after_min_tokens = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + }, + sampling_params=create_sampling_params(1), + block_tables={}, + ) + ] + } + + stop_token_ids = [42, 99, 42, 0] # intentional duplication + prompt_combination = { + "expected_penalization": [False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_2", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=2), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_3", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params( + 0, stop_token_ids=stop_token_ids), + block_tables={}, + ) + ] + } + + stop_token_ids = [1, 999, 37, 37] # intentional duplication + decode_combination = { + "expected_penalization": [True, False, False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + create_sequence_data(num_generated=100), + }, + sampling_params=create_sampling_params( + 2, stop_token_ids=stop_token_ids), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_2", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=20), + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + create_sequence_data(num_generated=10), + }, + sampling_params=create_sampling_params( + 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), + block_tables={}, + ), + ] + } + + if seed == 0: 
+ test_cases = [ + prompt_without_penalization, + prompt_with_penalization, + prompt_with_penalization_and_prompt_logprobs, + stop_penalizing_after_min_tokens, + prompt_combination, + decode_combination, + ] + else: + test_cases = [generate_test_case()] + + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): + assert expected_penalization, \ + "Invalid test case, need expected_penalization" + assert seq_group_metadata_list, \ + "Invalid test case, need seq_group_metadata_list" + + batch_size = 0 + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] + for sgm in seq_group_metadata_list: + sampling_params = sgm.sampling_params + + num_rows = len(sgm.seq_data) + if sgm.is_prompt: + # a prompt seq_group has only one sequence + seq_data = next(iter(sgm.seq_data.values())) + prompt_len = seq_data.get_prompt_len() + seq_lens.append(prompt_len) + + assert sgm.sampling_params is not None + if sgm.sampling_params.prompt_logprobs: + # with prompt_logprobs each token in the prompt has a row in + # logits + num_rows = prompt_len + + batch_size += num_rows + sampling_params_per_row.extend( + itertools.repeat(sampling_params, num_rows)) + + assert len( + expected_penalization + ) == batch_size, \ + ("Invalid test case, expected_penalization does not match computed" + "batch size") + + _, fake_logits, sampler = _prepare_test(batch_size) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens=seq_lens if seq_lens else None, + query_lens=seq_lens if seq_lens else [1] * batch_size, + device=device, + pin_memory=is_pin_memory_available()) + # the logits tensor is modified in-place by the sampler + _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + for logits_idx, (should_penalize, sampling_params) in enumerate( + zip(expected_penalization, sampling_params_per_row)): + + tokens_to_check = sampling_params.all_stop_token_ids + + if should_penalize: + for token_id in tokens_to_check: + assert fake_logits[logits_idx, token_id] == -float( + 'inf' + ), f"Expected token {token_id} for logits row {logits_idx}" + " to be penalized" + # no other tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == -float('inf')) == len( + tokens_to_check + ), f"Expected only {len(tokens_to_check)} to be penalized" + else: + # no tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == + -float('inf')) == 0, "No tokens should have been penalized" + + for test_case in test_cases: + run_test_case(**test_case) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_mixed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + expected_tokens: List[Optional[List[int]]] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + expected: Optional[List[int]] = None + sampling_type = random.randint(0, 2) + if sampling_type == 0: + sampling_params = SamplingParams(temperature=0) + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] + elif sampling_type in (1, 2): + n = random.randint(1, 10) + sampling_params = SamplingParams( + temperature=random.random() + 0.1, + top_p=min(random.random() + 0.1, 1), + 
top_k=random.randint(0, 10) or -1, + n=n, + presence_penalty=random.randint(0, 1), + ) + if sampling_type == 2: + sampling_params.seed = random.randint(0, 10000) + else: + for idx in range(n): + fake_logits[i, i + idx] = 1e2 + expected = list(range(i, i + n)) + + expected_tokens.append(expected) + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + generators: Dict[str, torch.Generator] = {} + + def test_sampling(): + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available(), + generators=generators) + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + for i, (sequence_output, metadata) in enumerate( + zip(sampler_output, seq_group_metadata_list)): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation + expected_tokens[i] = [ + nth_output.output_token + for nth_output in sequence_output.samples + ] + continue + + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + + for n, nth_output in enumerate(sequence_output.samples): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): + # Ensure exact matches for greedy or random with seed + assert nth_output.output_token == expected_tokens_item[n] + else: + # For non-seeded random check that one of the high-logit + # tokens were chosen + assert nth_output.output_token in expected_tokens_item + + # Test batch + test_sampling() + + # Shuffle the batch and resample + target_index = list(range(batch_size)) + for list_to_shuffle in (target_index, seq_group_metadata_list, + expected_tokens, seq_lens): + random.Random(seed).shuffle(list_to_shuffle) + target_index = torch.tensor(target_index) + input_tensor.data = input_tensor.index_select(0, target_index) + fake_logits.data = fake_logits.index_select(0, target_index) + + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch + test_sampling() + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_top_k_top_p(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + top_k = random.randint(100, 500) + top_p = random.random() * 0.1 + vocab_size = 32000 + input_tensor = torch.rand((batch_size, 1024), + device=device, + dtype=torch.float16) + fake_logits = torch.normal(0, + 5, + size=(batch_size, vocab_size), + device=input_tensor.device, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + + generation_model = GenerationMixin() + generation_config = GenerationConfig(top_k=top_k, + top_p=top_p, + do_sample=True) + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + generation_model._prepare_special_tokens(generation_config, device=device) + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + 
device=device) + assert len(processors) == 2 # top_p and top_k + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=SamplingParams( + temperature=1, + top_k=top_k, + top_p=top_p, + ), + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + sample_probs = None + + def mock_sample(probs, *args, **kwargs): + nonlocal sample_probs + sample_probs = probs + return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] + for prob in probs], None) + + # top-k and top-p is only calculated when flashinfer kernel is not available + with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ + patch("vllm.model_executor.layers.sampler." + "flashinfer_top_k_top_p_sampling", None): + sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) + torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) + assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_flashinfer_fallback(seed: int, device: str): + if not envs.VLLM_USE_FLASHINFER_SAMPLER: + pytest.skip("Flashinfer sampler is disabled") + + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + def failing_flashinfer_sampling(*_args, **_kwargs): + return None, torch.zeros(batch_size, device=device, dtype=torch.int32) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + with patch( + "vllm.model_executor.layers.sampler." 
+ "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): + fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert sampler_output == fallback_sampler_output + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_include_gpu_probs_tensor(device: str): + set_random_seed(42) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + sampler.include_gpu_probs_tensor = True + sampler.should_modify_greedy_probs_inplace = False + + sampling_params = SamplingParams(temperature=0) + + mock_inplace = Mock() + with patch( + "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", + mock_inplace): + + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + mock_inplace.assert_not_called() + + assert sampler_output.sampled_token_probs is not None + assert sampler_output.logprobs is not None + assert sampler_output.sampled_token_ids is not None diff --git a/vllm_mindspore/tests/test_sampler.py b/vllm_mindspore/tests/test_sampler.py deleted file mode 100644 index 24db77803..000000000 --- a/vllm_mindspore/tests/test_sampler.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2024 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -import vllm_mindspore -import itertools -import random -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple -from unittest.mock import Mock, patch - -import pytest -import torch - -from vllm_mindspore.model_executor.layers.sampler import Sampler -from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -# from vllm_mindspore.model_executor.utils import set_random_seed -from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - -def _prepare_test( - batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=False) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - -def test_sampler_all_greedy(): - # set_random_seed(seed) - device='cuda' - # torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = mint.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -def test_sampler_all_random(): - # set_random_seed(seed) - # torch.set_default_device(device) - device='cuda' - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == 
i - - - -def test_sampler_repetition_penalty_mixed(): - device='cuda' - vocab_size = 8 - - def test_sampling_params(sampling_params: List[SamplingParams]): - - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=False) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - print(f'fake_logits is: {fake_logits}', flush = True) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] \ No newline at end of file -- Gitee From 05359c6d51add7c23de430cd6972dafeeae07e66 Mon Sep 17 00:00:00 2001 From: one_east Date: Wed, 12 Mar 2025 17:40:45 +0800 Subject: [PATCH 23/82] update v0.7.3 --- vllm_mindspore/__init__.py | 20 +- vllm_mindspore/attention/backends/ms_attn.py | 23 +- vllm_mindspore/attention/ops/paged_attn.py | 2 - vllm_mindspore/distributed/parallel_state.py | 16 +- .../executor/multiproc_worker_utils.py | 1 - vllm_mindspore/executor/ray_gpu_executor.py | 475 ++++++++++-------- .../model_executor/layers/linear.py | 17 +- .../model_executor/layers/logits_processor.py | 53 +- .../layers/quantization/base_config.py | 6 +- .../model_executor/layers/sampler.py | 11 +- .../layers/vocab_parallel_embedding.py | 26 +- .../model_executor/model_loader/utils.py | 10 +- vllm_mindspore/model_executor/models/llama.py | 36 +- .../models/mf_models/deepseek_v3.py | 27 +- .../model_executor/models/mf_models/qwen2.py | 27 +- vllm_mindspore/model_executor/models/qwen2.py | 12 +- .../model_executor/models/registry.py | 29 +- .../model_executor/sampling_metadata.py | 3 - vllm_mindspore/model_executor/utils.py | 2 - vllm_mindspore/platforms/ascend.py | 125 ++--- vllm_mindspore/utils.py | 64 ++- vllm_mindspore/worker/cache_engine.py | 3 +- vllm_mindspore/worker/model_runner.py | 1 - vllm_mindspore/worker/worker.py | 14 +- 24 files changed, 537 insertions(+), 466 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index a4a1f4763..b9849cb46 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -70,15 +70,18 @@ vllm.executor.cuda_device_count_stateless = ascend_device_count_stateless from vllm_mindspore.model_executor.models.registry import ( MindSporeModelRegistry, - _run_in_subprocess, + _SUBPROCESS_COMMAND, ) +vllm.config.ModelRegistry = MindSporeModelRegistry + import vllm.model_executor 
vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry -vllm.config.ModelRegistry = MindSporeModelRegistry +vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture +# To patching the get_model_architecture, should import it first. from vllm.model_executor.model_loader import get_model_architecture vllm.model_executor.model_loader.get_model_architecture = get_ms_model_architecture @@ -88,7 +91,6 @@ vllm.model_executor.model_loader.utils.get_model_architecture = ( vllm.model_executor.model_loader.loader.get_model_architecture = ( get_ms_model_architecture ) -vllm.model_executor.models.registry._run_in_subprocess = _run_in_subprocess from vllm_mindspore.model_executor.sampling_metadata import ( SequenceGroupToSample, @@ -102,12 +104,6 @@ vllm.model_executor.sampling_metadata.SequenceGroupToSample = SequenceGroupToSam vllm.model_executor.sampling_metadata.SamplingMetadataCache = SamplingMetadataCache vllm.model_executor.sampling_metadata.SamplingMetadata = SamplingMetadata -from vllm_mindspore.attention.selector import get_ms_attn_backend - -import vllm.attention - -vllm.attention.get_attn_backend = get_ms_attn_backend - from vllm_mindspore.worker.cache_engine import ( ms_allocate_kv_cache, ms_swap_in, @@ -161,6 +157,7 @@ vllm.distributed.parallel_state.init_model_parallel_group = init_model_parallel_ from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, ) +# To patching the get_mp_context, should import it first. from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context @@ -170,10 +167,11 @@ from vllm_mindspore.executor.ray_gpu_executor import ( initialize_ray_cluster, ) -from vllm.executor.ray_gpu_executor import RayGPUExecutor +from vllm.executor.ray_distributed_executor import RayDistributedExecutor -RayGPUExecutor._init_workers_ray = ms_init_workers_ray +RayDistributedExecutor._init_workers_ray = ms_init_workers_ray +vllm.executor.ray_distributed_executor.initialize_ray_cluster = initialize_ray_cluster vllm.executor.ray_utils.initialize_ray_cluster = initialize_ray_cluster import vllm.engine.llm_engine diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 37ad40379..c99fc91c1 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -30,6 +30,7 @@ from vllm.attention.backends.abstract import ( AttentionMetadataBuilder, AttentionType, AttentionState, + AttentionLayer, ) if TYPE_CHECKING: @@ -85,8 +86,9 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): cross_slot_mapping: Optional[torch.Tensor] = None cross_block_tables: Optional[torch.Tensor] = None - # TODO(tronzhang): No need to use cuda_graph for mindspore. 
use_cuda_graph: bool = False + enable_kv_scales_calculation: bool + @property def prefill_metadata(self): @@ -177,6 +179,12 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -190,10 +198,6 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size def _add_seq_group( self, @@ -310,7 +314,6 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): num_decode_tokens = self.num_decode_tokens if use_captured_graph: - # TODO(tronzhang): Maybe here only turn graph mode on , and go with then same condition branch logic? raise RuntimeError("Doesnot support captured graph now!") else: block_tables = make_tensor_with_pad( @@ -334,6 +337,7 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, ) @@ -342,7 +346,7 @@ class MsAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - raise "MS_ATTN" + return "MS_ATTN" @staticmethod def get_impl_cls() -> Type["AttentionImpl"]: @@ -399,7 +403,6 @@ class MsAttentionBackend(AttentionBackend): kv_caches: List[MsKVCache], src_to_dists: torch.Tensor, ) -> None: - # TODO(tronzhang): this may be slow, a faster interface should be implemented by custom op! blocks_to_copy = src_to_dists.asnumpy().tolist() for kv_cache in kv_caches: npu_key_block, npu_value_block = kv_cache @@ -445,18 +448,18 @@ class MsAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: pass def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: MSAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: diff --git a/vllm_mindspore/attention/ops/paged_attn.py b/vllm_mindspore/attention/ops/paged_attn.py index abfb37dca..df9394c78 100644 --- a/vllm_mindspore/attention/ops/paged_attn.py +++ b/vllm_mindspore/attention/ops/paged_attn.py @@ -31,8 +31,6 @@ if HAS_TRITON: # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 -# TODO(tronzhang): delete all not work codes. 
- @dataclass class PagedAttentionMetadata: diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index 71e2cdd2d..58cf43489 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -28,28 +28,16 @@ def init_model_parallel_group( group_ranks: List[List[int]], local_rank: int, backend: str, - use_custom_allreduce: Optional[bool] = None, use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ) -> "GroupCoordinator": - from vllm.distributed.parallel_state import ( - GroupCoordinator, - _ENABLE_CUSTOM_ALL_REDUCE, - ) - - if use_custom_allreduce is None: - use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE + from vllm.distributed.parallel_state import GroupCoordinator - # TODO(tronzhang): mindspore doesnot support enough communicate cpu ops, set use_message_queue_broadcaster to False now. return GroupCoordinator( group_ranks=group_ranks, local_rank=local_rank, torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=use_custom_allreduce, - use_tpu_communicator=True, - use_hpu_communicator=True, - use_xpu_communicator=True, + use_device_communicator=True, use_message_queue_broadcaster=False, group_name=group_name, ) diff --git a/vllm_mindspore/executor/multiproc_worker_utils.py b/vllm_mindspore/executor/multiproc_worker_utils.py index 8b24cf014..86986fa6f 100644 --- a/vllm_mindspore/executor/multiproc_worker_utils.py +++ b/vllm_mindspore/executor/multiproc_worker_utils.py @@ -20,5 +20,4 @@ import multiprocessing def get_mp_context(): - # TODO(tronzhang): support spawn latter... return multiprocessing.get_context("fork") diff --git a/vllm_mindspore/executor/ray_gpu_executor.py b/vllm_mindspore/executor/ray_gpu_executor.py index 3e6369462..d9c2affd6 100644 --- a/vllm_mindspore/executor/ray_gpu_executor.py +++ b/vllm_mindspore/executor/ray_gpu_executor.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +import os from typing import Dict, List, Optional from collections import defaultdict @@ -25,7 +26,7 @@ from vllm.logger import init_logger from vllm.config import ParallelConfig from vllm.platforms import current_platform from vllm.executor.ray_utils import RayWorkerWrapper, ray, available_resources_per_node -from vllm.executor.ray_gpu_executor import PlacementGroupSchedulingStrategy +from vllm.executor.ray_distributed_executor import PlacementGroupSchedulingStrategy logger = init_logger(__name__) @@ -38,220 +39,265 @@ class MsRayWorkerWrapper(RayWorkerWrapper): def ms_init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 - - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] - - # Used in ray compiled DAG: indexed first by PP rank, - # and then TP rank. In other words, the inner list is - # the TP group of workers for a PP rank. 
- self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) - - # Create the workers. - driver_ip = get_ip() - workers = [] - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("NPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=0, - resources={"NPU": 1}, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(MsRayWorkerWrapper).remote(vllm_config=self.vllm_config) - workers.append(worker) - - worker_ip_refs = [ - worker.get_node_ip.remote() # type: ignore[attr-defined] - for worker in workers - ] - worker_ips = ray.get(worker_ip_refs) - - if not self.use_ray_spmd_worker: - for i in range(len(workers)): - worker = workers[i] - worker_ip = worker_ips[i] - if self.driver_dummy_worker is None and worker_ip == driver_ip: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = MsRayWorkerWrapper( - vllm_config=self.vllm_config) - workers.pop(i) - worker_ips.pop(i) - self.workers = workers - break - else: - self.workers = workers - - logger.debug("workers: %s", self.workers) - logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) - if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. Consider " - "adjusting the Ray placement group or running the driver on a " - "NPU node.") - - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote()) \ - ) # type: ignore - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. 
- # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips + [driver_ip]) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP`" - " environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - - if self.use_ray_spmd_worker: - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - self.pp_tp_workers.append([]) - for tp_rank in range( - self.parallel_config.tensor_parallel_size): - # PP=2, TP=4 - # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] - rank = (pp_rank * self.parallel_config.tensor_parallel_size - ) + tp_rank - assert len(self.pp_tp_workers[pp_rank]) == tp_rank - assert pp_rank < len(self.pp_tp_workers) - self.pp_tp_workers[pp_rank].append(self.workers[rank]) - - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[RayWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[RayWorkerWrapper] = [] - - # Enforce rank order for correct rank to return final output. - for index, worker in enumerate(self.workers): - # The driver worker is rank 0 and not in self.workers. 
- rank = index + 1 - if rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) + from vllm.executor.ray_distributed_executor import RayWorkerMetaData + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Used in ray compiled DAG: indexed first by PP rank, + # and then TP rank. In other words, the inner list is + # the TP group of workers for a PP rank. + self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] + + if self.parallel_config.ray_workers_use_nsight: + ray_remote_kwargs = self._configure_ray_workers_use_nsight( + ray_remote_kwargs) + + logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) + + # Create the workers. + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. + bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") else: - self.non_driver_workers.append(worker) + # use the first N bundles that have GPU resources. + bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + + worker_metadata: List[RayWorkerMetaData] = [] + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + if current_platform.ray_device_key == "GPU": + # NV+AMD GPUs, and Intel XPUs + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + else: + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={current_platform.ray_device_key: num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(MsRayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + worker_metadata.append( + RayWorkerMetaData(worker=worker, created_rank=rank)) + + worker_ips = ray.get([ + each.worker.get_node_ip.remote() # type: ignore[attr-defined] + for each in worker_metadata + ]) + + for each, ip in zip(worker_metadata, worker_ips): + each.ip = ip + + if not self.use_ray_spmd_worker: + for i, each in enumerate(worker_metadata): + # find and remove the dummy worker from the list + worker = each.worker + worker_ip = each.ip + if self.driver_dummy_worker is None and worker_ip == driver_ip: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. 
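The bundle selection earlier in this hunk either honors `VLLM_RAY_BUNDLE_INDICES` or falls back to the first `world_size` bundles that expose the platform's device resource. A standalone sketch of that selection, where plain `os.environ` and made-up bundle specs stand in for `envs` and the Ray placement group:

```python
import os

world_size = 2
ray_device_key = "NPU"                       # current_platform.ray_device_key on Ascend
bundle_specs = [{"CPU": 1}, {"NPU": 1}, {"NPU": 1}, {"NPU": 1}]

env = os.environ.get("VLLM_RAY_BUNDLE_INDICES", "")
if env:
    bundle_indices = list(map(int, env.split(",")))
    assert len(bundle_indices) == world_size, "must match the world size"
    assert len(set(bundle_indices)) == len(bundle_indices), "no duplicates allowed"
else:
    # use the first world_size bundles that actually carry the device resource
    bundle_indices = [i for i, b in enumerate(bundle_specs) if b.get(ray_device_key, 0)]
    bundle_indices = bundle_indices[:world_size]

print(bundle_indices)                        # [1, 2] when the env var is unset
```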
+ self.driver_dummy_worker = worker + self.driver_worker = MsRayWorkerWrapper( + vllm_config=self.vllm_config, rpc_rank=0) + worker_metadata.pop(i) + break + + logger.debug("workers: %s", worker_metadata) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. + """ + ip = item.ip + return (0 if ip == driver_ip else 1, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + sorted_worker_metadata = sorted(worker_metadata, + key=sort_by_driver_then_worker_ip) + start_rank = 0 if self.use_ray_spmd_worker else 1 + for i, item in enumerate(sorted_worker_metadata): + item.adjusted_rank = i + start_rank + self.workers = [item.worker for item in sorted_worker_metadata] + rerank_mapping = { + item.created_rank: item.adjusted_rank + for item in sorted_worker_metadata + } + self._run_workers("adjust_rank", rerank_mapping) + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + all_ips = set(worker_ips + [driver_ip]) + n_ips = len(all_ips) + n_nodes = len(node_workers) + + if n_nodes != n_ips: + raise RuntimeError( + f"Every node should have a unique IP address. Got {n_nodes}" + f" nodes with node ids {list(node_workers.keys())} and " + f"{n_ips} unique IP addresses {all_ips}. Please check your" + " network configuration. If you set `VLLM_HOST_IP`" + " environment variable, make sure it is unique for" + " each node.") + + # Set environment variables for the driver and workers. 
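The `sort_by_driver_then_worker_ip` key above orders workers by driver co-location, then node crowding, then IP string. A tiny self-contained check of that ordering (IP addresses are made up):

```python
driver_ip = "10.0.0.1"
worker_ips = ["10.0.0.2", "10.0.0.1", "10.0.0.3", "10.0.0.2"]

ip_counts = {}
for ip in worker_ips:
    ip_counts[ip] = ip_counts.get(ip, 0) + 1

def sort_key(ip):
    # driver node first, then nodes with fewer workers, then smaller IP strings
    return (0 if ip == driver_ip else 1, ip_counts[ip], ip)

print(sorted(worker_ips, key=sort_key))
# ['10.0.0.1', '10.0.0.3', '10.0.0.2', '10.0.0.2']
```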
+ all_args_to_update_environment_variables = [{ + current_platform.device_control_env_var: + ",".join(map(str, node_gpus[node_id])), + } for (node_id, _) in worker_node_and_gpu_ids] + + for args in all_args_to_update_environment_variables: + # some carry-over env vars from the driver + # TODO: refactor platform-specific env vars + for name in [ + "VLLM_ATTENTION_BACKEND", + "TPU_CHIPS_PER_HOST_BOUNDS", + "TPU_HOST_BOUNDS", + "VLLM_USE_V1", + "VLLM_TRACE_FUNCTION", + ]: + if name in os.environ: + args[name] = os.environ[name] + + self._env_vars_for_all_workers = ( + all_args_to_update_environment_variables) + + self._run_workers("update_environment_variables", + self._get_env_vars_to_be_updated()) + + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + all_kwargs = [] + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): + local_rank = node_workers[node_id].index(rank) + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self._run_workers("init_worker", all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + if self.use_ray_spmd_worker: + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + self.pp_tp_workers.append([]) + for tp_rank in range( + self.parallel_config.tensor_parallel_size): + # PP=2, TP=4 + # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] + rank = (pp_rank * self.parallel_config.tensor_parallel_size + ) + tp_rank + assert len(self.pp_tp_workers[pp_rank]) == tp_rank + assert pp_rank < len(self.pp_tp_workers) + self.pp_tp_workers[pp_rank].append(self.workers[rank]) + + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + # Enforce rank order for correct rank to return final output. + for index, worker in enumerate(self.workers): + # The driver worker is rank 0 and not in self.workers. 
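The `pp_tp_workers` grouping above maps a flat worker rank to a (PP rank, TP rank) pair. A short standalone check of the indexing, using the PP=2, TP=4 case from the code comment:

```python
pipeline_parallel_size, tensor_parallel_size = 2, 4
workers = list(range(pipeline_parallel_size * tensor_parallel_size))  # flat ranks 0..7

pp_tp_workers = []
for pp_rank in range(pipeline_parallel_size):
    pp_tp_workers.append([])
    for tp_rank in range(tensor_parallel_size):
        rank = pp_rank * tensor_parallel_size + tp_rank
        pp_tp_workers[pp_rank].append(workers[rank])

print(pp_tp_workers)   # [[0, 1, 2, 3], [4, 5, 6, 7]]
```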
+ rank = index + 1 + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) def initialize_ray_cluster( @@ -355,3 +401,4 @@ def initialize_ray_cluster( _verify_bundles(current_placement_group, parallel_config, device_str) # Set the placement group in the parallel config parallel_config.placement_group = current_placement_group + diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 09e1b84a5..45aa4c439 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -56,6 +56,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "HQQMarlinMethod", + "QuarkLinearMethod" ] @@ -498,19 +499,21 @@ class RowParallelLinear(LinearBase): input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix ) - self.input_is_parallel = input_is_parallel - self.reduce_results = reduce_results - # Divide the weight matrix along the last dimension. self.tp_rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + assert self.quant_method is not None self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size_per_partition, - output_partition_sizes=[self.output_size], + output_partition_sizes=self.output_partition_sizes, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, @@ -570,10 +573,14 @@ class RowParallelLinear(LinearBase): tp_rank = get_tensor_model_parallel_rank() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # bitsandbytes loads the weights of the specific portion # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size).contiguous() diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index b9beb080b..cc0550378 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -17,12 +17,15 @@ # ============================================================================ """A layer that compute logits from hidden_stats.""" import inspect +from concurrent.futures import ThreadPoolExecutor from typing import Optional import mindspore.nn as nn from mindspore import Tensor from mindspore import mint +import vllm.envs as envs +from vllm.config import get_current_vllm_config from vllm.distributed import ( tensor_model_parallel_all_gather, tensor_model_parallel_gather, @@ -33,8 +36,11 @@ from vllm_mindspore.model_executor.layers.vocab_parallel_embedding import ( from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -# 
TODO(tronzhang): Use vllm's logits_processor.py latter... +_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None +if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: + _logits_processor_threadpool = ThreadPoolExecutor( + envs.VLLM_LOGITS_PROCESSOR_THREADS) class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. @@ -67,7 +73,10 @@ class LogitsProcessor(nn.Cell): # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - self.use_gather = not current_platform.is_tpu() + parallel_config = get_current_vllm_config().parallel_config + self.use_gather = not current_platform.is_tpu() \ + or envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" def construct( self, @@ -106,7 +115,7 @@ class LogitsProcessor(nn.Cell): embedding_bias: Optional[Tensor], ) -> Optional[Tensor]: # Get the logits for the next tokens. - logits = lm_head.linear_method.apply( + logits = lm_head.quant_method.apply( lm_head, hidden_states, bias=embedding_bias ) if self.use_gather: @@ -150,6 +159,7 @@ def _apply_logits_processors( ) -> Tensor: found_logits_processors = False logits_processed = 0 + logits_row_ids_and_logits_row_futures = [] for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -162,22 +172,39 @@ def _apply_logits_processors( past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor( - prompt_tokens_ids, past_tokens_ids, logits_row - ) - else: - logits_row = logits_processor(past_tokens_ids, logits_row) - - logits[logits_row_idx] = logits_row + if _logits_processor_threadpool is not None: + logits_row_ids_and_logits_row_futures.append( + (logits_row_idx, + _logits_processor_threadpool.submit( + _apply_logits_processors_single_seq, logits_row, + logits_processors, past_tokens_ids, + prompt_tokens_ids))) + else: + logits[logits_row_idx] = \ + _apply_logits_processors_single_seq( + logits_row, logits_processors, past_tokens_ids, + prompt_tokens_ids) logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices ) + + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: + logits[logits_row_idx] = future.result() if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] return logits + +def _apply_logits_processors_single_seq(logits_row, logits_processors, + past_tokens_ids, + prompt_tokens_ids) -> Tensor: + for logits_processor in logits_processors: + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, logits_row) + return logits_row \ No newline at end of file diff --git a/vllm_mindspore/model_executor/layers/quantization/base_config.py b/vllm_mindspore/model_executor/layers/quantization/base_config.py index 5d3b0acb0..afc957ce8 100644 --- a/vllm_mindspore/model_executor/layers/quantization/base_config.py +++ b/vllm_mindspore/model_executor/layers/quantization/base_config.py @@ -21,7 +21,6 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Type 
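Stepping back to the logits-processor change above: when `VLLM_LOGITS_PROCESSOR_THREADS` is set, each row is submitted to a thread pool and the results are written back after the loop. A minimal sketch of that submit/collect pattern with toy processors (the real code dispatches `_apply_logits_processors_single_seq` per sequence):

```python
from concurrent.futures import ThreadPoolExecutor

def apply_processors(logits_row, processors, past_token_ids):
    for processor in processors:
        logits_row = processor(past_token_ids, logits_row)
    return logits_row

double = lambda past_token_ids, row: [x * 2 for x in row]   # toy logits processor

rows = [[1.0, 2.0], [3.0, 4.0]]
with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [(i, pool.submit(apply_processors, row, [double], []))
               for i, row in enumerate(rows)]
    for i, future in futures:
        rows[i] = future.result()        # collect results after submitting all rows

print(rows)   # [[2.0, 4.0], [6.0, 8.0]]
```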
import mindspore as ms -# TODO(tronzhang): Use vllm's quantization base_config.py latter. class QuantizeMethodBase(ABC): """Base class for different quantized methods.""" @@ -58,6 +57,11 @@ class QuantizeMethodBase(ABC): class QuantizationConfig(ABC): """Base class for quantization configs.""" + def __init__(self): + super().__init__() + # mapping is updated by models as they initialize + self.packed_modules_mapping: Dict[str, List[str]] = dict() + @abstractmethod def get_name(self) -> str: """Name of the quantization method.""" diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 65ce69125..1b91157fd 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -91,7 +91,6 @@ class SampleResultArgsType: sample_results_dict: SampleResultsDictType sampling_metadata: SamplingMetadata greedy_samples: Optional[torch.Tensor] - beam_search_logprobs: Optional[torch.Tensor] # Union of non-deferred (single-step scheduling) @@ -704,14 +703,12 @@ def get_pythonized_sample_results( sampling_metadata, greedy_samples, multinomial_samples, - beam_search_logprobs, sample_results_dict, ) = ( sample_result_args.sample_metadata, sample_result_args.sampling_metadata, sample_result_args.greedy_samples, sample_result_args.multinomial_samples, - sample_result_args.beam_search_logprobs, sample_result_args.sample_results_dict, ) @@ -724,9 +721,6 @@ def get_pythonized_sample_results( elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): sample_results = _random_sample(seq_groups, multinomial_samples[sampling_type]) - elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, - beam_search_logprobs) sample_results_dict.update(zip(seq_group_id, sample_results)) return [ @@ -768,7 +762,6 @@ def _sample_with_torch( sample_metadata: SampleMetadataType = {} multinomial_samples: MultinomialSamplesType = {} greedy_samples: Optional[torch.Tensor] = None - beam_search_logprobs: Optional[torch.Tensor] = None # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -837,8 +830,6 @@ def _sample_with_torch( sampled_token_ids_tensor[long_sample_indices] = \ multinomial_samples[sampling_type].to(torch.long) - elif sampling_type == SamplingType.BEAM: - beam_search_logprobs = logprobs[sample_indices] else: raise ValueError(f"Unsupported sampling type: {sampling_type}") @@ -849,7 +840,6 @@ def _sample_with_torch( sample_metadata=sample_metadata, multinomial_samples=multinomial_samples, greedy_samples=greedy_samples, - beam_search_logprobs=beam_search_logprobs, sample_results_dict=sample_results_dict) if not sampling_metadata.skip_sampler_cpu_output: @@ -1327,3 +1317,4 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: next_prompt_tokens = prompt_tokens[ next_token_index_start:next_token_index_end] return next_prompt_tokens + diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index d2e00d199..81ebbe119 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -36,8 +36,6 @@ from mindspore import jit DEFAULT_VOCAB_PADDING_SIZE = 64 -# TODO(tronzhang): Most same as vllm's one, check latter... 
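In the `VocabParallelEmbedding` hunk below, the renamed resolution reads `if quant_method is not None:` immediately after setting `quant_method = None`, so `get_quant_method` can never run; the guard presumably intends to test `quant_config`, as the linear layers do. A self-contained sketch of that intended fallback, with toy classes and illustrative names:

```python
class UnquantizedEmbeddingMethod:
    """Fallback when no quantization config provides an embedding method."""

class ToyQuantConfig:
    def get_quant_method(self, layer, prefix):
        return "quantized-embedding-method"

def resolve_quant_method(quant_config, layer=None, prefix=""):
    quant_method = None
    if quant_config is not None:               # branch on the config, not on the result
        quant_method = quant_config.get_quant_method(layer, prefix=prefix)
    if quant_method is None:
        quant_method = UnquantizedEmbeddingMethod()
    return quant_method

print(resolve_quant_method(None))              # falls back to UnquantizedEmbeddingMethod
print(resolve_quant_method(ToyQuantConfig()))  # "quantized-embedding-method"
```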
- class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" @@ -224,26 +222,26 @@ class VocabParallelEmbedding(nn.Cell): self.embedding_dim = embedding_dim - linear_method = None - if quant_config is not None: - linear_method = quant_config.get_quant_method(self, prefix=prefix) - if linear_method is None: - linear_method = UnquantizedEmbeddingMethod() + quant_method = None + if quant_method is not None: + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. is_embedding_layer = type(self.__class__) is VocabParallelEmbedding - linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method) + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method) ) - if is_embedding_layer and not linear_method_implements_embedding: + if is_embedding_layer and not quant_method_implements_embedding: raise NotImplementedError( - f"The class {type(linear_method).__name__} must implement " + f"The class {type(quant_method).__name__} must implement " "the 'embedding' method, see UnquantizedEmbeddingMethod." ) - self.linear_method: QuantizeMethodBase = linear_method + self.quant_method: QuantizeMethodBase = quant_method if params_dtype is None: params_dtype = mstype.float16 @@ -264,7 +262,7 @@ class VocabParallelEmbedding(nn.Cell): - self.shard_indices.added_vocab_start_index ) - self.linear_method.create_weights( + self.quant_method.create_weights( self, self.embedding_dim, [self.num_embeddings_per_partition], @@ -328,7 +326,7 @@ class VocabParallelEmbedding(nn.Cell): else: masked_input, input_mask = input_, None # Get the embeddings. - output_parallel = self.linear_method.embedding(self, masked_input) + output_parallel = self.quant_method.embedding(self, masked_input) # Mask the output embedding. if self.tp_size > 1: output_parallel = mint.mul(output_parallel, input_mask) diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index 7d0ef95d3..66295a32c 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -20,14 +20,20 @@ from typing import Tuple, Type from torch import nn -from vllm.config import ModelConfig +from vllm.config import ModelConfig, ModelImpl +from vllm.model_executor.models import ModelRegistry from vllm_mindspore.model_executor.models.registry import MindSporeModelRegistry - +from vllm.model_executor.model_loader.utils import resolve_transformers_fallback def get_ms_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) + vllm_supported_archs = ModelRegistry.get_supported_archs() + is_vllm_supported = any(arch in vllm_supported_archs + for arch in architectures) + if not is_vllm_supported: + raise RuntimeError("vLLM-Mindspore does not support %s for now." 
% str(architectures)) model_cls, arch = MindSporeModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": raise RecursionError("MindSpore unsupport embed model task now!") diff --git a/vllm_mindspore/model_executor/models/llama.py b/vllm_mindspore/model_executor/models/llama.py index 90ea548b7..19cb6e3a8 100644 --- a/vllm_mindspore/model_executor/models/llama.py +++ b/vllm_mindspore/model_executor/models/llama.py @@ -55,6 +55,7 @@ from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm.sequence import IntermediateTensors from vllm.attention import AttentionMetadata from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name from mindspore import Tensor, mint, jit, nn from mindspore import dtype as mstype @@ -116,6 +117,7 @@ class LlamaAttention(nn.Cell): max_position_embeddings: int = 8192, quant_config=None, bias: bool = False, + bias_o_proj: bool = False, cache_config=None, prefix: str = "", ) -> None: @@ -140,6 +142,9 @@ class LlamaAttention(nn.Cell): self.head_dim = getattr( config, "head_dim", self.hidden_size // self.total_num_heads ) + # Phi models introduced a partial_rotary_factor parameter in the config + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) + self.rotary_dim = int(partial_rotary_factor * self.head_dim) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -159,13 +164,14 @@ class LlamaAttention(nn.Cell): self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, - bias=bias, + bias=bias_o_proj, quant_config=quant_config, prefix=f"{prefix}.o_proj", ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": is_neox_style = False self.rotary_emb = get_rope( self.head_dim, @@ -177,13 +183,14 @@ class LlamaAttention(nn.Cell): ) if hasattr(config, "interleaved_sliding_window"): - if isinstance(config.interleaved_sliding_window, int): - sliding_window = config.interleaved_sliding_window - elif isinstance(config.interleaved_sliding_window, list): - sw_idx = layer_idx % len(config.interleaved_sliding_window) - sliding_window = config.interleaved_sliding_window[sw_idx] + interleaved_sliding_window = config.interleaved_sliding_window + if isinstance(interleaved_sliding_window, int): + sliding_window = interleaved_sliding_window + elif isinstance(interleaved_sliding_window, list): + sw_idx = layer_idx % len(interleaved_sliding_window) + sliding_window = interleaved_sliding_window[sw_idx] else: - raise ValueError(f"{type(sliding_window)} is not supported.") + raise ValueError(f"{type(interleaved_sliding_window)} is not supported.") else: sliding_window = None @@ -246,6 +253,11 @@ class LlamaDecoderLayer(nn.Cell): attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False ) + bias_o_proj = attention_bias + # support internlm/internlm3-8b with qkv_bias + if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + self.self_attn = LlamaAttention( config=config, hidden_size=self.hidden_size, @@ -258,6 +270,7 @@ class LlamaDecoderLayer(nn.Cell): max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + bias_o_proj=bias_o_proj, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ 
-329,11 +342,6 @@ class LlamaModel(nn.Cell): config = vllm_config self.config = config self.padding_idx = config.pad_token_id - # TODO: Support lora_config - # lora_config = config - # lora_vocab = (lora_config.lora_extra_vocab_size * - # (lora_config.max_loras or 1)) if lora_config else 0 - # self.vocab_size = config.vocab_size + lora_vocab self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size # TODO: Support quant_config cache_config @@ -557,4 +565,4 @@ class LlamaForCausalLM(MsModelBase, SupportsPP): sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) - return logits + return logits \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index d4a492b92..97b1e1a16 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -17,6 +17,7 @@ # ============================================================================ import os +import torch from typing import Iterable, List, Optional, Set, Tuple, Union from pathlib import Path @@ -24,10 +25,13 @@ import numpy as np from vllm.attention import AttentionMetadata from vllm.config import VllmConfig +from vllm.config import CacheConfig, get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.forward_context import ForwardContext, get_forward_context from vllm.sequence import IntermediateTensors from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.attention.backends.abstract import AttentionType from vllm.logger import init_logger @@ -84,6 +88,13 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) +class Fake_Attention: + def __init__(self): + self.kv_cache = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER class DeepseekV3ForCausalLM(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -152,12 +163,21 @@ class DeepseekV3ForCausalLM(MsModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - def update_mf_kvcaches(self, kv_caches): + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + + def update_mf_kvcaches(self): if self.mf_kvcaches_init: return + forward_context = get_forward_context() for i in range(self.mf_model_config.num_layers): - k_cache = kv_caches[i][0] + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] mf_k_cache, _ = self.network.kvcache(i) mf_k_cache.set_device_address( @@ -174,7 +194,7 @@ class DeepseekV3ForCausalLM(MsModelBase): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches(kv_caches) + self.update_mf_kvcaches() is_prefill = True if attn_metadata.prefill_metadata else False @@ -240,4 +260,3 @@ class 
DeepseekV3ForCausalLM(MsModelBase): return model_name_or_path else: raise ValueError("The 'model' in LLM should be the local path of the MindSpore checkpoint file.") - diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index c0a959d0d..81afbb0d7 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -17,6 +17,7 @@ # ============================================================================ import os +import torch from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np @@ -24,8 +25,11 @@ import numpy as np from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.config import CacheConfig, get_current_vllm_config +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.attention.backends.abstract import AttentionType from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -80,6 +84,13 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) +class Fake_Attention: + def __init__(self): + self.kv_cache = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER class Qwen2ForCausalLM(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -122,12 +133,22 @@ class Qwen2ForCausalLM(MsModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - def update_mf_kvcaches(self, kv_caches): + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + + def update_mf_kvcaches(self): if self.mf_kvcaches_init: return + forward_context = get_forward_context() for i in range(self.mf_model_config.num_layers): - k_cache, v_cache = kv_caches[i] + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] mf_k_cache, mf_v_cache = self.network.kvcache(i) mf_k_cache.set_device_address( k_cache._data_ptr(), k_cache.shape, k_cache.dtype @@ -146,7 +167,7 @@ class Qwen2ForCausalLM(MsModelBase): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches(kv_caches) + self.update_mf_kvcaches() is_prefill = True if attn_metadata.prefill_metadata else False diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index ca0e86ac8..90ce30416 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -397,7 +397,17 @@ class Qwen2Model(nn.Cell): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. 
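The `Fake_Attention` placeholders registered above (for both DeepSeek-V3 and Qwen2) let the model read per-layer KV caches from the static forward context instead of receiving a `kv_caches` argument. A self-contained sketch of that register/look-up pattern (names and the binding step are illustrative):

```python
class FakeAttention:
    def __init__(self, pipeline_parallel_size):
        # one KV-cache slot per virtual engine; the worker fills these later
        self.kv_cache = [None] * pipeline_parallel_size

num_layers, pipeline_parallel_size = 2, 1
static_forward_context = {}   # stands in for compilation_config.static_forward_context
kv_caches = [FakeAttention(pipeline_parallel_size) for _ in range(num_layers)]
for i in range(num_layers):
    static_forward_context[str(i)] = kv_caches[i]

# the worker binds the allocated caches into the placeholders...
kv_caches[0].kv_cache[0] = ("k_cache_layer0", "v_cache_layer0")

# ...and the model fetches them by layer index and virtual engine at forward time
virtual_engine = 0
k_cache, v_cache = static_forward_context["0"].kv_cache[virtual_engine]
print(k_cache, v_cache)
```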
continue - + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index 0fb037469..09243fcb4 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -60,32 +60,9 @@ MindSporeModelRegistry = _ModelRegistry( _T = TypeVar("_T") -def _run_in_subprocess(fn: Callable[[], _T]) -> _T: - with tempfile.TemporaryDirectory() as tempdir: - output_filepath = os.path.join(tempdir, "registry_output.tmp") - - # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps((fn, output_filepath)) - - # cannot use `sys.executable __file__` here because the script - # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm_mindspore.model_executor.models.registry"], - input=input_bytes, - capture_output=True, - ) - - # check if the subprocess is successful - try: - returned.check_returncode() - except Exception as e: - # wrap raised exception to provide more information - raise RuntimeError( - f"Error raised in subprocess:\n" f"{returned.stderr.decode()}" - ) from e - - with open(output_filepath, "rb") as f: - return pickle.load(f) +_SUBPROCESS_COMMAND = [ + sys.executable, "-m", "vllm.model_executor.models.registry" +] def _run() -> None: diff --git a/vllm_mindspore/model_executor/sampling_metadata.py b/vllm_mindspore/model_executor/sampling_metadata.py index c9d11a198..a016dd8e1 100644 --- a/vllm_mindspore/model_executor/sampling_metadata.py +++ b/vllm_mindspore/model_executor/sampling_metadata.py @@ -35,8 +35,6 @@ _SAMPLING_EPS = 1e-5 from mindspore import Tensor import mindspore as ms -# TODO(tronzhang): use vllm's SequenceGroupToSample. (now for tensor create pin/device and tensor.to) - @dataclass class SequenceGroupToSample: @@ -602,7 +600,6 @@ class SamplingTensors: # Because the memory is pinned, we can do non-blocking # transfer to device. - # TODO(tronzhang): mindspore tensor donot support tensor.to(device=xxx, non_blocking=xxx), but tensor.move_to(to, blocking=xxx). return cls( temperatures=temperatures_t, top_ps=top_ps_t, diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py index e1f5ec779..eb421de0b 100644 --- a/vllm_mindspore/model_executor/utils.py +++ b/vllm_mindspore/model_executor/utils.py @@ -19,8 +19,6 @@ from typing import Any, Dict, Optional from mindspore import Tensor -# TODO(tronzhang): Use vllm's latter... 
- def set_weight_attrs( weight: Tensor, diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index 65d31766e..31fe2c2e0 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -17,10 +17,10 @@ # ============================================================================ """Ascend platform.""" -from typing import TYPE_CHECKING, Optional +import os +from typing import (TYPE_CHECKING, Optional, Union, Tuple) import torch -import os import mindspore as ms from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -35,23 +35,25 @@ logger = init_logger(__name__) class AscendPlatform(Platform): - _enum = PlatformEnum.CUDA - device_name: str = "cuda" - device_type: str = "cuda" - dispatch_key: str = "CUDA" + + _enum = PlatformEnum.OOT + device_name: str = "npu" + device_type: str = "cuda" # To use cuda worker, executor... + simple_compile_backend: str = "npu" + ray_device_key: str = "NPU" + device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend): - """Get the default attention backend of a device.""" - return _Backend.FLASH_ATTN + def get_device_capability(cls, device_id: int = 0): + return True @classmethod - def get_device_capability( + def has_device_capability( cls, + capability: Union[Tuple[int, int], int], device_id: int = 0, - ) -> Optional[DeviceCapability]: - major, minor = torch.cuda.get_device_capability(device_id) - return DeviceCapability(major=major, minor=minor) + ) -> bool: + return True @classmethod def get_device_name(cls, device_id: int = 0) -> str: @@ -59,37 +61,10 @@ class AscendPlatform(Platform): return torch.cuda.get_device_name(device_id) @classmethod - def get_device_total_memory(cls, device_id: int = 0) -> int: - """Get the total memory of a device in bytes.""" - device_props = torch.cuda.get_device_properties(device_id) - return device_props.total_memory - - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - """ - Check if the current platform supports async output. - """ - if enforce_eager: - # from vllm.logger import init_logger - # logger = init_logger(__name__) - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. Since, enforce-eager is enabled, async output " - "processor cannot be used" - ) - return False + def is_async_output_supported(cls, _) -> bool: + """Check if the current platform supports async output.""" return True - @classmethod - def inference_mode(cls): - """A device-specific wrapper of `torch.inference_mode`. - - This wrapper is recommended because some hardware backends such as TPU - do not support `torch.inference_mode`. In such a case, they will fall - back to `torch.no_grad` by overriding this method. 
- """ - return torch.inference_mode(mode=True) - @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ @@ -105,28 +80,13 @@ class AscendPlatform(Platform): scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - import vllm.envs as envs - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError - else: - parallel_config.worker_cls = ( - "vllm.worker.multi_step_worker.MultiStepWorker" - ) + parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: - raise NotImplementedError - else: - parallel_config.worker_cls = ( - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - ) - parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" + parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + parallel_config.worker_cls = "vllm.worker.worker.Worker" cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: @@ -141,17 +101,40 @@ class AscendPlatform(Platform): "please check size by cmd(npu-smi info). " "For now, we will try default size(64GB) which might not be correct exactly." ) - max_device_memory_for_ms = str(total_device_memory * cache_config.gpu_memory_utilization) + 'GB' + max_device_memory_for_ms = str(total_device_memory * cache_config.gpu_memory_utilization) + "GB" ms.set_context(max_device_memory=max_device_memory_for_ms) logger.info("max_device_memory for mindspore is: ", max_device_memory_for_ms) @classmethod - def verify_quantization(cls, quant: str) -> None: - """ - Verify whether the quantization is supported by the current platform. - """ - if cls.supported_quantization and quant not in cls.supported_quantization: - raise ValueError( - f"{quant} quantization is currently not supported in " - f"{cls.device_name}." - ) + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): + """Get the attention backend class of a device.""" + if use_v1: + raise RuntimeError("vLLM-MindSpore do not support v1 egine now!") + if use_mla: + logger.info("Using MindSpore MLA backend.") + return "vllm_mindspore.attention.backends.ms_attn.MLABackend" + + if selected_backend == _Backend.FLASH_ATTN or selected_backend is None: + logger.info("Using MindSpore Attention backend.") + return "vllm_mindspore.attention.backends.ms_attn.MsAttentionBackend" + + raise ValueError( + "Invaild attention backend %s for vLLM-MindSpore with head_size: %s, dtype: %s, kv_cache_dtype: %s, block_size: %s." 
+ % (str(selected_backend), str(head_size), str(dtype), str(kv_cache_dtype), str(block_size)) + ) + + @classmethod + def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None) -> float: + """Return the memory usage in bytes.""" + return torch.cuda.max_memory_allocated(device) + + @classmethod + def get_device_communicator_cls(cls) -> str: + """Get device specific communicator class for distributed communication.""" + return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + """Get the total memory of a device in bytes.""" + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index fe9c83563..ca4a0d2ad 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -79,18 +79,14 @@ def direct_register_custom_op( @contextlib.contextmanager def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator["MemoryProfilingResult", None, None]: + baseline_snapshot: "MemorySnapshot", + weights_memory: int) -> "Generator[MemoryProfilingResult, None, None]": """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. + included in the weights_memory because PyTorch does not control it. The memory in one GPU can be classified into 3 categories: 1. memory used by anything other than the current vLLM instance. @@ -125,25 +121,28 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). - - (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`. - """ # noqa - torch.cuda.reset_peak_memory_stats() + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). 
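The accounting described above reduces to a simple sum once the deltas are measured. A worked example using the docstring's figures for (b.) and (c.) and an assumed 1 GiB of weights for (a.):

```python
GiB = 1024 ** 3

weights_memory      = 1 * GiB   # (a.) model weights -- assumed figure for illustration
torch_peak_increase = 2 * GiB   # (b.) peak activation tensors during profiling
non_torch_increase  = 1 * GiB   # (c.) non-torch components

non_kv_cache_memory = non_torch_increase + torch_peak_increase + weights_memory
print(non_kv_cache_memory / GiB)   # 4.0 GiB that cannot be given to the KV cache
```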
+ """ # noqa from vllm.utils import MemoryProfilingResult + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + result = MemoryProfilingResult() - result.baseline_memory_in_bytes = baseline_memory_in_bytes + result.before_create = baseline_snapshot # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes + result.weights_memory = weights_memory result.before_profile.measure() + before_torch_memory_in_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"] + yield result gc.collect() @@ -151,23 +150,14 @@ def memory_profiling( result.after_profile.measure() - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - - # For mindspore, the memory is allocated and free in memory pool, so cannot read the current used memory by `torch.cuda.mem_get_info`. - current_cuda_memory_bytes = result.after_profile.torch_memory_in_bytes - result.non_torch_increase_in_bytes = ( - current_cuda_memory_bytes - - baseline_memory_in_bytes - - weights_memory_in_bytes - - diff.torch_memory_in_bytes - ) # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = ( - result.non_torch_increase_in_bytes - + result.torch_peak_increase_in_bytes - + result.weights_memory_in_bytes - ) # noqa + after_torch_memory_in_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"] + + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = after_torch_memory_in_bytes - before_torch_memory_in_bytes + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa def _create_empty_tensor(ms_type): @@ -300,8 +290,12 @@ def is_mindformers_model_backend(): def check_ready(): + import vllm.envs as envs from mindspore import set_context + if envs.VLLM_USE_V1: + raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") + # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index a8a16ad3b..9c16ffde4 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -20,6 +20,8 @@ from typing import List from vllm.logger import init_logger +from vllm import envs +from vllm.platforms import current_platform from vllm_mindspore.utils import MsKVCache, get_valid_dtype, is_use_mla, get_dtype_size @@ -49,7 +51,6 @@ def ms_allocate_kv_cache( self.dtype = get_valid_dtype(self.dtype) - # TODO(tronzhang): A shape with (2, ...) for a kv tensor cannot support in mindspore's tensor and block operation, so split it to two tensor. for _ in range(self.num_attention_layers): device_type = "CPU" if device == "cpu" else "Ascend" current_cache = [] diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index b8203a8b6..843902b88 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -128,7 +128,6 @@ def profile_run(self) -> None: # multiplying the list, to avoid Dynamo from treating them as # tensor aliasing. - # TODO(tronzhang): MindSpore's tensor view is limit now, delete this whole funtion patching latter. 
kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 684ee7f52..fe52fdb10 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -113,15 +113,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # of the model. _, total_gpu_memory = torch.cuda.mem_get_info() with memory_profiling( - baseline_memory_in_bytes=total_gpu_memory - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner.model_memory_usage, + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage, ) as result: self.model_runner.profile_run() torch.cuda.synchronize() self._assert_memory_footprint_increased_during_profiling() - memory_use_for_model_run = result.non_kv_cache_memory_in_bytes + memory_use_for_model_run = result.non_kv_cache_memory memory_for_current_instance = ( total_gpu_memory * self.cache_config.gpu_memory_utilization @@ -163,11 +163,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"({self.cache_config.gpu_memory_utilization:.2f})" f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" " the rest of the memory reserved for KV Cache is " f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB." ) @@ -175,8 +175,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: logger.info(msg) # Final cleanup - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() gc.collect() return num_gpu_blocks, num_cpu_blocks -- Gitee From 5ee58010f62061eb1f97a6859e2f3c088b0e01f8 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Mon, 17 Mar 2025 15:53:57 +0800 Subject: [PATCH 24/82] fix sampler.py reset gather --- vllm_mindspore/model_executor/layers/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 1b91157fd..575d4ba80 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -425,7 +425,7 @@ def _apply_top_k_top_p( # Apply top-k. top_k_mask = logits_sort.size(1) - k.to(torch.long) # Get all the top_k values. 
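The change that follows replaces `logits_sort.gather(top_k_mask, 0)` with a dim-1 gather on an unsqueezed index: `gather` needs an index tensor of the same rank as the source, and the top-k threshold must be picked per row. A standalone sketch with made-up values, using the plain torch API as the surrounding code does:

```python
import torch

# Rows are already sorted ascending, as in _apply_top_k_top_p.
logits_sort = torch.tensor([[0.1, 0.5, 0.9, 1.3],
                            [0.2, 0.5, 1.0, 2.0]])
k = torch.tensor([2, 3])                                        # per-row top-k

row_index = logits_sort.size(1) - k.to(torch.long)              # index of the k-th largest value
threshold = logits_sort.gather(1, row_index.unsqueeze(dim=1))   # shape (batch, 1)
masked = logits_sort.masked_fill(logits_sort < threshold, float("-inf"))

print(masked)
# row 0 keeps its top 2 values, row 1 keeps its top 3
```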
- top_k_mask = logits_sort.gather(top_k_mask, 0) + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) top_k_mask = logits_sort < top_k_mask logits_sort.masked_fill_(top_k_mask, -float("inf")) -- Gitee From b4faffab1621a17e6fdf5667866ccb5ce8c6ff8c Mon Sep 17 00:00:00 2001 From: tronzhang Date: Wed, 19 Mar 2025 16:32:01 +0800 Subject: [PATCH 25/82] =?UTF-8?q?support=20chunk=20prefill=20and=20prefix?= =?UTF-8?q?=20caching=EF=BC=8Csupport=20q=5Fseq=5Flen,=20position=20ids,?= =?UTF-8?q?=20mask=20input?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/attention/backends/ms_attn.py | 2 + .../models/mf_models/attention_mask.py | 53 +++++++++++++++++++ .../models/mf_models/deepseek_v3.py | 50 ++++++++++++----- .../model_executor/models/mf_models/qwen2.py | 52 ++++++++++++------ vllm_mindspore/utils.py | 2 + 5 files changed, 131 insertions(+), 28 deletions(-) create mode 100644 vllm_mindspore/model_executor/models/mf_models/attention_mask.py diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index c99fc91c1..bc40ff1dc 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -69,6 +69,7 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None prefill_block_tables: Optional[torch.Tensor] = None + query_lens: Optional[List[int]] = None, # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation @@ -338,6 +339,7 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): num_decode_tokens=num_decode_tokens, multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=False, + query_lens=query_lens, ) diff --git a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py b/vllm_mindspore/model_executor/models/mf_models/attention_mask.py new file mode 100644 index 000000000..10fcd25ec --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/attention_mask.py @@ -0,0 +1,53 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +infer attention mask. +""" +import numpy as np + +import mindspore as ms +from mindspore import Tensor, JitConfig, Model + + +class LowerTriangularMask: + r""" + Provide Infer model attention mask. + Args: + mf_model_config (MF Config): The config of Infer model. 
+ + """ + + def __init__(self, mf_model_config): + compute_dtype = mf_model_config.compute_dtype + seq_length = mf_model_config.seq_length + self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1), dtype=compute_dtype) + + self.decode_mask = Tensor(np.triu(np.ones(shape=(seq_length, seq_length), dtype=np.int8), k=1), + dtype=compute_dtype) + + self.hard_mask = Tensor([0], dtype=compute_dtype).reshape(1, 1) + + self.gather = ms.ops.Gather() + + def gen_attention_mask(self, is_prefill, position_ids, query_lens): + if is_prefill: + attention_mask = self.prefill_mask + else: + if max(query_lens) > 1: + attention_mask = self.gather(self.decode_mask, position_ids, 0) + else: + attention_mask = self.hard_mask + return attention_mask diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 97b1e1a16..06beef143 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -53,9 +53,9 @@ from vllm_mindspore.utils import calc_block_num import mindspore as ms from mindspore import Tensor, JitConfig, Model -from mindspore.communication.comm_func import barrier from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism +from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -119,6 +119,7 @@ class DeepseekV3ForCausalLM(MsModelBase): self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config) @@ -158,7 +159,6 @@ class DeepseekV3ForCausalLM(MsModelBase): jit_level="O0", infer_boost="on" ).jit_config_dict self.mf_kvcaches_init = False - self.logits = None self.sampler = get_sampler() self.set_modules({"model": self.network}) @@ -171,6 +171,9 @@ class DeepseekV3ForCausalLM(MsModelBase): for i in range(self.mf_model_config.num_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.set_flags = False + def update_mf_kvcaches(self): if self.mf_kvcaches_init: return @@ -196,39 +199,60 @@ class DeepseekV3ForCausalLM(MsModelBase): ) -> Union[Tensor, IntermediateTensors]: self.update_mf_kvcaches() - is_prefill = True if attn_metadata.prefill_metadata else False + query_lens = attn_metadata.query_lens + kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False - self.logits = None + q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) model_inputs = {} model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.ops.expand_dims( - attn_metadata.seq_lens_tensor, 0 - ) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( + attn_metadata.seq_lens_tensor.asnumpy(), 0)) model_inputs["block_tables"] = _pad_block_table( attn_metadata.block_tables, self.mf_model_config.seq_length, 
self.mf_model_config.block_size, ) model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask if is_prefill: self.network.phase = "prefill" - self.network.add_flags_custom(is_first_iteration=True) - self.logits = self.network(**model_inputs) + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) self.network.phase = "increment" - self.network.add_flags_custom(is_first_iteration=False) + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True else: - self.logits = self.network(**model_inputs) + hidden_states = self.network(**model_inputs) - return None + return hidden_states def compute_logits( self, hidden_states: Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: - return self.logits + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) + + return logits def sample( self, diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 81afbb0d7..5ebcd6dcc 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -40,7 +40,6 @@ from mindformers.core.context import build_context from mindformers.core.parallel_config import build_parallel_config from mindformers.models.llama import LlamaConfig as LlamaConfig_MF -from mindformers.trainer import BaseTrainer from research.qwen2_5.infer.qwen2_5 import ( ParallelQwenForCausalLM as ParallelQwenForCausalLM_MF, ) @@ -52,7 +51,7 @@ from vllm_mindspore.utils import calc_block_num import mindspore as ms from mindspore import Tensor, JitConfig, Model from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism - +from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -115,6 +114,7 @@ class Qwen2ForCausalLM(MsModelBase): self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True # qwen qkv concat will support in next version self.mf_model_config.qkv_concat = False @@ -128,7 +128,6 @@ class Qwen2ForCausalLM(MsModelBase): self.mf_config.load_checkpoint = self.get_model_path() self.mf_kvcaches_init = False - self.logits = None self.sampler = get_sampler() self.set_modules({"model": self.network}) @@ -141,6 +140,9 @@ class Qwen2ForCausalLM(MsModelBase): for i in range(self.mf_model_config.num_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.set_flags = False + def update_mf_kvcaches(self): if self.mf_kvcaches_init: return @@ -169,39 +171,59 @@ class Qwen2ForCausalLM(MsModelBase): ) -> Union[Tensor, IntermediateTensors]: self.update_mf_kvcaches() - is_prefill = True if attn_metadata.prefill_metadata else 
False - - self.logits = None + query_lens = attn_metadata.query_lens + kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False + q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) model_inputs = {} model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.ops.expand_dims( - attn_metadata.seq_lens_tensor, 0 - ) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( + attn_metadata.seq_lens_tensor.asnumpy(), 0)) model_inputs["block_tables"] = _pad_block_table( attn_metadata.block_tables, self.mf_model_config.seq_length, self.mf_model_config.block_size, ) model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask if is_prefill: self.network.phase = "prefill" - self.network.add_flags_custom(is_first_iteration=True) - self.logits = self.network(**model_inputs) + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) self.network.phase = "increment" - self.network.add_flags_custom(is_first_iteration=False) + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True else: - self.logits = self.network(**model_inputs) + hidden_states = self.network(**model_inputs) - return None + return hidden_states def compute_logits( self, hidden_states: Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: - return self.logits + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) + + return logits def sample( self, diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index ca4a0d2ad..d37fa39d1 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -42,6 +42,7 @@ from vllm.utils import T, TORCH_DTYPE_TO_NUMPY_DTYPE, make_ndarray_with_pad import mindspore as ms from mindspore.common.initializer import Zero from mindspore import dtype as mstype +from mindspore.common.api import _pynative_executor from .scripts import env_setup @@ -317,6 +318,7 @@ def check_ready(): env_setup(mindformers_default_env) set_context(mode=0, device_target="Ascend", max_call_depth=10000) + _pynative_executor.set_async_for_graph(True) else: env_setup({"MS_ALLOC_CONF": "enable_vmm:True", }) logger.info("Run with native model backend!") -- Gitee From de0091b834e05b1e8ee85d4664f311da9834bddf Mon Sep 17 00:00:00 2001 From: tronzhang Date: Wed, 19 Mar 2025 22:40:16 +0800 Subject: [PATCH 26/82] avoid cpu communicator op limit --- vllm_mindspore/__init__.py | 2 + vllm_mindspore/distributed/parallel_state.py | 71 +++++++++++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index b9849cb46..2cf18b6f6 100644 --- a/vllm_mindspore/__init__.py +++ 
b/vllm_mindspore/__init__.py @@ -147,12 +147,14 @@ vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run from vllm_mindspore.distributed.parallel_state import ( all_reduce_for_GroupCoordinator, init_model_parallel_group, + init_group_coordinator, ) vllm.distributed.parallel_state.GroupCoordinator.all_reduce = ( all_reduce_for_GroupCoordinator ) vllm.distributed.parallel_state.init_model_parallel_group = init_model_parallel_group +vllm.distributed.parallel_state.GroupCoordinator.__init__ = init_group_coordinator from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index 58cf43489..d599660cd 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -17,12 +17,14 @@ # ============================================================================ import pickle -from typing import List, Optional, Any +from typing import List, Optional, Any, Union import numpy as np import torch import torch.distributed +from torch.distributed import Backend + def init_model_parallel_group( group_ranks: List[List[int]], @@ -64,3 +66,70 @@ def all_reduce_for_GroupCoordinator(self, input_: torch.Tensor) -> torch.Tensor: torch.distributed.all_reduce(input_, group=self.device_group) return input_ + +def init_group_coordinator( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + use_device_communicator: bool, + use_message_queue_broadcaster: bool = False, + group_name: Optional[str] = None, +): + from vllm.distributed.parallel_state import _get_unique_name, _register_group + from vllm.utils import resolve_obj_by_qualname + + group_name = group_name or "anonymous" + self.unique_name = _get_unique_name(group_name) + _register_group(self) + + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + self.device_group = None + self.cpu_group = None + + for ranks in group_ranks: + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend) + # CPU not ready now, use device to communication now. 
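The coordinator patch above builds the nominal CPU group on the device backend (`hccl`) because the CPU backend cannot serve all required collectives yet. Below is a minimal single-process sketch of creating a sub-group on an explicit backend; it uses `gloo` so it runs anywhere, and the master address and port values are assumptions for a local run.

```python
import os

import torch.distributed as dist


def demo_group_with_explicit_backend():
    """Single-process sketch: build a sub-group on a chosen backend."""
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # assumed local setup
    os.environ.setdefault("MASTER_PORT", "29500")
    if not dist.is_initialized():
        dist.init_process_group(backend="gloo", rank=0, world_size=1)

    # Passing backend= here is the same call shape used to force a group
    # onto a particular backend (e.g. "hccl" on Ascend).
    group = dist.new_group(ranks=[0], backend="gloo")
    print("sub-group backend:", dist.get_backend(group))
    dist.destroy_process_group()


if __name__ == "__main__":
    demo_group_with_explicit_backend()
```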
+ cpu_group = torch.distributed.new_group(ranks, backend="hccl") + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self.device_group = device_group + self.cpu_group = cpu_group + + assert self.cpu_group is not None + assert self.device_group is not None + + from vllm.platforms import current_platform + + # TODO: fix it for other platforms + if current_platform.is_cuda_alike(): + self.device = torch.device(f"cuda:{local_rank}") + else: + self.device = torch.device("cpu") + + self.use_device_communicator = use_device_communicator + + self.device_communicator: DeviceCommunicatorBase = None # type: ignore + if use_device_communicator and self.world_size > 1: + device_comm_cls = resolve_obj_by_qualname( + current_platform.get_device_communicator_cls()) + self.device_communicator = device_comm_cls( + cpu_group=self.cpu_group, + device=self.device, + device_group=self.device_group, + unique_name=self.unique_name, + ) + + from vllm.distributed.device_communicators.shm_broadcast import ( + MessageQueue) + self.mq_broadcaster: Optional[MessageQueue] = None + if use_message_queue_broadcaster and self.world_size > 1: + self.mq_broadcaster = MessageQueue.create_from_process_group( + self.cpu_group, 1 << 22, 6) + + from vllm.platforms import current_platform + self.use_custom_op_call = current_platform.is_cuda_alike() -- Gitee From 6470c10d02fd95ef49270ee88f04a9ada2597c4d Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Mon, 24 Mar 2025 23:25:16 +0800 Subject: [PATCH 27/82] add mf_model_base --- .../models/mf_models/deepseek_v3.py | 147 +------------ .../models/mf_models/mf_model_base.py | 199 ++++++++++++++++++ .../model_executor/models/mf_models/qwen2.py | 167 +-------------- 3 files changed, 217 insertions(+), 296 deletions(-) create mode 100644 vllm_mindspore/model_executor/models/mf_models/mf_model_base.py diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 06beef143..41dad5658 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -17,28 +17,17 @@ # ============================================================================ import os -import torch -from typing import Iterable, List, Optional, Set, Tuple, Union -from pathlib import Path +from typing import Iterable, Set, Tuple import numpy as np -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.config import CacheConfig, get_current_vllm_config -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.forward_context import ForwardContext, get_forward_context -from vllm.sequence import IntermediateTensors -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.attention.backends.abstract import AttentionType +from vllm.config import get_current_vllm_config +from vllm.forward_context import get_forward_context from vllm.logger import init_logger +from mindspore import Tensor, JitConfig, Model -from mindformers.tools.register.config import MindFormerConfig - -from mindformers.core.context import build_context -from mindformers.core.parallel_config import build_parallel_config from mindformers.trainer.utils import transform_and_load_checkpoint from research.deepseek3.deepseek3_config import ( DeepseekV3Config as DeepseekV3Config_MF, @@ -48,12 +37,10 @@ 
from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, \ + _pad_block_table, Fake_Attention from vllm_mindspore.utils import calc_block_num -import mindspore as ms -from mindspore import Tensor, JitConfig, Model - from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -61,57 +48,12 @@ from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerT logger = init_logger(__name__) -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) - - -def _pad_block_table(block_tables, seq_length, block_size): - # When prefill, the block_tables is a empty tensor. - if len(block_tables.shape) < 2: - fake_block_tables = ms.mint.empty( - 2, seq_length // block_size, dtype=ms.int32, device="Ascend" - ) - return fake_block_tables - - block_tables_list = block_tables.tolist() - padded_block_tables = [ - _pad_to_max(block_table, seq_length // block_size) - for block_table in block_tables_list - ] - - return Tensor(np.array(padded_block_tables).astype(np.int32)) - - -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) - - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - -class Fake_Attention: - def __init__(self): - self.kv_cache = [ - torch.tensor([]) for _ in range(get_current_vllm_config( - ).parallel_config.pipeline_parallel_size) - ] - self.attn_type = AttentionType.DECODER - -class DeepseekV3ForCausalLM(MsModelBase): +class DeepseekV3ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(DeepseekV3ForCausalLM, self).__init__( vllm_config=vllm_config, prefix=prefix ) - self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) - build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config - ) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size() - ) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) @@ -188,79 +130,8 @@ class DeepseekV3ForCausalLM(MsModelBase): ) self.mf_kvcaches_init = True - def forward( - self, - input_ids: Tensor, - positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches() - - query_lens = attn_metadata.query_lens - kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens - if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: - is_prefill = True - else: - is_prefill = False - - q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) - position_ids = ms.Tensor(positions, dtype=ms.int32) - attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - 
model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( - attn_metadata.seq_lens_tensor.asnumpy(), 0)) - model_inputs["block_tables"] = _pad_block_table( - attn_metadata.block_tables, - self.mf_model_config.seq_length, - self.mf_model_config.block_size, - ) - model_inputs["slot_mapping"] = attn_metadata.slot_mapping - model_inputs["position_ids"] = position_ids - model_inputs["q_seq_lens"] = q_seq_lens - model_inputs["attention_mask"] = attention_mask - - if is_prefill: - self.network.phase = "prefill" - if not self.set_flags: - self.network.add_flags_custom(is_first_iteration=True) - hidden_states = self.network(**model_inputs) - self.network.phase = "increment" - if not self.set_flags: - self.network.add_flags_custom(is_first_iteration=False) - self.set_flags = True - else: - hidden_states = self.network(**model_inputs) - - return hidden_states - - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - selected_token_indices = sampling_metadata.selected_token_indices - if selected_token_indices is not None and selected_token_indices.numel() <= 0: - logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) - else: - hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) - - return logits - - def sample( - self, - logits: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + def pad_block_table(self, block_tables, seq_length, block_size): + return _pad_block_table(block_tables, seq_length, block_size, 2) def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: if self.mf_config.load_ckpt_format == "ckpt": diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py new file mode 100644 index 000000000..5084544dd --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import os +from typing import Iterable, List, Optional, Set, Tuple, Union +import numpy as np + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.config import get_current_vllm_config +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.sequence import IntermediateTensors +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.attention.backends.abstract import AttentionType +from vllm.logger import init_logger + +import torch +import mindspore as ms +from mindspore import Tensor + +from mindformers.tools.register.config import MindFormerConfig +from mindformers.core.context import build_context +from mindformers.core.parallel_config import build_parallel_config + +from vllm_mindspore.model_executor.models.model_base import MsModelBase + +logger = init_logger(__name__) + + +def _pad_to_max(x, max_len): + return x + [-1] * (max_len - len(x)) + + +def _pad_block_table(block_tables, seq_length, block_size, pad_size): + # When prefill, the block_tables is a empty tensor. + if len(block_tables.shape) < 2: + fake_block_tables = ms.mint.empty( + pad_size, seq_length // block_size, dtype=ms.int32, device="Ascend" + ) + return fake_block_tables + + block_tables_list = block_tables.tolist() + padded_block_tables = [ + _pad_to_max(block_table, seq_length // block_size) + for block_table in block_tables_list + ] + + return Tensor(np.array(padded_block_tables).astype(np.int32)) + + +def _batch_seq(input_tokens, prefill): + if prefill: + return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) + + return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) + + +class Fake_Attention: + def __init__(self): + self.kv_cache = [ + torch.tensor([]) for _ in range(get_current_vllm_config( + ).parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER + + +class MfModelBase(MsModelBase): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super(MfModelBase, self).__init__( + vllm_config=vllm_config, prefix=prefix + ) + + self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) + build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) + build_parallel_config(self.mf_config) + self.mf_config.model.model_config.parallel_config = ( + self.mf_config.parallel_config + ) + self.mf_config.model.model_config.parallel_config.model_parallel = ( + get_tensor_model_parallel_world_size() + ) + self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + + + def update_mf_kvcaches(self): + if self.mf_kvcaches_init: + return + + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] + mf_k_cache, mf_v_cache = self.network.kvcache(i) + mf_k_cache.set_device_address( + k_cache._data_ptr(), k_cache.shape, k_cache.dtype + ) + mf_v_cache.set_device_address( + v_cache._data_ptr(), v_cache.shape, v_cache.dtype + ) + self.mf_kvcaches_init = True + + + def pad_block_table(self, block_tables, seq_length, block_size): + raise NotImplementedError("pad_block_table not implemented.") + + + def forward( + self, + input_ids: Tensor, + positions: Tensor, + kv_caches: 
List[Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[Tensor] = None, + ) -> Union[Tensor, IntermediateTensors]: + self.update_mf_kvcaches() + + query_lens = attn_metadata.query_lens + kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False + + q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( + attn_metadata.seq_lens_tensor.asnumpy(), 0)) + model_inputs["block_tables"] = self.pad_block_table( + attn_metadata.block_tables, + self.mf_model_config.seq_length, + self.mf_model_config.block_size, + ) + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + + if is_prefill: + self.network.phase = "prefill" + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) + self.network.phase = "increment" + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True + else: + hidden_states = self.network(**model_inputs) + + return hidden_states + + def compute_logits( + self, + hidden_states: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[Tensor]: + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) + + return logits + + def sample( + self, + logits: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: + raise NotImplementedError("load_weight not implemented.") diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 5ebcd6dcc..2de4090f1 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -16,28 +16,13 @@ # limitations under the License. 
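The `compute_logits` moved into `MfModelBase` above projects only the rows named by `selected_token_indices`. A minimal torch sketch of that gather-then-project step, with toy sizes and a plain `nn.Linear` standing in for the model's `lm_head`:

```python
import torch
from torch import nn

hidden_size, vocab_size = 8, 32
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)  # stand-in for network.lm_head

# Flattened hidden states for a batch of sequences: (total_tokens, hidden_size).
hidden_states = torch.randn(10, hidden_size)
# Typically the index of the last token of each sequence in the flattened batch.
selected_token_indices = torch.tensor([3, 7, 9])

if selected_token_indices.numel() == 0:
    # Nothing scheduled to sample: return an empty logits tensor.
    logits = torch.zeros(0, vocab_size)
else:
    selected = hidden_states.index_select(0, selected_token_indices)
    logits = lm_head(selected).reshape(-1, vocab_size)

print(logits.shape)  # torch.Size([3, 32])
```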
# ============================================================================ -import os -import torch -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Set, Tuple -import numpy as np - -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.config import CacheConfig, get_current_vllm_config -from vllm.forward_context import ForwardContext, get_forward_context -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors -from vllm.attention.backends.abstract import AttentionType -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.config import get_current_vllm_config from vllm.logger import init_logger - -from mindformers.tools.register.config import MindFormerConfig - -from mindformers.core.context import build_context -from mindformers.core.parallel_config import build_parallel_config +from mindspore import Tensor, JitConfig from mindformers.models.llama import LlamaConfig as LlamaConfig_MF from research.qwen2_5.infer.qwen2_5 import ( @@ -45,67 +30,20 @@ from research.qwen2_5.infer.qwen2_5 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, \ + _pad_block_table, Fake_Attention from vllm_mindspore.utils import calc_block_num - -import mindspore as ms -from mindspore import Tensor, JitConfig, Model from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask -logger = init_logger(__name__) - - -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) - - -def _pad_block_table(block_tables, seq_length, block_size): - # When Prefill, the block_tables is a empty tensor. 
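These per-model `_pad_block_table` helpers (now consolidated in `mf_model_base.py`) share one padding idiom; a small standalone NumPy sketch of it, assuming block tables are plain lists of block ids:

```python
import numpy as np


def pad_block_tables(block_tables, seq_length, block_size, pad_value=-1):
    """Toy version of the padding idiom: every row is padded to the maximum
    number of blocks a sequence can own (seq_length // block_size)."""
    max_blocks = seq_length // block_size
    padded = [row + [pad_value] * (max_blocks - len(row)) for row in block_tables]
    return np.asarray(padded, dtype=np.int32)


# Two sequences holding 2 and 3 KV-cache blocks, padded to 4 columns.
print(pad_block_tables([[0, 1], [2, 3, 4]], seq_length=64, block_size=16))
```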
- if len(block_tables.shape) < 2: - fake_block_tables = ms.mint.empty( - 1, seq_length // block_size, dtype=ms.int32, device="Ascend" - ) - return fake_block_tables - - block_tables_list = block_tables.tolist() - padded_block_tables = [ - _pad_to_max(block_table, seq_length // block_size) - for block_table in block_tables_list - ] - return Tensor(np.array(padded_block_tables).astype(np.int32)) - - -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) - - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) +logger = init_logger(__name__) -class Fake_Attention: - def __init__(self): - self.kv_cache = [ - torch.tensor([]) for _ in range(get_current_vllm_config( - ).parallel_config.pipeline_parallel_size) - ] - self.attn_type = AttentionType.DECODER -class Qwen2ForCausalLM(MsModelBase): +class Qwen2ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) - self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) - build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config - ) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size() - ) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) # Cannot get num_gpu_blocks from cache config now, calculate one first. self.mf_model_config.num_blocks = calc_block_num( @@ -143,95 +81,8 @@ class Qwen2ForCausalLM(MsModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False - def update_mf_kvcaches(self): - if self.mf_kvcaches_init: - return - - forward_context = get_forward_context() - for i in range(self.mf_model_config.num_layers): - k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] - v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] - mf_k_cache, mf_v_cache = self.network.kvcache(i) - mf_k_cache.set_device_address( - k_cache._data_ptr(), k_cache.shape, k_cache.dtype - ) - mf_v_cache.set_device_address( - v_cache._data_ptr(), v_cache.shape, v_cache.dtype - ) - self.mf_kvcaches_init = True - - def forward( - self, - input_ids: Tensor, - positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches() - - query_lens = attn_metadata.query_lens - kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens - if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: - is_prefill = True - else: - is_prefill = False - q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) - position_ids = ms.Tensor(positions, dtype=ms.int32) - attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( - attn_metadata.seq_lens_tensor.asnumpy(), 0)) - model_inputs["block_tables"] = _pad_block_table( - attn_metadata.block_tables, - self.mf_model_config.seq_length, - self.mf_model_config.block_size, - ) - 
model_inputs["slot_mapping"] = attn_metadata.slot_mapping - model_inputs["position_ids"] = position_ids - model_inputs["q_seq_lens"] = q_seq_lens - model_inputs["attention_mask"] = attention_mask - - if is_prefill: - self.network.phase = "prefill" - if not self.set_flags: - self.network.add_flags_custom(is_first_iteration=True) - hidden_states = self.network(**model_inputs) - self.network.phase = "increment" - if not self.set_flags: - self.network.add_flags_custom(is_first_iteration=False) - self.set_flags = True - else: - hidden_states = self.network(**model_inputs) - - return hidden_states - - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - selected_token_indices = sampling_metadata.selected_token_indices - if selected_token_indices is not None and selected_token_indices.numel() <= 0: - logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) - else: - hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) - - return logits - - def sample( - self, - logits: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + def pad_block_table(self, block_tables, seq_length, block_size): + return _pad_block_table(block_tables, seq_length, block_size, 1) def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: model_parallelism = Qwen2InferParallelism(self.mf_config, self.network, False) -- Gitee From 91b55ba66f57d63c0d8814beab9dbcebcbc31ca8 Mon Sep 17 00:00:00 2001 From: Erpim Date: Mon, 17 Mar 2025 23:14:43 +0800 Subject: [PATCH 28/82] skip max_num_batched_tokens < max_model_len check --- vllm_mindspore/__init__.py | 3 ++- vllm_mindspore/config.py | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index b9849cb46..5fb37d4f4 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -181,11 +181,12 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import get_head_size, _verify_quantization, get_num_kv_heads +from .config import get_head_size, _verify_quantization, get_num_kv_heads, _verify_args vllm.config.ModelConfig.get_head_size = get_head_size vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.ModelConfig.get_num_kv_heads = get_num_kv_heads +vllm.config.SchedulerConfig._verify_args = _verify_args from .utils import check_ready diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 02f79d41f..848bef2e9 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -16,8 +16,10 @@ # limitations under the License. 
# ============================================================================ +from vllm.logger import init_logger from vllm_mindspore.utils import is_mindformers_model_backend, is_use_mla +logger = init_logger(__name__) def get_head_size(self) -> int: if hasattr(self.hf_text_config, "model_type") and ( @@ -54,3 +56,56 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: total_num_kv_heads = self.get_total_num_kv_heads() return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + + +def _verify_args(self) -> None: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): + logger.warning( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({self.max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. Please increase max_num_batched_tokens or " + "decrease max_model_len.") + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + if self.max_num_partial_prefills < 1: + raise ValueError( + f"max_num_partial_prefills ({self.max_num_partial_prefills}) " + "must be greater than or equal to 1.") + elif self.max_num_partial_prefills > 1: + if not self.chunked_prefill_enabled: + raise ValueError("Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1.") + + if self.long_prefill_token_threshold > self.max_model_len: + raise ValueError( + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({self.max_model_len}).") + + if (self.max_long_partial_prefills + < 1) or (self.max_long_partial_prefills + > self.max_num_partial_prefills): + raise ValueError( + f"max_long_partial_prefills ({self.max_long_partial_prefills}) " + "must be greater than or equal to 1 and less than or equal to " + f"max_num_partial_prefills ({self.max_num_partial_prefills}).") -- Gitee From ae87907ad5d2cacca508bb25b5c4e3d9a80b36c9 Mon Sep 17 00:00:00 2001 From: liu lili Date: Fri, 21 Mar 2025 20:39:34 +0800 Subject: [PATCH 29/82] lll: delete parameter_dict to reduce host memory --- .../mf_models/deepseekv3_infer_parallelism.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py index 6a713f836..69d0664a9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py @@ -19,6 +19,7 @@ transform huggingface model to mindspore safetensor. 
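The relaxed `_verify_args` above demotes the `max_num_batched_tokens < max_model_len` case to a warning when chunked prefill is disabled. A generic sketch of that warn-instead-of-raise pattern, using hypothetical arguments and the standard `logging` module:

```python
import logging

logger = logging.getLogger("scheduler_config_check")


def verify_batching_limits(max_num_batched_tokens, max_model_len,
                           max_num_seqs, chunked_prefill_enabled=False):
    """Hypothetical stand-alone check mirroring the warn-instead-of-raise idea."""
    if max_num_batched_tokens < max_model_len and not chunked_prefill_enabled:
        # Previously fatal; now only a warning, since the effective limit is
        # simply max_num_batched_tokens.
        logger.warning(
            "max_num_batched_tokens (%d) is smaller than max_model_len (%d); "
            "longer sequences will be rejected.",
            max_num_batched_tokens, max_model_len)
    if max_num_batched_tokens < max_num_seqs:
        raise ValueError(
            f"max_num_batched_tokens ({max_num_batched_tokens}) must be >= "
            f"max_num_seqs ({max_num_seqs}).")


logging.basicConfig(level=logging.WARNING)
verify_batching_limits(4096, 8192, max_num_seqs=256)  # warns, does not raise
```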
import os import time import json +import gc import numpy as np import mindspore as ms @@ -224,6 +225,9 @@ class DeepseekInferParallelism(BaseModelParallelism): name=w2_scale_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" @@ -296,6 +300,9 @@ class DeepseekInferParallelism(BaseModelParallelism): name=w2_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_quant_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" @@ -378,6 +385,9 @@ class DeepseekInferParallelism(BaseModelParallelism): name=w2_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): """convert weight not in model""" @@ -405,6 +415,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def quant_special_attention_weight(self, layer_id, src_hf_dir, hf_weight_map, name, is_trans_rope_weigh=False, is_split_param=False): @@ -479,6 +492,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[dequant_scale_ms_name] = ms.Parameter(ms.Tensor(dequant_scale_ms_param, ms.float32), name=dequant_scale_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_quant_bias_weight(self, src_hf_dir, layer_id, hf_weight_map): # quant_op.beta @@ -503,6 +519,9 @@ class DeepseekInferParallelism(BaseModelParallelism): name=l2q_proj_bias_ms_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process attention weight""" @@ -598,6 +617,9 @@ class DeepseekInferParallelism(BaseModelParallelism): self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant net convert layer weight""" @@ -705,6 +727,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_stack_param, ms.bfloat16), name=w2_ms_name, requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" @@ -735,6 +760,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + 
del parameter_dict + gc.collect() def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" @@ -771,6 +799,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" @@ -857,6 +888,9 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" @@ -880,6 +914,9 @@ class DeepseekInferParallelism(BaseModelParallelism): requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + del parameter_dict + gc.collect() def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" -- Gitee From d46bac3e793a41e60699e14bf6fbbbc2209a25eb Mon Sep 17 00:00:00 2001 From: zhaizhiqiang Date: Wed, 26 Mar 2025 01:24:44 +0800 Subject: [PATCH 30/82] support jit --- vllm_mindspore/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index d37fa39d1..8505d4eec 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -316,9 +316,6 @@ def check_ready(): "MS_ALLOC_CONF": "enable_vmm:False", } env_setup(mindformers_default_env) - - set_context(mode=0, device_target="Ascend", max_call_depth=10000) - _pynative_executor.set_async_for_graph(True) else: env_setup({"MS_ALLOC_CONF": "enable_vmm:True", }) logger.info("Run with native model backend!") -- Gitee From 888cf2a932a799af0df842b8e70d7b528512df05 Mon Sep 17 00:00:00 2001 From: twc Date: Tue, 25 Mar 2025 18:23:16 +0800 Subject: [PATCH 31/82] =?UTF-8?q?support=20cp=EF=BC=8Cpc=EF=BC=8Cmss=20at?= =?UTF-8?q?=20the=20same=20time?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/__init__.py | 5 +- vllm_mindspore/attention/backends/ms_attn.py | 1 + vllm_mindspore/config.py | 107 +++++++++++++----- .../models/mf_models/mf_model_base.py | 20 +++- .../model_executor/models/model_base.py | 5 + 5 files changed, 102 insertions(+), 36 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index c178ecda8..0e37cb4a4 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -183,11 +183,10 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import get_head_size, _verify_quantization, get_num_kv_heads, _verify_args +from .config import _verify_quantization, _verify_args, vllm_config_post_init -vllm.config.ModelConfig.get_head_size = get_head_size vllm.config.ModelConfig._verify_quantization = _verify_quantization -vllm.config.ModelConfig.get_num_kv_heads = get_num_kv_heads +vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args from .utils import check_ready diff --git a/vllm_mindspore/attention/backends/ms_attn.py 
b/vllm_mindspore/attention/backends/ms_attn.py index bc40ff1dc..1c082a49d 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -340,6 +340,7 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=False, query_lens=query_lens, + max_query_len=max_query_len ) diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 848bef2e9..5fd633977 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -15,47 +15,98 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +import torch +import vllm.envs as envs + +from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger +from vllm.utils import random_uuid from vllm.logger import init_logger -from vllm_mindspore.utils import is_mindformers_model_backend, is_use_mla logger = init_logger(__name__) -def get_head_size(self) -> int: - if hasattr(self.hf_text_config, "model_type") and ( - self.hf_text_config.model_type in ("deepseek_v2", "deepseek_v3") - ): - - if is_mindformers_model_backend(): - qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0) - return self.hf_text_config.kv_lora_rank + qk_rope_head_dim - - # FlashAttention supports only head_size 32, 64, 128, 256, - # we need to pad head_size 192 to 256 - return 256 - - if self.is_attention_free: - return 0 - - if hasattr(self.hf_text_config, "head_dim"): - return self.hf_text_config.head_dim - # FIXME(woosuk): This may not be true for all models. - return self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads - def _verify_quantization(self) -> None: # Donnot verify now. return -def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: - """Returns the number of KV heads per Device.""" +def vllm_config_post_init(self): + """Verify configs are valid & consistent with each other.""" + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) + + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) + + if self.lora_config: + self.lora_config.verify_with_cache_config(self.cache_config) + self.lora_config.verify_with_model_config(self.model_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) + if self.prompt_adapter_config: + self.prompt_adapter_config.verify_with_model_config( + self.model_config) + + if self.quant_config is None and \ + self.model_config is not None and self.load_config is not None: + self.quant_config = VllmConfig._get_quantization_config( + self.model_config, self.load_config) + + from vllm.platforms import current_platform + if self.scheduler_config is not None and \ + self.model_config is not None and \ + self.scheduler_config.chunked_prefill_enabled and \ + self.model_config.dtype == torch.float32 and \ + current_platform.get_device_capability() == (7, 5): + logger.warning_once( + "Turing devices tensor cores do not support float32 matmul. 
" + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels.") + + if self.compilation_config is None: + self.compilation_config = CompilationConfig() + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + self.compilation_config.custom_ops = ["none"] + self.compilation_config.use_cudagraph = True + self.compilation_config.use_inductor = True + self.compilation_config.cudagraph_num_of_warmups = 1 + self.compilation_config.pass_config.enable_fusion = False + self.compilation_config.pass_config.enable_reshape = False + self.compilation_config.level = CompilationLevel.PIECEWISE + + self._set_cudagraph_sizes() + + if self.cache_config is not None and \ + self.cache_config.cpu_offload_gb > 0 and \ + self.compilation_config.level != CompilationLevel.NO_COMPILATION: + logger.warning( + "CPU offload is not supported with `torch.compile` yet." + " Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + if self.lora_config is not None and self.compilation_config.level !=\ + CompilationLevel.NO_COMPILATION: + logger.warning("LoRA is not supported with `torch.compile` yet. " + "Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + current_platform.check_and_update_config(self) - if is_use_mla(self): - return 1 + if self.model_config and self.model_config.use_mla: + logger.info("For MindSpore, MLA supports chunked prefill and prefix, " + "so keep them enable.") - total_num_kv_heads = self.get_total_num_kv_heads() - return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + if not self.instance_id: + self.instance_id = random_uuid()[:5] def _verify_args(self) -> None: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 5084544dd..91a0446e6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -132,21 +132,31 @@ class MfModelBase(MsModelBase): ) -> Union[Tensor, IntermediateTensors]: self.update_mf_kvcaches() - query_lens = attn_metadata.query_lens - kv_cache_lens = attn_metadata.seq_lens_tensor.asnumpy() - query_lens + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. 
+ if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: is_prefill = True else: is_prefill = False - q_seq_lens = ms.Tensor(query_lens, dtype=ms.int32) + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) position_ids = ms.Tensor(positions, dtype=ms.int32) attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) model_inputs = {} model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims( - attn_metadata.seq_lens_tensor.asnumpy(), 0)) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) model_inputs["block_tables"] = self.pad_block_table( attn_metadata.block_tables, self.mf_model_config.seq_length, diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 15d1ff03a..f1bb23615 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -47,6 +47,11 @@ class MsModelBase(): self.modules_dict = None + self.enable_chunked_prefill = vllm_config.scheduler_config.enable_chunked_prefill + self.enable_prefix_caching = vllm_config.cache_config.enable_prefix_caching + self.is_multi_step = vllm_config.scheduler_config.is_multi_step + self.is_multi_step_chunked_prefill = self.is_multi_step and self.enable_chunked_prefill + def get_model_path(self): model_name_or_path = self.model_config.model if os.path.isdir(model_name_or_path): -- Gitee From a161ee1bba84bdbbd81a71ca5aa633c900c09616 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Thu, 13 Mar 2025 17:49:10 +0800 Subject: [PATCH 32/82] support multi-step scheduling --- vllm_mindspore/__init__.py | 9 +- vllm_mindspore/attention/backends/ms_attn.py | 239 ++++++++++++++++++- vllm_mindspore/worker/model_runner.py | 16 +- vllm_mindspore/worker/worker.py | 17 +- 4 files changed, 269 insertions(+), 12 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 0e37cb4a4..2a1195c14 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -137,13 +137,20 @@ from vllm.worker.worker import Worker Worker._warm_up_model = _warm_up_model Worker.determine_num_available_blocks = determine_num_available_blocks -from vllm_mindspore.worker.model_runner import _get_cuda_graph_pad_size, profile_run +from vllm_mindspore.worker.model_runner import ( + _get_cuda_graph_pad_size, + profile_run, + _get_supported_attention_backends +) vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size ) vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run +import vllm.worker.multi_step_model_runner +vllm.worker.multi_step_model_runner._get_supported_attention_backends = _get_supported_attention_backends + from vllm_mindspore.distributed.parallel_state import ( all_reduce_for_GroupCoordinator, init_model_parallel_group, diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 1c082a49d..0e5075fbd 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -19,6 +19,7 @@ from collections import 
defaultdict from dataclasses import dataclass +from itertools import accumulate from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import torch @@ -53,6 +54,41 @@ import mindspore as ms from mindspore import mutable from mindspore._c_expression import swap_cache +def advance_step_op(sampled_token_ids, + model_input, + seq_lens_tensor, + num_queries, + block_size, + block_tables, + slot_mapping): + # update input_tokens + sampled_token_ids_list = sampled_token_ids[: + num_queries].squeeze( # type: ignore + -1) + model_input.input_tokens[: + num_queries] = sampled_token_ids_list # type: ignore + + # get seq_lens and input_positions + seq_lens = seq_lens_tensor[:num_queries] + next_seq_lens = seq_lens + 1 + next_input_pos = next_seq_lens - 1 + + # update seq_lens and input_positions + seq_lens_tensor[:num_queries] = next_seq_lens + model_input.input_positions[: + num_queries] = next_input_pos # type: ignore + + # 计算 block index 和 offset + block_idx = next_input_pos // block_size + block_offset = next_input_pos % block_size + + current_block_table = block_tables.gather( + 1, block_idx.unsqueeze(-1)).squeeze(-1) + slot_num = current_block_table * block_size + block_offset + + # update slot_mapping + slot_mapping[:num_queries] = slot_num + @dataclass class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): @@ -65,6 +101,15 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # For chunked prefill only max_query_len: Optional[int] = None + + max_prefill_seq_len: int = 0 + seq_start_loc: Optional[torch.Tensor] = None + _cached_prefill_metadata: Optional["MSAttentionMetadata"] = None + _cached_decode_metadata: Optional["MSAttentionMetadata"] = None + context_lens_tensor: Optional[torch.Tensor] = None + encoder_seq_start_loc: Optional[torch.Tensor] = None + max_decode_query_len: Optional[int] = None + max_kv_len: Optional[int] = None query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None @@ -93,15 +138,183 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): @property def prefill_metadata(self): - if self.num_prefill_tokens == 0: + if self.num_prefills == 0: return None - return self + + if self._cached_prefill_metadata is not None: + return self._cached_prefill_metadata + + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) + + self._cached_prefill_metadata = MSAttentionMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=False, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_query_len=0, + max_decode_seq_len=0, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + chunked_prefill=self.chunked_prefill, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) + return self._cached_prefill_metadata @property def decode_metadata(self): if self.num_decode_tokens == 0: return None - return self + + if self._cached_decode_metadata is not None: + return self._cached_decode_metadata + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[self.num_prefill_tokens:]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) + block_tables = (None if self.block_tables is None else + self.block_tables[self.num_prefills:]) + + self._cached_decode_metadata = MSAttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=self.num_decode_tokens, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + seq_lens=None, + seq_lens_tensor=seq_lens_tensor, + max_decode_query_len=self.max_decode_query_len, + max_query_len=self.max_query_len, + max_prefill_seq_len=0, + max_decode_seq_len=self.max_decode_seq_len, + # Batch may be composed of prefill|decodes, adjust query start + # indices to refer to the start of decodes. E.g. + # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + query_start_loc=(self.query_start_loc[self.num_prefills:] - + self.query_start_loc[self.num_prefills]) + if self.query_start_loc is not None else None, + seq_start_loc=self.seq_start_loc[self.num_prefills:] + if self.seq_start_loc is not None else None, + context_lens_tensor=None, + block_tables=block_tables, + use_cuda_graph=self.use_cuda_graph, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + chunked_prefill=self.chunked_prefill, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) + return self._cached_decode_metadata + + def advance_step(self, + model_input: "ModelInputForNPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): + """ + Update metadata in-place to advance one decode step. + """ + # When using cudagraph, the num_seqs is padded to the next captured + # batch sized, but num_queries tracks the actual number of requests in + # the batch. 
For --enforce-eager mode, num_seqs == num_queries + if num_seqs != num_queries: + assert num_seqs > num_queries + + if turn_prefills_into_decodes: + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes. This update reflects that + # conversion. + assert self.num_decode_tokens + self.num_prefills == num_seqs + self.num_decode_tokens += self.num_prefills + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.max_prefill_seq_len = 0 + self.max_query_len = 1 + + self.slot_mapping = self.slot_mapping[:num_seqs] + else: + assert self.seq_lens is not None + assert self.max_decode_seq_len == max(self.seq_lens) + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.num_decode_tokens == num_seqs + assert self.slot_mapping.shape == (num_seqs, ) + + assert self.seq_lens is not None + assert len(self.seq_lens) == num_seqs + assert self.seq_lens_tensor is not None + assert self.seq_lens_tensor.shape == (num_seqs, ) + assert self.max_query_len == 1 + assert self.max_prefill_seq_len == 0 + + assert self.query_start_loc is not None + assert self.query_start_loc.shape == (num_queries + 1, ) + assert self.seq_start_loc is not None + assert self.seq_start_loc.shape == (num_seqs + 1, ) + + assert self.context_lens_tensor is not None + assert self.context_lens_tensor.shape == (num_queries, ) + + assert self.block_tables is not None + assert self.block_tables.shape[0] == num_seqs + + # Update query lengths. Note that we update only queries and not seqs, + # since tensors may be padded due to captured cuda graph batch size + for i in range(num_queries): + self.seq_lens[i] += 1 + self.max_decode_seq_len = max(self.seq_lens) + + advance_step_op(sampled_token_ids, + model_input, + self.seq_lens_tensor, + num_queries, + block_size, + self.block_tables, + self.slot_mapping) def get_seq_lens( self, @@ -311,8 +524,16 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): use_captured_graph = cuda_graph_pad_size != -1 max_query_len = max(query_lens) + decode_query_lens = query_lens[self.num_prefills:] + if len(decode_query_lens) > 0: + max_decode_query_len = max(decode_query_lens) + else: + max_decode_query_len = 1 + max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + query_start_loc = list(accumulate(query_lens, initial=0)) + seq_start_loc = list(accumulate(seq_lens, initial=0)) if use_captured_graph: raise RuntimeError("Doesnot support captured graph now!") @@ -325,10 +546,15 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): ) assert max_query_len > 0, "query_lens: {}".format(query_lens) + context_lens_tensor = ms.Tensor(self.context_lens, dtype=ms.int32) seq_lens_tensor = ms.Tensor(seq_lens, dtype=ms.int32) + slot_mapping_tensor = ms.Tensor(self.slot_mapping, dtype=ms.int32) + query_start_loc_tensor = ms.Tensor(query_start_loc, dtype=ms.int32) + seq_start_loc_tensor = ms.Tensor(seq_start_loc, dtype=ms.int32) + return MSAttentionMetadata( - slot_mapping=ms.Tensor(self.slot_mapping, dtype=ms.int32), + slot_mapping=slot_mapping_tensor, block_tables=block_tables, seq_lens_tensor=seq_lens_tensor, seq_lens=seq_lens, @@ -340,7 +566,10 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=False, query_lens=query_lens, - 
max_query_len=max_query_len + query_start_loc=query_start_loc_tensor, + seq_start_loc=seq_start_loc_tensor, + context_lens_tensor=context_lens_tensor, + max_query_len=max_query_len, ) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 843902b88..9a6df8463 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -26,8 +26,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE -from mindspore.common import dtype as mstype -from mindspore import mutable, Tensor +from mindspore import mutable logger = init_logger(__name__) @@ -155,3 +154,16 @@ def profile_run(self) -> None: self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return + + +MULTI_STEP_ATTENTION_BACKENDS = [ + "MS_MLA", "MS_ATTN", "NO_ATTENTION" +] +MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["MS_MLA", "MS_ATTN"] + +def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ + -> List[str]: + if chunked_prefill_enabled: + return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS + else: + return MULTI_STEP_ATTENTION_BACKENDS \ No newline at end of file diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index fe52fdb10..19edacdda 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -71,12 +71,21 @@ def _warm_up_model(self) -> None: kv_cache = self.cache_engine[0].gpu_cache # warmup for prefill - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True) - self.model_runner.execute_model(model_input, kv_cache, None) + if self.vllm_config.scheduler_config.is_multi_step: + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], True) + self.model_runner._base_model_runner.execute_model(model_input, kv_cache, None) + else: + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True) + self.model_runner.execute_model(model_input, kv_cache, None) torch.cuda.synchronize() + # warmup for decode - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False) - self.model_runner.execute_model(model_input, kv_cache, None) + if self.vllm_config.scheduler_config.is_multi_step: + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], False) + self.model_runner._base_model_runner.execute_model(model_input, kv_cache, None) + else: + model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False) + self.model_runner.execute_model(model_input, kv_cache, None) torch.cuda.synchronize() # Reset the seed to ensure that the random state is not affected by -- Gitee From 9e132df1522aa83b3baafd4d78360c1821ada28b Mon Sep 17 00:00:00 2001 From: zhangxuetong Date: Wed, 26 Mar 2025 11:44:29 +0800 Subject: [PATCH 33/82] delete set_async_for_graph for sampler --- vllm_mindspore/model_executor/layers/sampler.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 575d4ba80..b12d5671b 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -20,7 +20,6 @@ import itertools import 
warnings import mindspore as ms -from mindspore.common.api import _pynative_executor import numpy as np from dataclasses import dataclass from importlib.util import find_spec @@ -44,15 +43,6 @@ from vllm_mindspore.model_executor.sampling_metadata import ( SequenceGroupToSample, ) -class AsyncContext: - def __enter__(self): - _pynative_executor.sync() - _pynative_executor.set_async_for_graph(True) - - def __exit__(self, exc_type, exc_value, tb): - _pynative_executor.sync() - _pynative_executor.set_async_for_graph(False) - if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): raise RuntimeError("Donot support for mindspore now.") else: @@ -350,8 +340,7 @@ class Sampler(nn.Module): logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - with AsyncContext() as ctx: - return self.forward(logits, sampling_metadata) + return self.forward(logits, sampling_metadata) @property def _should_modify_greedy_probs_inplace(self) -> bool: -- Gitee From afa63b0a52d397b4382a69e39f25b58a3b053d72 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Wed, 26 Mar 2025 11:28:58 +0800 Subject: [PATCH 34/82] =?UTF-8?q?=E9=80=82=E9=85=8Dmsa=E6=94=B9=E5=8A=A8,s?= =?UTF-8?q?oftmax=E5=85=A5=E5=8F=82axis->dim?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/layers/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 575d4ba80..57a78495a 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -430,7 +430,7 @@ def _apply_top_k_top_p( logits_sort.masked_fill_(top_k_mask, -float("inf")) # Apply top-p. 
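As an aside on what the top-p lines in this hunk compute, here is a standalone toy illustration: a single request with made-up logits, sorted ascending as in vLLM's sampler, using the positional `softmax(-1)` form that this patch switches to. The real implementation additionally guarantees that at least one token survives the mask.

```python
import torch

# Ascending-sorted logits for one request (lowest probability first), toy values.
logits_sort = torch.tensor([[-1.0, 0.1, 0.5, 1.0, 2.0]])
p = torch.tensor([0.9])                      # top-p threshold per request

probs_sort = logits_sort.softmax(-1)         # positional dim, as in the patch
probs_sum = probs_sort.cumsum(-1)
# Mask the low-probability prefix whose cumulative mass fits inside 1 - p.
top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
print(logits_sort.masked_fill(top_p_mask, -float("inf")))
# tensor([[  -inf, 0.1000, 0.5000, 1.0000, 2.0000]])
```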
- probs_sort = logits_sort.softmax(axis=-1) + probs_sort = logits_sort.softmax(-1) probs_sum = probs_sort.cumsum(axis=-1) top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) # at least one -- Gitee From 9add0a88ab1e8e10a4cb889534636927a602a2f6 Mon Sep 17 00:00:00 2001 From: one_east Date: Wed, 26 Mar 2025 16:44:18 +0800 Subject: [PATCH 35/82] adapte various quant method --- .../models/mf_models/deepseek_v3.py | 108 +++++++++++++----- 1 file changed, 81 insertions(+), 27 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 41dad5658..3dd88ab96 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -18,6 +18,7 @@ import os from typing import Iterable, Set, Tuple +from collections import OrderedDict import numpy as np @@ -27,8 +28,14 @@ from vllm.forward_context import get_forward_context from vllm.logger import init_logger from mindspore import Tensor, JitConfig, Model +from mindspore.common import dtype as msdtype + +from mindspore_gs.ptq import PTQ +from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, GPTQQuantConfig +from mindspore_gs.common import BackendTarget from mindformers.trainer.utils import transform_and_load_checkpoint +from research.deepseek3.deepseek3_model_infer import DeepseekV3DecodeLayer from research.deepseek3.deepseek3_config import ( DeepseekV3Config as DeepseekV3Config_MF, ) @@ -69,33 +76,11 @@ class DeepseekV3ForCausalLM(MfModelBase): self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if self.is_quant: - from mindspore_gs.ptq import PTQ - from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity - from mindspore_gs.common import BackendTarget - from mindspore.common import dtype as msdtype - from collections import OrderedDict - cfg = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, - opname_blacklist=['lkv2kv', 'lm_head'], - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TENSOR, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - ffn_config = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - ptq = PTQ(config=cfg, - layer_policies=OrderedDict({r'.*\.feed_forward\..*':ffn_config})) - ptq.apply(self.network) - ptq.convert(self.network) + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) + if ptq is not None: + ptq.apply(self.network) + ptq.convert(self.network) self.network._jit_config_dict = JitConfig( jit_level="O0", infer_boost="on" @@ -155,3 +140,72 @@ class DeepseekV3ForCausalLM(MfModelBase): return model_name_or_path else: raise ValueError("The 'model' in LLM should be the local path of the MindSpore checkpoint file.") + + def create_ptq(self, quant_type: str, quant_mode: PTQMode): + 
"""create_ptq""" + if quant_type.lower() == 'ptq': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, + opname_blacklist=['lkv2kv', 'lm_head'], precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TENSOR, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) + elif quant_type.lower() == 'awq-a16w4': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + act_quant_dtype=None, outliers_suppression=OutliersSuppressionType.AWQ, + opname_blacklist=['lm_head', 'lkv2kv'], weight_quant_granularity=QuantGranularity.PER_GROUP, + group_size=128) + layer_policies = OrderedDict() + elif quant_type.lower() == 'awq-a16w8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=None, outliers_suppression=OutliersSuppressionType.AWQ, + opname_blacklist=['lm_head', 'lkv2kv']) + elif quant_type.lower() == 'gptq-perchannel': + gptq_config = GPTQQuantConfig() + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, algo_args=gptq_config, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() + elif quant_type.lower() == 'gptq-pergroup': + gptq_config = GPTQQuantConfig() + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + algo_args=gptq_config, act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, + weight_quant_granularity=QuantGranularity.PER_GROUP, opname_blacklist=['lm_head', 'lkv2kv'], + group_size=128) + layer_policies = OrderedDict() + elif quant_type.lower() == 'smoothquant': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, + opname_blacklist=['lm_head', 'lkv2kv']) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + elif quant_type.lower() == 'a16w8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() + elif quant_type.lower() == 'a8dynw8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, act_quant_granularity=QuantGranularity.PER_TOKEN, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() + else: + logger.warning("Input unsupported quant type: %s.", quant_type) + return None + ptq = PTQ(config=cfg, 
layer_policies=layer_policies) + if 'awq' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.weight_symmetric = False + ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) + return ptq \ No newline at end of file -- Gitee From 7ce994eeb8ed4289a0fa2692a385b6534523c95d Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Tue, 25 Mar 2025 15:15:50 +0800 Subject: [PATCH 36/82] support int4 sf split for deploy --- .../mf_models/deepseekv3_infer_parallelism.py | 212 ++++++++++++------ .../models/mf_models/model_parallelism.py | 7 +- vllm_mindspore/utils.py | 18 ++ 3 files changed, 171 insertions(+), 66 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py index 69d0664a9..7adca46e5 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py @@ -21,9 +21,12 @@ import time import json import gc import numpy as np +from tqdm import tqdm import mindspore as ms +from mindspore import dtype from vllm_mindspore.model_executor.models.mf_models.model_parallelism import BaseModelParallelism +from vllm_mindspore.utils import convert_np_to_ms_dtype class DeepseekInferParallelism(BaseModelParallelism): @@ -116,14 +119,14 @@ class DeepseekInferParallelism(BaseModelParallelism): # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) - router_dense_ms_param = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), name=router_dense_ms_name, requires_grad=False) # e_score_correction_bias e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) - e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, hf_weight_map) parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( ms.Tensor(e_score_correction_bias_ms_param, ms.float32), @@ -147,15 +150,15 @@ class DeepseekInferParallelism(BaseModelParallelism): for index in range(0, num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) @@ 
-163,15 +166,15 @@ class DeepseekInferParallelism(BaseModelParallelism): w3_list.append(w3_ms_param) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" - w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" - w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) # is_split_param=True, split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) @@ -236,27 +239,27 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) - w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -311,35 +314,35 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param = 
self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) # shape:[7168,1] - w2_scale_ms_param = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) # is_split_param=True, # split_axis=0) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) @@ -394,24 +397,24 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.quant_convert_weight_name(embed_tokens_hf_name) - np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=embed_tokens_ms_name, requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.quant_convert_weight_name(norm_hf_name) - np_data = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, requires_grad=False) lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) if not self.config.parallel_config.vocab_emb_dp: - np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) else: - np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, 
requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) @@ -430,13 +433,13 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} input_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_scale" input_scale_ms_name = self.quant_convert_weight_name(input_scale_hf_name) - input_scale_ms_param = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) + input_scale_ms_param, _ = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) parameter_dict[input_scale_ms_name] = ms.Parameter(ms.Tensor(input_scale_ms_param, ms.bfloat16), name=input_scale_ms_name, requires_grad=False) input_zp_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_offset" input_zp_ms_name = self.quant_convert_weight_name(input_zp_hf_name) - input_zp_ms_param = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) + input_zp_ms_param, _ = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) parameter_dict[input_zp_ms_name] = ms.Parameter(ms.Tensor(input_zp_ms_param, ms.int8), name=input_zp_ms_name, requires_grad=False) @@ -444,11 +447,11 @@ class DeepseekInferParallelism(BaseModelParallelism): if not is_trans_rope_weigh: quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) - quant_bias_ms_param = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) - dequant_scale_ms_param = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) else: kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim @@ -460,11 +463,11 @@ class DeepseekInferParallelism(BaseModelParallelism): quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) - quant_bias_ms_param = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".deq_scale" dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) - dequant_scale_ms_param = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) if name == "q_b_proj": quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads, rope_dim, -1) @@ -501,14 +504,14 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} q2l_proj_bias_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" q2l_proj_bias_ms_name = self.quant_convert_weight_name(q2l_proj_bias_hf_name) - q2l_proj_bias_ms_param = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) + q2l_proj_bias_ms_param, _ = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) kv2l_bias_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" kv2l_bias_ms_param = q2l_proj_bias_ms_param.copy() l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) - l2q_proj_bias_ms_param = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_bias_ms_param, ms.bfloat16), name=q2l_proj_bias_ms_name, requires_grad=False) @@ -539,7 +542,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # q_a_proj->q2l_proj q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) - q2l_proj_ms_param = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_ms_param, ms.int8), name=q2l_proj_ms_name, requires_grad=False) @@ -548,7 +551,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # kv_a_proj_with_mqa->kv2l kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) - kv2l_ms_param = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.int8), name=kv2l_ms_name, @@ -559,7 +562,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # q_a_layernorm->lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) - lq_norm_ms_param = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), name=lq_norm_ms_name, requires_grad=False) @@ -567,7 +570,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # q_b_proj->l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" l2q_proj_ms_name = self.quant_convert_weight_name(l2q_proj_hf_name) - 
l2q_proj_ms_param = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) @@ -581,7 +584,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # kv_a_layernorm->lkv_norm lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.quant_convert_weight_name(lkv_norm_hf_name) - lkv_norm_ms_param = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), name=lkv_norm_ms_name, requires_grad=False) @@ -589,7 +592,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # kv_b_proj->lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" lkv2kv_ms_name = self.quant_convert_weight_name(lkv2kv_hf_name) - lkv2kv_ms_param = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) lkv2kv_head = qk_nope_head_dim + v_head_dim lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] @@ -610,7 +613,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # o_proj->wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.quant_convert_weight_name(wo_hf_name) - wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.int8), name=wo_ms_name, requires_grad=False) @@ -671,14 +674,14 @@ class DeepseekInferParallelism(BaseModelParallelism): # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" router_dense_ms_name = self.convert_weight_name(router_dense_hf_name) - router_dense_ms_param = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), name=router_dense_ms_name, requires_grad=False) # e_score_correction_bias e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" e_score_correction_bias_ms_name = self.convert_weight_name(e_score_correction_bias_hf_name) - e_score_correction_bias_ms_param = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, hf_weight_map) parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( ms.Tensor(e_score_correction_bias_ms_param, ms.float32), @@ -693,15 +696,15 @@ class DeepseekInferParallelism(BaseModelParallelism): w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" for index in range(0, 
num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) @@ -737,15 +740,15 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" @@ -772,17 +775,17 @@ class DeepseekInferParallelism(BaseModelParallelism): w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) if ffn_concat: @@ -818,7 +821,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # q2l_proj q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) - q_a_proj_ms_param = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + q_a_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) 
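Every loader call in this patch now unpacks a `(weight, is_int4)` pair, and the `is_split_param`/`split_axis` arguments select a per-rank slice of the safetensor. A standalone NumPy sketch of that slicing follows; the function name, shapes, and rank values are illustrative only, not the module's API.

```python
import numpy as np

def split_by_rank(weight: np.ndarray, rank_id: int, tp_size: int, split_axis: int) -> np.ndarray:
    """Illustrative per-rank slicing behind is_split_param/split_axis."""
    size = weight.shape[split_axis] // tp_size
    start, stop = rank_id * size, (rank_id + 1) * size
    if split_axis == 0:
        return weight[start:stop]
    if split_axis == 1:
        return weight[:, start:stop]
    raise ValueError("split_axis:{} is not supported.".format(split_axis))

# e.g. rank 3 of an 8-way tensor-parallel group slicing a hypothetical
# [1024, 7168] projection weight along axis 0
w = np.zeros((1024, 7168), dtype=np.float16)
print(split_by_rank(w, rank_id=3, tp_size=8, split_axis=0).shape)   # (128, 7168)
```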
parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q_a_proj_ms_param, ms.bfloat16), name=q2l_proj_ms_name, requires_grad=False) @@ -826,7 +829,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # kv2l kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" kv2l_ms_name = self.convert_weight_name(kv2l_hf_name) - kv2l_ms_param = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.bfloat16), name=kv2l_ms_name, @@ -835,14 +838,14 @@ class DeepseekInferParallelism(BaseModelParallelism): # lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) - lq_norm_ms_param = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), name=lq_norm_ms_name, requires_grad=False) # l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" l2q_proj_ms_name = self.convert_weight_name(l2q_proj_hf_name) - l2q_proj_ms_param = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) @@ -854,7 +857,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # lkv_norm lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.convert_weight_name(lkv_norm_hf_name) - lkv_norm_ms_param = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), name=lkv_norm_ms_name, requires_grad=False) @@ -862,7 +865,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" lkv2kv_ms_name = self.convert_weight_name(lkv2kv_hf_name) - lkv2kv_ms_param = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) lkv2kv_head = qk_nope_head_dim + v_head_dim lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] @@ -883,7 +886,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) - wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) parameter_dict[wo_ms_name] = 
ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, requires_grad=False) @@ -898,7 +901,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) - attention_norm_ms_param = self.get_safetensor_from_file(attention_norm_hf_name, + attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), @@ -908,7 +911,7 @@ class DeepseekInferParallelism(BaseModelParallelism): # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) - ffn_norm_ms_param = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) + ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[ffn_norm_ms_name] = ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), name=ffn_norm_ms_name, requires_grad=False) @@ -933,22 +936,103 @@ class DeepseekInferParallelism(BaseModelParallelism): print(f"..... end convert layer {layer_id} .......", flush=True) + def infer_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + """infer_quant_net_ms_convert_layer_weight""" + parameter_dict = {} + + no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", + "kv2l", "routed_experts.router.dense", + "routed_experts.router.e_score_correction_bias", + "shared_experts.w_gate_hidden", "shared_experts.w2", + "topk_bias"] + + for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): + if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: + continue + + if any([name in param_name for name in no_need_split_layer]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden."]]): + if param_name.endswith(".weight"): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo."]]): + if param_name.endswith(".weight"): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + else: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) + value = np.stack(value_list, axis=0) + elif ".routed_experts.ffn.w2" in param_name: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) + value = np.stack(value_list, axis=0) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + + dst_dtype = convert_np_to_ms_dtype(value) + if is_int4: + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), + name=param_name, requires_grad=False) + else: + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + def infer_convert_and_parallelism(self, src_hf_dir): """convert inference model weight """ param_json_path = "" + for file in os.listdir(src_hf_dir): if file.endswith('index.json'): param_json_path = os.path.join(src_hf_dir, file) + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + break + elif file.endswith('_name_map.json'): + param_json_path = os.path.join(src_hf_dir, file) + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp) break + if not param_json_path: - raise ValueError("param_json_path:{} is error.".format(param_json_path)) + raise ValueError(f"Not found param_json_path in {src_hf_dir}") print("param_json_path is {}".format(param_json_path)) - with open(param_json_path, "r") as fp: - hf_weight_map = json.load(fp)['weight_map'] - - self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + quantization_config = self.config.model.model_config.quantization_config + quant_method = quantization_config.quant_method if quantization_config else None + if not quant_method or quant_method != "gptq-pergroup": + self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + num_layers = self.config.model.model_config.num_layers + if quant_method and quant_method == "gptq-pergroup": + self.infer_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) + return for layer_id in range(num_layers): if self.is_quant: self.infer_quant_net_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) diff --git a/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py index a063cab96..4ad9d4495 100644 --- a/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py @@ -39,9 +39,12 @@ class BaseModelParallelism: rank_id = get_rank() safetensor_file = hf_weight_map[hf_param_name] with 
safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True if not is_split_param: np_data = sf_file.get_tensor(hf_param_name) - return np_data + return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() @@ -57,7 +60,7 @@ class BaseModelParallelism: split_data = np_data[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) - return split_data + return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): tp_group_size = get_group_size() diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index d37fa39d1..f3601effe 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -29,6 +29,7 @@ from typing import ( Tuple, Union, ) +import numpy as np import torch @@ -350,3 +351,20 @@ def is_use_mla(model_config): return hasattr(model_config.hf_text_config, "model_type") and ( model_config.hf_text_config.model_type in ("deepseek_v3",) ) + + +def convert_np_to_ms_dtype(value): + """convert_np_to_ms_dtype""" + if value.dtype == np.int8: + value_dtype = ms.int8 + elif value.dtype == np.int32: + value_dtype = ms.int32 + elif value.dtype == np.int64: + value_dtype = ms.int64 + elif value.dtype == np.float64: + value_dtype = ms.float64 + elif value.dtype == np.float32: + value_dtype = ms.float32 + else: + value_dtype = ms.bfloat16 + return value_dtype \ No newline at end of file -- Gitee From 49ce9cca16f8d35826902cbed02292c7b0eac5e3 Mon Sep 17 00:00:00 2001 From: huzhikun Date: Thu, 27 Mar 2025 11:03:11 +0800 Subject: [PATCH 37/82] =?UTF-8?q?fea:=20=E6=B7=BB=E5=8A=A0=E5=85=B1?= =?UTF-8?q?=E4=BA=AB=E4=B8=93=E5=AE=B6=E5=88=87=E5=88=86=E7=89=B9=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../mf_models/deepseekv3_infer_parallelism.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py index 7adca46e5..a9e316239 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py @@ -239,15 +239,18 @@ class DeepseekInferParallelism(BaseModelParallelism): parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + 
is_split_param=True, split_axis=1) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) @@ -255,11 +258,13 @@ class DeepseekInferParallelism(BaseModelParallelism): w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) -- Gitee From c6e99543f305798e78fe58d11f3b1c4013d86472 Mon Sep 17 00:00:00 2001 From: twc Date: Wed, 26 Mar 2025 17:37:03 +0800 Subject: [PATCH 38/82] delete block table padding --- .../models/mf_models/deepseek_v3.py | 6 +---- .../models/mf_models/mf_model_base.py | 27 +------------------ .../model_executor/models/mf_models/qwen2.py | 6 +---- vllm_mindspore/worker/worker.py | 3 +++ 4 files changed, 6 insertions(+), 36 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3dd88ab96..18a1a12b1 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -44,8 +44,7 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, \ - _pad_block_table, Fake_Attention +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention from vllm_mindspore.utils import calc_block_num from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism @@ -115,9 +114,6 @@ class DeepseekV3ForCausalLM(MfModelBase): ) self.mf_kvcaches_init = True - def pad_block_table(self, block_tables, seq_length, block_size): - return _pad_block_table(block_tables, seq_length, block_size, 2) - def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: if self.mf_config.load_ckpt_format == "ckpt": model = Model(self.network) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 91a0446e6..56f394c48 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -48,23 +48,6 @@ def _pad_to_max(x, max_len): return x + [-1] * (max_len - len(x)) -def _pad_block_table(block_tables, seq_length, block_size, pad_size): - # When prefill, the block_tables is a empty tensor. 
- if len(block_tables.shape) < 2: - fake_block_tables = ms.mint.empty( - pad_size, seq_length // block_size, dtype=ms.int32, device="Ascend" - ) - return fake_block_tables - - block_tables_list = block_tables.tolist() - padded_block_tables = [ - _pad_to_max(block_table, seq_length // block_size) - for block_table in block_tables_list - ] - - return Tensor(np.array(padded_block_tables).astype(np.int32)) - - def _batch_seq(input_tokens, prefill): if prefill: return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) @@ -117,10 +100,6 @@ class MfModelBase(MsModelBase): self.mf_kvcaches_init = True - def pad_block_table(self, block_tables, seq_length, block_size): - raise NotImplementedError("pad_block_table not implemented.") - - def forward( self, input_ids: Tensor, @@ -157,11 +136,7 @@ class MfModelBase(MsModelBase): model_inputs = {} model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) - model_inputs["block_tables"] = self.pad_block_table( - attn_metadata.block_tables, - self.mf_model_config.seq_length, - self.mf_model_config.block_size, - ) + model_inputs["block_tables"] = attn_metadata.block_tables model_inputs["slot_mapping"] = attn_metadata.slot_mapping model_inputs["position_ids"] = position_ids model_inputs["q_seq_lens"] = q_seq_lens diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 2de4090f1..78247b8e1 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -30,8 +30,7 @@ from research.qwen2_5.infer.qwen2_5 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, \ - _pad_block_table, Fake_Attention +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention from vllm_mindspore.utils import calc_block_num from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -81,9 +80,6 @@ class Qwen2ForCausalLM(MfModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False - def pad_block_table(self, block_tables, seq_length, block_size): - return _pad_block_table(block_tables, seq_length, block_size, 1) - def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: model_parallelism = Qwen2InferParallelism(self.mf_config, self.network, False) model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 19edacdda..b3c87dc93 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -63,6 +63,9 @@ def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefi ] model_input = model_runner.prepare_model_input(seqs) + block_tables = model_input.attn_metadata.block_tables + if block_tables is not None and block_tables.numel() <= 0: + model_input.attn_metadata.block_tables = torch.zeros((1, 1), dtype=torch.int32) return model_input -- Gitee From 13dec8f62625e36ccb0a544492ed6b1a6ed1a26c Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Thu, 27 Mar 2025 16:10:53 +0800 Subject: [PATCH 39/82] bugfix:qwen2 read safetensor return two value --- 
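Context for this bugfix: earlier in the series, BaseModelParallelism.get_safetensor_from_file (model_parallelism.py) was changed to return a (value, qint4) tuple instead of a bare array, the extra flag marking weights packed as qint4 in the safetensors metadata. This patch brings the Qwen2 weight-loading call sites in line with that contract. The sketch below only illustrates the new calling pattern; the "_sketch" helper name, the dummy weight map and the hard-coded flag are illustrative and not part of the patch, and the real helper additionally handles tensor-parallel slicing via is_split_param/split_axis.

import numpy as np

def get_safetensor_from_file_sketch(name, weight_map):
    """Simplified stand-in: return (value, is_int4) like the patched helper."""
    value = weight_map[name]
    is_int4 = False  # the real helper derives this from the safetensors metadata
    return value, is_int4

# Call sites now unpack two values; bf16 weights simply ignore the flag.
weight_map = {"model.norm.weight": np.ones((4,), dtype=np.float32)}
norm_param, _ = get_safetensor_from_file_sketch("model.norm.weight", weight_map)

Without the unpacking, the old call sites would bind the whole (value, flag) tuple to the parameter value, which is what the changes below correct for Qwen2.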
.../mf_models/qwen2_infer_parallelism.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py index 05bd499bd..6075672ab 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py @@ -43,9 +43,9 @@ class Qwen2InferParallelism(BaseModelParallelism): embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) if self.config.parallel_config.vocab_emb_dp: - np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) else: - np_data = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=embed_tokens_ms_name, @@ -53,7 +53,7 @@ class Qwen2InferParallelism(BaseModelParallelism): norm_hf_name = "model.norm.weight" norm_ms_name = self.convert_weight_name(norm_hf_name) - np_data = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, requires_grad=False) @@ -61,10 +61,10 @@ class Qwen2InferParallelism(BaseModelParallelism): lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.model.model_config.tie_word_embeddings: if not self.config.parallel_config.vocab_emb_dp: - np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) else: - np_data = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, requires_grad=False) @@ -94,17 +94,17 @@ class Qwen2InferParallelism(BaseModelParallelism): w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) if ffn_concat: @@ -129,36 +129,36 @@ 
class Qwen2InferParallelism(BaseModelParallelism): # wq wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" wq_ms_name = self.convert_weight_name(wq_hf_name) - wq_ms_param = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + wq_ms_param, _ = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) # wq bias wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) - wq_bias_ms_param = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, + wq_bias_ms_param, _ = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) # wk wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" wk_ms_name = self.convert_weight_name(wk_hf_name) - wk_ms_param = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + wk_ms_param, _ = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) # wk bias wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) - wk_bias_ms_param = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, + wk_bias_ms_param, _ = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) # wv wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" wv_ms_name = self.convert_weight_name(wv_hf_name) - wv_ms_param = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + wv_ms_param, _ = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) # wv bias wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) - wv_bias_ms_param = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, + wv_bias_ms_param, _ = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) @@ -193,7 +193,7 @@ class Qwen2InferParallelism(BaseModelParallelism): # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) - wo_ms_param = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, requires_grad=False) @@ -205,7 +205,7 @@ class Qwen2InferParallelism(BaseModelParallelism): # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) - attention_norm_ms_param = self.get_safetensor_from_file(attention_norm_hf_name, + attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), @@ -215,7 +215,7 @@ class Qwen2InferParallelism(BaseModelParallelism): # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) - ffn_norm_ms_param = self.get_safetensor_from_file(ffn_norm_hf_name, 
src_hf_dir, hf_weight_map) + ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) parameter_dict[ffn_norm_ms_name] = ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), name=ffn_norm_ms_name, requires_grad=False) -- Gitee From abe65a9629d502738591acb214d5de28e513a415 Mon Sep 17 00:00:00 2001 From: huzhikun Date: Wed, 26 Mar 2025 16:34:08 +0800 Subject: [PATCH 40/82] =?UTF-8?q?fea:=20=E6=B7=BB=E5=8A=A0profile=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/__init__.py | 6 +++ vllm_mindspore/worker/profile.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 vllm_mindspore/worker/profile.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 2a1195c14..86f0a8f6c 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -132,10 +132,16 @@ from vllm_mindspore.worker.worker import ( _warm_up_model, determine_num_available_blocks, ) +from vllm_mindspore.worker.profile import ( + wrapper_worker_init, + wrapper_worker_init_device +) from vllm.worker.worker import Worker Worker._warm_up_model = _warm_up_model Worker.determine_num_available_blocks = determine_num_available_blocks +Worker.__init__ = wrapper_worker_init(Worker.__init__) +Worker.init_device = wrapper_worker_init_device(Worker.init_device) from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, diff --git a/vllm_mindspore/worker/profile.py b/vllm_mindspore/worker/profile.py new file mode 100644 index 000000000..563637064 --- /dev/null +++ b/vllm_mindspore/worker/profile.py @@ -0,0 +1,63 @@ +import os +import sys +import subprocess + +from mindspore import Profiler +from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics +from mindspore.profiler.common.profiler_context import ProfilerContext + +PROFILE_ENV_NAME = "VLLM_TORCH_PROFILER_DIR" + +def shell_analyse(path): + subprocess.run( + [sys.executable, "-c", f'from mindspore import Profiler; Profiler.offline_analyse("{path}")'], + shell=False, check=True) + +class AdapterProfiler: + def __init__(self, path): + self.profiler = Profiler( + profiler_level=ProfilerLevel.Level1, + activities=[ProfilerActivity.CPU, ProfilerActivity.NPU], + output_path=path, + ) + + def start(self): + self.profiler.start() + + def stop(self): + self.profiler.stop() + path = ProfilerContext().ascend_ms_dir + shell_analyse(path) + +def wrapper_worker_init(fun): + def new_fun(*arg, **kwarg): + # Profiler initialization during worker init triggers device setup, + # causing init_device to fail due to duplicate configuration. + # To fix this, temporarily unset VLLM_TORCH_PROFILER_DIR before worker init, + # restore it afterward, then initialize profiler properly after worker init_device completes + profile_output_path = os.getenv(PROFILE_ENV_NAME, "") + if profile_output_path: + del os.environ[PROFILE_ENV_NAME] + + fun(*arg, **kwarg) + + if profile_output_path: + os.environ[PROFILE_ENV_NAME] = profile_output_path + return new_fun + +def wrapper_worker_init_device(fun): + def new_fun(*arg, **kwarg): + fun(*arg, **kwarg) + + # The actual profiler initialization is performed after the worker.init_device() method, + # based on the VLLM_TORCH_PROFILER_DIR environment variable. + self = arg[0] + profile_output_path = os.getenv(PROFILE_ENV_NAME, "") + if profile_output_path: + print(f"Profiling enabled. 
Traces will be saved to: {profile_output_path}") + self.profiler = AdapterProfiler(profile_output_path) + else: + self.profiler = None + return new_fun + + -- Gitee From 7088398afd1b60cf426449e3a3be1edd75a52438 Mon Sep 17 00:00:00 2001 From: ccsszz Date: Thu, 27 Mar 2025 17:47:10 +0800 Subject: [PATCH 41/82] add ptq smooth_quant aclnn quant ops config --- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3dd88ab96..5965b4ea4 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -207,5 +207,8 @@ class DeepseekV3ForCausalLM(MfModelBase): if 'awq' in quant_type.lower(): # pylint: disable=protected-access ptq._config.weight_symmetric = False + if 'smoothquant' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden"] ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) return ptq \ No newline at end of file -- Gitee From d2e9e5055c83cc04da13f5da86b1c2b5610eec90 Mon Sep 17 00:00:00 2001 From: moran Date: Tue, 25 Mar 2025 11:37:15 +0800 Subject: [PATCH 42/82] vllm code check tool --- codecheck_toolkits/README.md | 30 +++++ codecheck_toolkits/pyproject.toml | 110 +++++++++++++++++++ codecheck_toolkits/requirements-lint.txt | 15 +++ codecheck_toolkits/vllm_codecheck.sh | 62 +++++++++++ vllm_mindspore/attention/backends/ms_attn.py | 4 +- 5 files changed, 219 insertions(+), 2 deletions(-) create mode 100644 codecheck_toolkits/README.md create mode 100644 codecheck_toolkits/pyproject.toml create mode 100644 codecheck_toolkits/requirements-lint.txt create mode 100644 codecheck_toolkits/vllm_codecheck.sh diff --git a/codecheck_toolkits/README.md b/codecheck_toolkits/README.md new file mode 100644 index 000000000..03b601856 --- /dev/null +++ b/codecheck_toolkits/README.md @@ -0,0 +1,30 @@ +# vllm 社区 codecheck 检查工具链使用说明 + +## 使用步骤 +- 1. 确保修改已经```git commit```,并合并成一个commit id. +- 2. 运行命令:```bash vllm_codecheck.sh``` + +## 执行说明 +- 1、根据 ``requiremnts-lint.txt``安装工具链,请确保网络畅通。 +- 2、依次运行`yaph`, `codespell`, `ruff`, `isort`, `mypy` 工具。 + +## 工具说明 +- `yapf`: 自动formatting工具。 +- `codespell`: 拼写检查工具。 +- `ruff`: 代码format检查工具。 +- `isort`: 自动修复import工具。 +- `mypy`: 静态类型检查工具。 + +## 修复建议: +- `codespell`如需屏蔽拼写错误,修改`pyproject.toml`中的 + +```commandline +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" +``` + +- `ruff` 如需屏蔽检查,在代码行后增加注释 + +```commandline +# noqa: {error_code} +``` diff --git a/codecheck_toolkits/pyproject.toml b/codecheck_toolkits/pyproject.toml new file mode 100644 index 000000000..9a3c52de0 --- /dev/null +++ b/codecheck_toolkits/pyproject.toml @@ -0,0 +1,110 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.26", + "ninja", + "packaging", + "setuptools>=61", + "setuptools-scm>=8.0", + "torch == 2.5.1", + "wheel", + "jinja2", +] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` + +[tool.ruff] +# Allow lines to be as long as 80. 
+line-length = 80 +exclude = [ + # External file, leaving license intact + "vllm_mindspore/__init__.py", + "tests/*" +] + +[tool.ruff.lint.per-file-ignores] +"vllm_mindspore/version.txt" = ["F401"] +"vllm_mindspore/_version.txt" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # long line + "E501" +] + +[tool.mypy] +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "silent" + +# After fixing type errors resulting from follow_imports: "skip" -> "silent", +# move the directory here and remove it from tools/mypy.sh +#files = [ +# "vllm/*.py", +# "vllm/adapter_commons", +# "vllm/assets", +# "vllm/entrypoints", +# "vllm/core", +# "vllm/inputs", +# "vllm/logging_utils", +# "vllm/multimodal", +# "vllm/platforms", +# "vllm/transformers_utils", +# "vllm/triton_utils", +# "vllm/usage", +#] +files= ["vllm_mindspore/*.py",] +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm_mindspore/model_executor/parallel_utils/|vllm_mindspore/model_executor/models/", + # Ignore triton kernels in ops. + 'vllm_mindspore/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" +skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" + +[tool.isort] +use_parentheses = true +skip_gitignore = true + +skip_glob = ["tests/*", "vllm_mindspore/ops/*"] +skip = ["vllm_mindspore/__init__.py"] + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", + "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", +] \ No newline at end of file diff --git a/codecheck_toolkits/requirements-lint.txt b/codecheck_toolkits/requirements-lint.txt new file mode 100644 index 000000000..711bb50a0 --- /dev/null +++ b/codecheck_toolkits/requirements-lint.txt @@ -0,0 +1,15 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.2 +ruff==0.6.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 +sphinx-lint==1.0.0 + +# type checking +mypy==1.11.1 +types-PyYAML +types-requests +types-setuptools diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh new file mode 100644 index 000000000..7e5e0a286 --- /dev/null +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -0,0 +1,62 @@ +pip install -r requirements-lint.txt + +cd .. +# yapf formats code automatically + +MERGEBASE="$(git merge-base origin/master HEAD)" +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --in-place --recursive --parallel --exclude build/ +fi + +if [[ $? -ne 0 ]]; then + echo "yapf run failed." +else + echo "yapf run success." +fi + +# codespell check +if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + codespell --skip ./vllm_mindspore/ops/ascendc/* +fi +if [[ $? -ne 0 ]]; then + echo "codespell check failed." +else + echo "codespell check success." +fi + +# ruff check +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + ruff check +fi +if [[ $? -ne 0 ]]; then + echo "ruff check failed." +else + echo "ruff check success." +fi + +# isort fixed +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + isort +fi +if [[ $? -ne 0 ]]; then + echo "isort fixed failed." +else + echo "isort fixed success." +fi + +# mypy check type +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + mypy --follow-imports skip --python-version 3.9 "$@" +fi +if [[ $? -ne 0 ]]; then + echo "mypy check failed." +else + echo "mypy check success." +fi + +cd - diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index bc40ff1dc..0b76d38c4 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -7,7 +7,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -69,7 +69,7 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None prefill_block_tables: Optional[torch.Tensor] = None - query_lens: Optional[List[int]] = None, + query_lens: Optional[List[int]] = None # Begin encoder attn & enc/dec cross-attn fields... 
# Encoder sequence lengths representation -- Gitee From 6f25c8c98c1dc9f3ae24a74390835835f6219a19 Mon Sep 17 00:00:00 2001 From: twc Date: Fri, 28 Mar 2025 16:06:04 +0800 Subject: [PATCH 43/82] create dummy block table in jit mode --- vllm_mindspore/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 596433df4..b501fd86f 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -171,6 +171,10 @@ def _create_empty_tensor(ms_type): return init_tensor +def _create_dummy_block_tables(): + return ms.ops.zeros((1, 1), dtype=ms.int32) + + def make_tensor_with_pad( x: List[List[T]], pad: T, @@ -192,7 +196,7 @@ def make_tensor_with_pad( pin_memory = False if padded_x.size == 0: - tensor = _create_empty_tensor(dtype) + tensor = _create_dummy_block_tables() else: tensor = torch.from_numpy(padded_x) if pin_memory: -- Gitee From 923e1a3368743b91d38ee1206674f19cd64715bf Mon Sep 17 00:00:00 2001 From: huandong Date: Wed, 26 Mar 2025 16:39:49 +0800 Subject: [PATCH 44/82] vllm 0.7.3 profile_run --- vllm_mindspore/__init__.py | 8 +- .../models/mf_models/deepseek_v3.py | 7 +- .../models/mf_models/mf_model_base.py | 24 +- .../model_executor/models/mf_models/qwen2.py | 6 - vllm_mindspore/platforms/ascend.py | 13 +- vllm_mindspore/utils.py | 105 +------- vllm_mindspore/worker/model_runner.py | 243 ++++++++++-------- vllm_mindspore/worker/worker.py | 96 ------- 8 files changed, 160 insertions(+), 342 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 86f0a8f6c..de11500c1 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -47,7 +47,6 @@ vllm.utils.current_platform = ascend_platform from vllm_mindspore.utils import ( direct_register_custom_op, - memory_profiling, make_tensor_with_pad, async_tensor_h2d, get_dtype_size, @@ -56,7 +55,6 @@ from vllm_mindspore.utils import ( ) vllm.utils.direct_register_custom_op = direct_register_custom_op -vllm.utils.memory_profiling = memory_profiling vllm.utils.make_tensor_with_pad = make_tensor_with_pad vllm.utils.async_tensor_h2d = async_tensor_h2d vllm.utils.get_dtype_size = get_dtype_size @@ -129,8 +127,7 @@ vllm.model_executor.model_loader.loader.safetensors_weights_iterator = ( ) from vllm_mindspore.worker.worker import ( - _warm_up_model, - determine_num_available_blocks, + _warm_up_model ) from vllm_mindspore.worker.profile import ( wrapper_worker_init, @@ -139,13 +136,13 @@ from vllm_mindspore.worker.profile import ( from vllm.worker.worker import Worker Worker._warm_up_model = _warm_up_model -Worker.determine_num_available_blocks = determine_num_available_blocks Worker.__init__ = wrapper_worker_init(Worker.__init__) Worker.init_device = wrapper_worker_init_device(Worker.init_device) from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, profile_run, + _dummy_run, _get_supported_attention_backends ) @@ -153,6 +150,7 @@ vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size ) vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run +vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run import vllm.worker.multi_step_model_runner vllm.worker.multi_step_model_runner._get_supported_attention_backends = _get_supported_attention_backends diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 9b2a5a384..1ac29d9a6 100644 --- 
a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -44,8 +44,7 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention -from vllm_mindspore.utils import calc_block_num +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_MLA from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -63,8 +62,6 @@ class DeepseekV3ForCausalLM(MfModelBase): self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) - self.mf_model_config.num_blocks = calc_block_num(self.cache_config, self.model_config, self.parallel_config) - self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config self.mf_model_config.return_hidden_states = True @@ -89,7 +86,7 @@ class DeepseekV3ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 56f394c48..3e1133973 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -57,13 +57,33 @@ def _batch_seq(input_tokens, prefill): class Fake_Attention: def __init__(self): + vllm_config = get_current_vllm_config() + block_size = vllm_config.cache_config.block_size + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + num_block = 0 + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] self.kv_cache = [ - torch.tensor([]) for _ in range(get_current_vllm_config( - ).parallel_config.pipeline_parallel_size) + ( + torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"), + torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) ] self.attn_type = AttentionType.DECODER +class Fake_MLA(Fake_Attention): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 78247b8e1..f7e386538 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -31,7 +31,6 @@ from research.qwen2_5.infer.qwen2_5 import ( from 
vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention -from vllm_mindspore.utils import calc_block_num from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -44,11 +43,6 @@ class Qwen2ForCausalLM(MfModelBase): super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) - # Cannot get num_gpu_blocks from cache config now, calculate one first. - self.mf_model_config.num_blocks = calc_block_num( - self.cache_config, self.model_config, self.parallel_config - ) - self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config self.mf_model_config.return_hidden_states = True diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index 31fe2c2e0..b96403d49 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -92,18 +92,6 @@ class AscendPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 - if os.getenv("ASCEND_TOTAL_MEMORY_GB"): - total_device_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) - else: - total_device_memory = 64 - logger.warning( - "Total device memory should be set by environ 'ASCEND_TOTAL_MEMORY_GB', " - "please check size by cmd(npu-smi info). " - "For now, we will try default size(64GB) which might not be correct exactly." - ) - max_device_memory_for_ms = str(total_device_memory * cache_config.gpu_memory_utilization) + "GB" - ms.set_context(max_device_memory=max_device_memory_for_ms) - logger.info("max_device_memory for mindspore is: ", max_device_memory_for_ms) @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): @@ -126,6 +114,7 @@ class AscendPlatform(Platform): @classmethod def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None) -> float: """Return the memory usage in bytes.""" + torch.cuda.reset_peak_memory_stats() return torch.cuda.max_memory_allocated(device) @classmethod diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index b501fd86f..ac44272f1 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -79,89 +79,6 @@ def direct_register_custom_op( ): ... -@contextlib.contextmanager -def memory_profiling( - baseline_snapshot: "MemorySnapshot", - weights_memory: int) -> "Generator[MemoryProfilingResult, None, None]": - """Memory profiling context manager. - baseline_snapshot: the memory snapshot before the current vLLM instance. - weights_memory: memory used by PyTorch when loading the model weights. - Note that, before loading the model weights, we also initialize the device - and distributed environment, which may consume some memory. This part is not - included in the weights_memory because PyTorch does not control it. - - The memory in one GPU can be classified into 3 categories: - 1. memory used by anything other than the current vLLM instance. - 2. memory used by torch in the current vLLM instance. - 3. memory used in the current vLLM instance, but not by torch. 
- - A quantitive example: - - Before creating the current vLLM instance: - category 1: 1 GiB - category 2: 0 GiB - category 3: 0 GiB - - After creating the current vLLM instance and loading the model, - (i.e. before profiling): - category 1: 1 GiB - category 2: 2 GiB (model weights take 2 GiB) - category 3: 0.5 GiB (memory used by NCCL) - - During profiling (peak): - category 1: 1 GiB - category 2: 4 GiB (peak activation tensors take 2 GiB) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - After profiling: - category 1: 1 GiB - category 2: 3 GiB (after garbage-collecting activation tensors) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - In this case, non-kv cache takes 5 GiB in total, including: - a. 2 GiB used by the model weights (category 2) - b. 2 GiB reserved for the peak activation tensors (category 2) - c. 1 GiB used by non-torch components (category 3) - - The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). - - The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). - """ # noqa - from vllm.utils import MemoryProfilingResult - - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - result = MemoryProfilingResult() - - result.before_create = baseline_snapshot - # the part of memory used for holding the model weights - result.weights_memory = weights_memory - - result.before_profile.measure() - - before_torch_memory_in_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"] - - yield result - - gc.collect() - torch.cuda.empty_cache() - - result.after_profile.measure() - - after_torch_memory_in_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"] - - diff_profile = result.after_profile - result.before_profile - diff_from_create = result.after_profile - result.before_create - result.torch_peak_increase = diff_profile.torch_peak - result.non_torch_increase = after_torch_memory_in_bytes - before_torch_memory_in_bytes - result.profile_time = diff_profile.timestamp - result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa - - def _create_empty_tensor(ms_type): init_func = Zero() init_func.__enable_zero_dim__ = True @@ -307,7 +224,7 @@ def check_ready(): if is_mindformers_model_backend(): logger.info("Run with Mindformers backend!") - necessary_envs = ("vLLM_MODEL_MEMORY_USE_GB", "MINDFORMERS_MODEL_CONFIG") + necessary_envs = ("MINDFORMERS_MODEL_CONFIG", ) lost_envs = [env_item for env_item in necessary_envs if not os.getenv(env_item)] if lost_envs: @@ -325,26 +242,6 @@ def check_ready(): env_setup({"MS_ALLOC_CONF": "enable_vmm:True", }) logger.info("Run with native model backend!") - -def calc_block_num(cache_config, model_config, parallel_config): - from vllm.worker.cache_engine import CacheEngine - - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - total_gpu_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) if os.getenv("ASCEND_TOTAL_MEMORY_GB") else 64 - total_gpu_memory = total_gpu_memory * 1024 * 1024 * 1024 - memory_can_use = total_gpu_memory * cache_config.gpu_memory_utilization - - model_use_memory_b = int(os.getenv("vLLM_MODEL_MEMORY_USE_GB")) * 1024 * 1024 * 1024 - available_cache_memory = memory_can_use - model_use_memory_b - cache_block_size = CacheEngine.get_cache_block_size( - 
cache_config, model_config, parallel_config - ) - num_gpu_blocks = int(available_cache_memory // cache_block_size) - return num_gpu_blocks - - def is_use_mla(model_config): if not is_mindformers_model_backend(): return False diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 9a6df8463..767bde8d2 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -41,119 +41,138 @@ def _get_cuda_graph_pad_size( def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - assert self.lora_manager is not None - with self.lora_manager.dummy_lora_cache(): - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, which - # needs to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data.multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. 
- num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. - - kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ - else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] - block_size = self.cache_config.block_size - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads, head_size] - kv_caches = mutable([ - mutable(( - mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), - mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), - )) - for _ in range(num_layers) - ]) - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.cuda.synchronize() - return + self._dummy_run(max_num_batched_tokens, max_num_seqs) + + +def _dummy_run(self, + max_num_batched_tokens: int, + max_num_seqs: int = 1) -> None: + with self.set_in_profile_run(): + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = \ + SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the + # total number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for multi-modal encoding, + # which needs to be accounted for when calculating the GPU blocks + # for vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. 
+ + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: + max_num_seqs_orig = max_num_seqs + max_num_seqs = min(max_num_seqs, + max_num_batched_tokens // max_mm_tokens) + if max_num_seqs < 1: + expr = (f"min({max_num_seqs_orig}, " + f"{max_num_batched_tokens} // {max_mm_tokens})") + logger.warning( + "Computed max_num_seqs (%s) to be less than 1. " + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 + + batch_size = 0 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + batch_size += seq_len + + dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) + + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data. + multi_modal_placeholders, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ + else self.cache_config.cache_dtype + kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] + block_size = self.cache_config.block_size + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + head_size = self.model_config.get_head_size() + kv_shape = [0, block_size, num_kv_heads, head_size] + kv_caches = mutable([ + mutable(( + mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), + mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), + )) + for _ in range(num_layers) + ]) + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = \ + self.model.make_empty_intermediate_tensors( + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) + + # Disable KV Scale Calculation for dummy data during profile run + if model_input.attn_metadata is not None: + model_input.attn_metadata.enable_kv_scales_calculation = False + + self.execute_model(model_input, kv_caches, intermediate_tensors) + torch.cuda.synchronize() + if self.lora_config: + # Remove dummy loras. + assert self.lora_manager is not None + self.remove_all_loras() + return MULTI_STEP_ATTENTION_BACKENDS = [ diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index b3c87dc93..f05eef3aa 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -94,99 +94,3 @@ def _warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
set_random_seed(self.model_config.seed) - - -def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - from vllm.utils import GiB_bytes, memory_profiling - - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - total_gpu_memory = int(os.environ["ASCEND_TOTAL_MEMORY_GB"]) if os.getenv("ASCEND_TOTAL_MEMORY_GB") else 64 - total_gpu_memory = total_gpu_memory * 1024 * 1024 * 1024 - - if os.getenv("vLLM_MODEL_MEMORY_USE_GB"): - memory_use_for_model_run = int(os.environ["vLLM_MODEL_MEMORY_USE_GB"]) * 1024 * 1024 * 1024 - else: - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - _, total_gpu_memory = torch.cuda.mem_get_info() - with memory_profiling( - self.baseline_snapshot, - weights_memory=self.model_runner.model_memory_usage, - ) as result: - self.model_runner.profile_run() - torch.cuda.synchronize() - - self._assert_memory_footprint_increased_during_profiling() - - memory_use_for_model_run = result.non_kv_cache_memory - - memory_for_current_instance = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - ) - available_kv_cache_memory = memory_for_current_instance - memory_use_for_model_run - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = self.get_cache_block_size_bytes() - if cache_block_size == 0: - num_gpu_blocks = 0 - num_cpu_blocks = 0 - else: - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - - if os.getenv("vLLM_MODEL_MEMORY_USE_GB"): - msg = ( - f"The current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "set model use memory " - f"{(memory_use_for_model_run):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB." - ) - else: - msg = ( - f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB." 
- ) - - logger.info(msg) - - # Final cleanup - gc.collect() - - return num_gpu_blocks, num_cpu_blocks -- Gitee From f033614913c41687f572fb70919667547a9869b5 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Wed, 19 Mar 2025 17:20:49 +0800 Subject: [PATCH 45/82] fix process not quit when clean up --- vllm_mindspore/__init__.py | 4 ++++ vllm_mindspore/engine/__init__.py | 0 vllm_mindspore/engine/multiprocessing/__init__.py | 0 vllm_mindspore/engine/multiprocessing/engine.py | 4 ++++ 4 files changed, 8 insertions(+) create mode 100644 vllm_mindspore/engine/__init__.py create mode 100644 vllm_mindspore/engine/multiprocessing/__init__.py create mode 100644 vllm_mindspore/engine/multiprocessing/engine.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 86f0a8f6c..56c92ffb9 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -204,4 +204,8 @@ vllm.config.SchedulerConfig._verify_args = _verify_args from .utils import check_ready +from vllm_mindspore.engine.multiprocessing.engine import cleanup +import vllm.engine.multiprocessing.engine +vllm.engine.multiprocessing.engine.MQLLMEngine.cleanup = cleanup + check_ready() diff --git a/vllm_mindspore/engine/__init__.py b/vllm_mindspore/engine/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm_mindspore/engine/multiprocessing/__init__.py b/vllm_mindspore/engine/multiprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm_mindspore/engine/multiprocessing/engine.py b/vllm_mindspore/engine/multiprocessing/engine.py new file mode 100644 index 000000000..c91658e38 --- /dev/null +++ b/vllm_mindspore/engine/multiprocessing/engine.py @@ -0,0 +1,4 @@ +def cleanup(self): + self.ctx.destroy(linger=0) + if model_executor := getattr(self.engine, "model_executor", None): + model_executor.shutdown() \ No newline at end of file -- Gitee From 40513dc6a514102b5f0b55a0528eee3d6c0226ad Mon Sep 17 00:00:00 2001 From: uh Date: Mon, 31 Mar 2025 00:04:03 +0800 Subject: [PATCH 46/82] =?UTF-8?q?=E3=80=90fix=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E5=9C=A8profile=E5=AF=B9=E8=B1=A1=E5=88=9D=E5=A7=8B=E5=8C=96?= =?UTF-8?q?=E5=90=8E=E5=BC=80=E5=A7=8B=E8=AE=B0=E5=BD=95profile=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=EF=BC=8C=E8=80=8C=E4=B8=8D=E6=98=AF=E5=9C=A8start=5Fp?= =?UTF-8?q?rofile=E8=B0=83=E7=94=A8=E5=90=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/worker/profile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_mindspore/worker/profile.py b/vllm_mindspore/worker/profile.py index 563637064..728362d20 100644 --- a/vllm_mindspore/worker/profile.py +++ b/vllm_mindspore/worker/profile.py @@ -19,6 +19,7 @@ class AdapterProfiler: profiler_level=ProfilerLevel.Level1, activities=[ProfilerActivity.CPU, ProfilerActivity.NPU], output_path=path, + start_profile=False ) def start(self): -- Gitee From 024019fc1f9b22f61202c43a67c20e9cdd6a59d8 Mon Sep 17 00:00:00 2001 From: dayschan Date: Fri, 14 Mar 2025 23:32:28 +0800 Subject: [PATCH 47/82] add custom op advance_step_flashattn --- setup.py | 118 ++++++++ tests/__init__.py | 0 tests/st/python/test_custom.py | 60 +++++ vllm_mindspore/attention/backends/ms_attn.py | 28 +- vllm_mindspore/ops/ascendc/CMakeLists.txt | 31 +++ vllm_mindspore/ops/ascendc/adv_step_flash.c | 253 ++++++++++++++++++ vllm_mindspore/ops/ascendc/adv_step_flash.h | 9 + .../ops/ascendc/adv_step_flash_adapter.cpp | 99 +++++++ 8 files changed, 591 insertions(+), 7 deletions(-) delete mode 100644 
tests/__init__.py create mode 100644 tests/st/python/test_custom.py create mode 100644 vllm_mindspore/ops/ascendc/CMakeLists.txt create mode 100644 vllm_mindspore/ops/ascendc/adv_step_flash.c create mode 100644 vllm_mindspore/ops/ascendc/adv_step_flash.h create mode 100644 vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp diff --git a/setup.py b/setup.py index 0a589407a..60dddeb37 100644 --- a/setup.py +++ b/setup.py @@ -21,9 +21,14 @@ import importlib.util import logging import os import sys +import shutil from typing import List from pathlib import Path from setuptools import find_packages, setup +from setuptools.command.build_ext import build_ext +from setuptools.command.install import install +from setuptools import Extension +import subprocess def load_module_from_path(module_name, path): @@ -85,6 +90,115 @@ def get_requirements() -> List[str]: version = (Path("vllm_mindspore") / "version.txt").read_text() +def _get_ascend_home_path(): + return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") + +class CustomBuildExt(build_ext): + ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) + ASCENDC_OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops", "ascendc") + + def build_extension(self, ext): + if ext.name == "ascendc_kernels_npu": + self.build_ascendc_kernels() + elif ext.name == "npu_ops": + self.build_npu_ops(ext) + else: + raise ValueError(f"Unknown extension name: {ext.name}") + + def build_ascendc_kernels(self): + kernel_so_name = "libascendc_kernels_npu.so" + print(f"Building {kernel_so_name}...") + tmp_build_dir = os.path.join(self.ASCENDC_OPS_DIR, "build") + if os.path.exists(tmp_build_dir): + print(f"Removing existing build directory: {tmp_build_dir}") + shutil.rmtree(tmp_build_dir) + os.makedirs(tmp_build_dir, exist_ok=True) + + ascend_home_path = _get_ascend_home_path() + env_script_path = os.path.join(ascend_home_path, "bin", "setenv.bash") + if not os.path.exists(env_script_path): + raise RuntimeError(f"The file '{env_script_path}' is not found, " + "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") + # Combine all cmake commands into one string + cmake_cmd = ( + f"source {env_script_path} && " + f"cmake -S {self.ASCENDC_OPS_DIR} -B {tmp_build_dir} " + f"-DRUN_MODE=npu -DCMAKE_BUILD_TYPE=Debug " + f"-DCMAKE_INSTALL_PREFIX={os.path.join(tmp_build_dir, 'install')} " + f"-DASCEND_CANN_PACKAGE_PATH={ascend_home_path} && " + f"cmake --build {tmp_build_dir} -j --verbose && " + f"cmake --install {tmp_build_dir}" + ) + + try: + # Run the combined cmake command + print("Running combined CMake commands:") + result = subprocess.run(cmake_cmd, cwd=self.ROOT_DIR, text=True, shell=True, capture_output=True) + if result.returncode != 0: + print("CMake commands failed:") + print(result.stdout) # Print standard output + print(result.stderr) # Print error output + raise RuntimeError(f"Combined CMake commands failed with exit code {result.returncode}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to build {kernel_so_name}: {e}") + + # Move the generated .so file to the target directory + src_so_path = os.path.join(tmp_build_dir, "lib", kernel_so_name) + lib_dir = os.path.join(self.ROOT_DIR, self.build_lib, "vllm_mindspore", "lib") + dst_so_path = os.path.join(lib_dir, kernel_so_name) + os.makedirs(lib_dir, exist_ok=True) + if os.path.exists(dst_so_path): + os.remove(dst_so_path) + shutil.move(src_so_path, dst_so_path) + print(f"Moved {kernel_so_name} to {lib_dir}.") + # Remove the build directory after 
building kernels.so + shutil.rmtree(tmp_build_dir) + + def build_npu_ops(self, ext): + print("Building npu_ops.so ...") + try: + import mindspore as ms + except ImportError: + print("Mindspore is not found, skip building npu_ops.so") + return + try: + src = [os.path.join(self.ASCENDC_OPS_DIR, s) for s in ext.sources] + build_lib_dir = os.path.join(self.ROOT_DIR, self.build_lib, "vllm_mindspore") + ms.ops.CustomOpBuilder( + "npu_ops", + src, + backend="Ascend", + cflags=f"-I{self.ASCENDC_OPS_DIR}", + ldflags=f"-L{os.path.join(build_lib_dir, 'lib')} -lascendc_kernels_npu -Wl,-rpath,'$$ORIGIN/lib'" + ).load() + except ImportError: + pass + # Move the generated .so file to the target directory + kernel_meta_dir = os.path.join(self.ROOT_DIR, "kernel_meta") + src_so_path = os.path.join(kernel_meta_dir, "npu_ops", "npu_ops.so") + dst_so_path = os.path.join(build_lib_dir, "npu_ops.so") + os.makedirs(build_lib_dir, exist_ok=True) + if os.path.exists(dst_so_path): + os.remove(dst_so_path) + shutil.move(src_so_path, build_lib_dir) + print(f"Moved npu_ops.so to {build_lib_dir}.") + shutil.rmtree(kernel_meta_dir) + +package_data = { + "": [ + "*.so", + "lib/*.so", + ] +} + +def _get_ext_modules(): + ext_modules = [] + ext_modules.append(Extension("ascendc_kernels_npu", sources=[])) + ext_modules.append(Extension("npu_ops", sources=[ + "adv_step_flash_adapter.cpp" + ])) + return ext_modules + setup( name="vllm-mindspore", version=version, @@ -115,4 +229,8 @@ setup( packages=find_packages(), python_requires=">=3.9", install_requires=get_requirements(), + cmdclass={"build_ext": CustomBuildExt}, + ext_modules=_get_ext_modules(), + include_package_data=True, + package_data=package_data, ) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/st/python/test_custom.py b/tests/st/python/test_custom.py new file mode 100644 index 000000000..b7e8cc3b6 --- /dev/null +++ b/tests/st/python/test_custom.py @@ -0,0 +1,60 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
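For context on the two-stage build that CustomBuildExt above implements, a hedged sketch of how it is expected to be driven (the toolkit path and setenv.bash location come from _get_ascend_home_path and build_ascendc_kernels; using `pip install .` as the entry point is an assumption):

```shell
# assumption: the CANN toolkit sits at the default expected by _get_ascend_home_path();
# setup.py sources $ASCEND_HOME_PATH/bin/setenv.bash itself before invoking CMake
export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest

# builds libascendc_kernels_npu.so with CMake, then compiles the npu_ops adapter
# through mindspore.ops.CustomOpBuilder and links it against the kernel library
pip install .
```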
+# ============================================================================ +"""test case for custom op adv_step_flash""" + +import mindspore as ms +from vllm_mindspore import npu_ops +import numpy as np +from mindspore import Tensor + +# TODO refactor this case to run on ci +def testcase(): + ms.context.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend") + in_block_tables = Tensor(np.load("data/block_tables.npy")) + in_input_positions = Tensor(np.load("data/input_positions.npy")) + in_input_tokens = Tensor(np.load("data/input_tokens.npy")) + in_sampled_token_ids = Tensor(np.load("data/sampled_token_ids.npy")) + in_seq_lens_tensor = Tensor(np.load("data/seq_lens_tensor.npy")) + in_slot_mapping = Tensor(np.load("data/slot_mapping.npy")) + num_seqs = 256 + num_queries = 256 + block_size = 32 + npu_ops.adv_step_flash(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=in_input_tokens, + sampled_token_ids=in_sampled_token_ids, + input_positions=in_input_positions, + seq_lens=in_seq_lens_tensor, + slot_mapping=in_slot_mapping, + block_tables=in_block_tables) + + out_block_tables = np.load("data/o_block_tables.npy").astype(np.int32) + out_input_positions = np.load("data/o_input_positions.npy").astype(np.int32) + out_input_tokens = np.load("data/o_input_tokens.npy").astype(np.int32) + out_sampled_token_ids = np.load("data/o_sampled_token_ids.npy").astype(np.int32) + out_seq_lens_tensor = np.load("data/o_seq_lens_tensor.npy").astype(np.int32) + out_slot_mapping = np.load("data/o_slot_mapping.npy").astype(np.int32) + assert np.allclose(in_block_tables, out_block_tables) + assert np.allclose(in_input_positions, out_input_positions) + assert np.allclose(in_input_tokens, out_input_tokens) + assert np.allclose(in_sampled_token_ids, out_sampled_token_ids) + assert np.allclose(in_seq_lens_tensor, out_seq_lens_tensor) + assert np.allclose(in_slot_mapping, out_slot_mapping) + print("passed.") + +if __name__ == "__main__": + testcase() diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 0e5075fbd..961d11dc6 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -21,6 +21,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +import os import torch @@ -308,13 +309,26 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - advance_step_op(sampled_token_ids, - model_input, - self.seq_lens_tensor, - num_queries, - block_size, - self.block_tables, - self.slot_mapping) + # default use python op + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": + from vllm_mindspore import npu_ops + npu_ops.adv_step_flash(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=model_input.input_tokens, + sampled_token_ids=sampled_token_ids, + input_positions=model_input.input_positions, + seq_lens=self.seq_lens_tensor, + slot_mapping=self.slot_mapping, + block_tables=self.block_tables) + else: + advance_step_op(sampled_token_ids, + model_input, + self.seq_lens_tensor, + num_queries, + block_size, + self.block_tables, + self.slot_mapping) def get_seq_lens( self, diff --git a/vllm_mindspore/ops/ascendc/CMakeLists.txt b/vllm_mindspore/ops/ascendc/CMakeLists.txt new file mode 100644 index 000000000..ce4a8d276 --- /dev/null 
+++ b/vllm_mindspore/ops/ascendc/CMakeLists.txt @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.16) +project(AscendC_Kernels) + +# Parameters passed from command line or default values +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend910B1" CACHE STRING "system on chip type") +set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug") + +# Set ASCEND_CANN_PACKAGE_PATH based on the ASCEND_HOME_PATH environment variable +set(ASCEND_CANN_PACKAGE_PATH "$ENV{ASCEND_HOME_PATH}" CACHE STRING "ASCEND CANN package installation directory") + +# Verify that the required paths exist +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist. Check whether the CANN package is installed in ${ASCEND_CANN_PACKAGE_PATH}") +endif() + +# Include Ascend CANN CMake file +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# Add source files +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/adv_step_flash.c) + +# Build shared library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) + +# Set the output directory +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) \ No newline at end of file diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.c b/vllm_mindspore/ops/ascendc/adv_step_flash.c new file mode 100644 index 000000000..e89976311 --- /dev/null +++ b/vllm_mindspore/ops/ascendc/adv_step_flash.c @@ -0,0 +1,253 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +template +struct integral_constant { + static constexpr Tp value = v; +}; +using true_type = integral_constant; +using false_type = integral_constant; +template +struct is_same : public false_type {}; +template +struct is_same : public true_type {}; + +template +__aicore__ inline void DataCopyCustom(const U &dstTensor, const R &srcTensor, const uint32_t count) { + DataCopyParams copyParams; + copyParams.blockLen = count * sizeof(T); + copyParams.blockCount = 1; + if constexpr (is_same>::value) { + DataCopyPadParams padParams; + DataCopyPad(dstTensor, srcTensor, copyParams, padParams); + } else { + DataCopyPad(dstTensor, srcTensor, copyParams); + } +} + +class KernelAdvStepFlash { + public: + __aicore__ inline KernelAdvStepFlash(TPipe *pipe) { Ppipe = pipe; } + + __aicore__ inline void Init(GM_ADDR sampledTokenIds, GM_ADDR blockTables, GM_ADDR seqLensInput, GM_ADDR inputTokens, + GM_ADDR inputPositions, GM_ADDR seqLensOut, GM_ADDR slotMapping, int32_t num_seqs, + int32_t block_size, int32_t block_tables_stride) { + ASSERT(GetBlockNum() != 0 && "Block dim can not be zero!"); + this->blockSize = block_size; + this->blockTablesStride = block_tables_stride; + this->tensorLength = num_seqs; + + this->blockSizeFp = static_cast(this->blockSize); + + // get start index for current core, core parallel + sampledTokenIdsGm.SetGlobalBuffer((__gm__ int32_t *)sampledTokenIds, tensorLength); + seqLensInputGm.SetGlobalBuffer((__gm__ int32_t *)seqLensInput, tensorLength); + blockTablesGm.SetGlobalBuffer((__gm__ int32_t *)blockTables); // inf size + + inputTokensGm.SetGlobalBuffer((__gm__ int32_t *)inputTokens, tensorLength); + inputPositionsGm.SetGlobalBuffer((__gm__ int32_t *)inputPositions, tensorLength); + seqLensOutGm.SetGlobalBuffer((__gm__ 
int32_t *)seqLensOut, tensorLength); + slotMappingGm.SetGlobalBuffer((__gm__ int32_t *)slotMapping, tensorLength); + + // pipe alloc memory to queue, the unit is Bytes + Ppipe->InitBuffer(sampledIdsQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(seqLenInQue, 1, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(inputTokensQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(seqLensOutQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(inputPositionsQue, 1, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(tableOffsetBuf, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(tmpDivBuf01, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(tmpDivBuf02, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(outTableBuf, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(blockTableBuf, 32); + } + + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + + private: + __aicore__ inline void CopyIn() { + LocalTensor sampledIdsLocal = sampledIdsQue.AllocTensor(); + LocalTensor seqLenInLocal = seqLenInQue.AllocTensor(); + + DataCopyCustom(sampledIdsLocal, sampledTokenIdsGm, tensorLength); + DataCopyCustom(seqLenInLocal, seqLensInputGm, tensorLength); + + sampledIdsQue.EnQue(sampledIdsLocal); + seqLenInQue.EnQue(seqLenInLocal); + } + + __aicore__ inline void Compute() { + LocalTensor tableOffset = tableOffsetBuf.Get(); + + LocalTensor sampledIdsLocal = sampledIdsQue.DeQue(); + LocalTensor seqLenInLocal = seqLenInQue.DeQue(); + + LocalTensor inputTokensLocal = inputTokensQue.AllocTensor(); + LocalTensor seqLensOutLocal = seqLensOutQue.AllocTensor(); + LocalTensor inputPositionsLocal = inputPositionsQue.AllocTensor(); + + Adds(inputTokensLocal, sampledIdsLocal, (int32_t)0, tensorLength); // inputTokensLocal <-- sampledIdsLocal + Adds(inputPositionsLocal, seqLenInLocal, (int32_t)0, tensorLength); // inputPositionsLocal <-- seqLenInLocal + Adds(seqLensOutLocal, seqLenInLocal, (int32_t)1, tensorLength); // seqLensOutLocal <-- seqLenInLocal + 1 + PipeBarrier(); + + // TODO add Function + ComputeTableOffset(tableOffset, inputPositionsLocal); + // GetTableValueByOffset(tableOffset, inputPositionsLocal); + + sampledIdsQue.FreeTensor(sampledIdsLocal); + seqLenInQue.FreeTensor(seqLenInLocal); + + inputTokensQue.EnQue(inputTokensLocal); + seqLensOutQue.EnQue(seqLensOutLocal); + inputPositionsQue.EnQue(inputPositionsLocal); + } + + __aicore__ inline void CopyOut() { + LocalTensor inputTokensLocal = inputTokensQue.DeQue(); + LocalTensor seqLensOutLocal = seqLensOutQue.DeQue(); + LocalTensor inputPositionsLocal = inputPositionsQue.DeQue(); + + DataCopyCustom(inputTokensGm, inputTokensLocal, tensorLength); + DataCopyCustom(inputPositionsGm, inputPositionsLocal, tensorLength); + DataCopyCustom(seqLensOutGm, seqLensOutLocal, tensorLength); + + inputTokensQue.FreeTensor(inputTokensLocal); + seqLensOutQue.FreeTensor(seqLensOutLocal); + inputPositionsQue.FreeTensor(inputPositionsLocal); + } + + __aicore__ inline void ComputeTableOffset(LocalTensor tableOffset, + LocalTensor inputPositionsLocal) { + LocalTensor tmpBuf01 = tmpDivBuf01.Get(); + LocalTensor tmpBuf02 = tmpDivBuf02.Get(); + + LocalTensor tmpBuf01Int = tmpBuf01.ReinterpretCast(); + LocalTensor tmpBuf02Int = tmpBuf02.ReinterpretCast(); + + LocalTensor outTableValue = outTableBuf.Get(); + LocalTensor blockTableLocal = blockTableBuf.Get(); + + // floor div + Cast(tmpBuf01, inputPositionsLocal, RoundMode::CAST_RINT, tensorLength); + Duplicate(tmpBuf02, blockSizeFp, tensorLength); + PipeBarrier(); + 
Div(tmpBuf01, tmpBuf01, tmpBuf02, tensorLength); // <-- inputPositionsLocal / blockSize + PipeBarrier(); + Cast(tmpBuf02Int, tmpBuf01, RoundMode::CAST_TRUNC, tensorLength); + + CreateVecIndex(tableOffset, (int32_t)0, tensorLength); // tableOffset <--- 0, 1, 2, 3, .... tensorLength -1 + PipeBarrier(); + + Muls(tableOffset, tableOffset, this->blockTablesStride, + tensorLength); // tableOffset <--- curt_offset * block_stride + PipeBarrier(); + Add(tableOffset, tableOffset, tmpBuf02Int, + tensorLength); // tableOffset <--- curt_offset * block_stride + inputPositionsLocal / blockSize + + PIPE_V_S(); + + for (int32_t idx = 0; idx < tensorLength; idx++) { + int32_t blockTableIdx = tableOffset.GetValue(idx); + + PIPE_S_MTE2(); + + DataCopyCustom(blockTableLocal, blockTablesGm[blockTableIdx], 1); + + PIPE_MTE2_S(); + + int32_t blockTableValue = blockTableLocal.GetValue(0); + int32_t block_offset = inputPositionsLocal.GetValue(idx) % this->blockSize; + blockTableValue = blockTableValue * this->blockSize + block_offset; + outTableValue.SetValue(idx, blockTableValue); + } + PIPE_S_MTE3(); + DataCopyCustom(slotMappingGm, outTableValue, tensorLength); + } + + __aicore__ inline void PIPE_S_MTE3() { + event_t event_S_MTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3)); + SetFlag(event_S_MTE3); + WaitFlag(event_S_MTE3); + } + + __aicore__ inline void PIPE_S_MTE2() { + event_t event_S_MTE2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE2)); + SetFlag(event_S_MTE2); + WaitFlag(event_S_MTE2); + } + + __aicore__ inline void PIPE_MTE2_S() { + event_t event_MTE2_S = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S)); + SetFlag(event_MTE2_S); + WaitFlag(event_MTE2_S); + } + + __aicore__ inline void PIPE_V_S() { + event_t event_V_S = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); + SetFlag(event_V_S); + WaitFlag(event_V_S); + } + + private: + TPipe *Ppipe = nullptr; + // create queues for input, in this case depth is equal to buffer num + TQue sampledIdsQue, seqLenInQue; + // create queues for output, in this case depth is equal to buffer num + TQue inputTokensQue, seqLensOutQue, inputPositionsQue; + + TBuf tableOffsetBuf; + TBuf tmpDivBuf01; + TBuf tmpDivBuf02; + TBuf outTableBuf; + TBuf blockTableBuf; + + // inputs + GlobalTensor sampledTokenIdsGm; + GlobalTensor seqLensInputGm; + GlobalTensor blockTablesGm; + // outs + GlobalTensor inputTokensGm; + GlobalTensor inputPositionsGm; + GlobalTensor seqLensOutGm; + GlobalTensor slotMappingGm; + + int32_t blockSize; + int32_t blockTablesStride; + int64_t tensorLength; // number of calculations rows on each core + + float blockSizeFp; +}; + +extern "C" __global__ __aicore__ void adv_step_flash_impl(GM_ADDR sampledTokenIds, GM_ADDR blockTables, + GM_ADDR seqLensInput, GM_ADDR inputTokens, + GM_ADDR inputPositions, GM_ADDR seqLensOut, + GM_ADDR slotMapping, int32_t num_seqs, int32_t block_size, + int32_t block_tables_stride) { + TPipe pipe; + + KernelAdvStepFlash op(&pipe); + op.Init(sampledTokenIds, blockTables, seqLensInput, inputTokens, inputPositions, seqLensOut, slotMapping, num_seqs, + block_size, block_tables_stride); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +void AdvStepFlashKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, uint8_t *sampledTokenIds, + uint8_t *blockTables, uint8_t *seqLensInput, uint8_t *inputTokens, uint8_t *inputPositions, + uint8_t *seqLensOut, uint8_t *slotMapping, int32_t num_seqs, int32_t block_size, + int32_t block_tables_stride) { + adv_step_flash_impl<<>>(sampledTokenIds, 
blockTables, seqLensInput, inputTokens, + inputPositions, seqLensOut, slotMapping, num_seqs, block_size, + block_tables_stride); +} +#endif diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.h b/vllm_mindspore/ops/ascendc/adv_step_flash.h new file mode 100644 index 000000000..926626b0b --- /dev/null +++ b/vllm_mindspore/ops/ascendc/adv_step_flash.h @@ -0,0 +1,9 @@ +#ifndef VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H +#define VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H + +extern void AdvStepFlashKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, uint8_t *sampledTokenIds, + uint8_t *blockTables, uint8_t *seqLensInput, uint8_t *inputTokens, + uint8_t *inputPositions, uint8_t *seqLensOut, uint8_t *slotMapping, + int32_t num_seqs, int32_t block_size, int32_t block_tables_stride); + +#endif // VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp b/vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp new file mode 100644 index 000000000..d72af3e38 --- /dev/null +++ b/vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp @@ -0,0 +1,99 @@ +#include +#include +#include + +#include "ms_extension.h" + +#include "adv_step_flash.h" + +using BaseTensor = mindspore::tensor::BaseTensor; +using BaseTensorPtr = mindspore::tensor::BaseTensorPtr; +using PyBoostUtils = mindspore::kernel::pyboost::PyBoostUtils; + +uint8_t *GetDataPtr(const BaseTensorPtr &t) { + return static_cast(t->device_address()->GetMutablePtr()) + t->data().itemsize() * t->storage_offset(); +} + +struct DtypeCaster { + BaseTensorPtr CheckAndCast(const BaseTensorPtr &t, const std::string &name = "") { + mindspore::Int64ImmPtr dst_type = std::make_shared(mindspore::TypeId::kNumberTypeInt32); + if (t->data_type() != mindspore::TypeId::kNumberTypeInt32) { + if (!name.empty()) { + tensor_map_[name] = t; + } + return mindspore::kernel::pyboost::cast(t, dst_type); + } + return t; + } + BaseTensorPtr RecoveryTensorDtype(const BaseTensorPtr &t, const std::string &name) { + auto iter = tensor_map_.find(name); + if (iter == tensor_map_.end()) { + return t; + } + auto ori_tensor = iter->second; + auto ori_dtype = std::make_shared(ori_tensor->data_type()); + auto ret = mindspore::kernel::pyboost::cast(t, ori_dtype); + ori_tensor->AssignValue(*ret); + return ori_tensor; + } + std::map tensor_map_; +}; + +void AdvStepFlashAscendC(int32_t num_seqs, int32_t num_queries, int32_t block_size, + BaseTensorPtr &input_tokens, // output + BaseTensorPtr sampled_token_ids, // input + BaseTensorPtr &input_positions, // output + BaseTensorPtr &seq_lens, // input&output (inplace) + BaseTensorPtr &slot_mapping, // output + BaseTensorPtr block_tables // input +) { + // the AdvStepFlashKernelEntry only support int32 inputs. 
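Before the dtype handling below, it may help to spell out what the kernel and this adapter compute per decode step. A minimal NumPy reference of that update (shapes are an assumption: flat int32 vectors of length num_seqs and block_tables of shape [num_seqs, max_blocks_per_seq]; the function name is illustrative):

```python
import numpy as np

def adv_step_flash_reference(num_queries, block_size,
                             input_tokens, sampled_token_ids,
                             input_positions, seq_lens, slot_mapping,
                             block_tables):
    """Sketch of the per-sequence update; assumes num_queries == num_seqs,
    as in the test case above. seq_lens holds pre-update lengths and is
    modified in place, mirroring seqLensOut <- seqLenIn + 1 in the kernel."""
    q = np.arange(num_queries)
    # next input token is the token sampled at the previous step
    input_tokens[q] = sampled_token_ids[q]
    # the new token is written at position == old sequence length
    input_positions[q] = seq_lens[q]
    seq_lens[q] = seq_lens[q] + 1
    # translate the logical position into a physical KV-cache slot
    block_idx = input_positions[q] // block_size
    block_off = input_positions[q] % block_size
    slot_mapping[q] = block_tables[q, block_idx] * block_size + block_off
    return input_tokens, input_positions, seq_lens, slot_mapping
```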
+ DtypeCaster caster; + sampled_token_ids = caster.CheckAndCast(sampled_token_ids); + block_tables = caster.CheckAndCast(block_tables); + input_tokens = caster.CheckAndCast(input_tokens, "input_tokens"); + input_positions = caster.CheckAndCast(input_positions, "input_positions"); + slot_mapping = caster.CheckAndCast(slot_mapping, "slot_mapping"); + seq_lens = caster.CheckAndCast(seq_lens, "seq_lens"); + + auto stream_id = PyBoostUtils::cur_stream_id(); + auto device_context = mindspore::runtime::OpRunner::GetDeviceContext("Ascend"); + PyBoostUtils::PrepareOpInputs(device_context, stream_id, input_tokens, sampled_token_ids, input_positions, seq_lens, + slot_mapping, block_tables); + // PyBoostUtils::PrepareOpOutputs(device_context, stream_id, outputs); + PyBoostUtils::DispatchRun(std::make_shared([=]() { + PyBoostUtils::MallocOpInputs(device_context, input_tokens, sampled_token_ids, input_positions, seq_lens, + slot_mapping, block_tables); + // PyBoostUtils::MallocOpOutputs(device_context, outputs); + + uint8_t *sampledTokenIdsPtr = GetDataPtr(sampled_token_ids); + uint8_t *blockTablesPtr = GetDataPtr(block_tables); + uint8_t *seqLensPtr = GetDataPtr(seq_lens); + uint8_t *inputTokensPtr = GetDataPtr(input_tokens); + uint8_t *inputPositionsPtr = GetDataPtr(input_positions); + uint8_t *slotMappingPtr = GetDataPtr(slot_mapping); + auto aclStream = device_context->device_res_manager_->GetStream(stream_id); + auto stride = block_tables->stride(); + int32_t block_tables_stride = stride.empty() ? 1 : stride[0]; + + mindspore::runtime::OpExecutor::DispatchLaunchTask([=]() { + uint32_t blockDims = 1; + void *l2ctrl = nullptr; + AdvStepFlashKernelEntry(blockDims, l2ctrl, aclStream, sampledTokenIdsPtr, blockTablesPtr, seqLensPtr, + inputTokensPtr, inputPositionsPtr, seqLensPtr, slotMappingPtr, num_seqs, block_size, + block_tables_stride); + }); + })); + + input_tokens = caster.RecoveryTensorDtype(input_tokens, "input_tokens"); + input_positions = caster.RecoveryTensorDtype(input_positions, "input_positions"); + slot_mapping = caster.RecoveryTensorDtype(slot_mapping, "slot_mapping"); + seq_lens = caster.RecoveryTensorDtype(seq_lens, "seq_lens"); +} + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.def("adv_step_flash", &AdvStepFlashAscendC, "adv_step_flash_ascendc", pybind11::arg("num_seqs"), + pybind11::arg("num_queries"), pybind11::arg("block_size"), pybind11::arg("input_tokens"), + pybind11::arg("sampled_token_ids"), pybind11::arg("input_positions"), pybind11::arg("seq_lens"), + pybind11::arg("slot_mapping"), pybind11::arg("block_tables")); +} -- Gitee From 63fb8a5cf354453560f518f95a3db173add8936e Mon Sep 17 00:00:00 2001 From: dayschan Date: Tue, 1 Apr 2025 16:45:33 +0800 Subject: [PATCH 48/82] do not compile custom operators by default --- setup.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 60dddeb37..647dfa531 100644 --- a/setup.py +++ b/setup.py @@ -193,10 +193,15 @@ package_data = { def _get_ext_modules(): ext_modules = [] - ext_modules.append(Extension("ascendc_kernels_npu", sources=[])) - ext_modules.append(Extension("npu_ops", sources=[ - "adv_step_flash_adapter.cpp" - ])) + # Currently, the CI environment does not support the compilation of custom operators. + # As a temporary solution, this is controlled via an environment variable. + # Once the CI environment adds support for custom operator compilation, + # this should be updated to enable compilation by default. 
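The same vLLM_USE_NPU_ADV_STEP_FLASH_OP switch gates both this build step and the run-time dispatch added to ms_attn.py in the previous patch. A short opt-in sketch (the variable name comes from these patches; the commands and script name are illustrative):

```shell
export vLLM_USE_NPU_ADV_STEP_FLASH_OP=on   # enable the AscendC advance-step path
pip install .                              # now also compiles ascendc_kernels_npu and npu_ops
python serve_or_infer.py                   # ms_attn.py dispatches to npu_ops.adv_step_flash
```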
+ if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": + ext_modules.append(Extension("ascendc_kernels_npu", sources=[])) + ext_modules.append(Extension("npu_ops", sources=[ + "adv_step_flash_adapter.cpp" + ])) return ext_modules setup( -- Gitee From 68e67537216d0e0e0b3dc76d8afa1185955cab4b Mon Sep 17 00:00:00 2001 From: lijiakun Date: Fri, 28 Mar 2025 14:45:14 +0800 Subject: [PATCH 49/82] add kvcache --- .../models/mf_models/deepseek_v3.py | 17 +++++-------- .../models/mf_models/mf_model_base.py | 24 ++++++++----------- .../model_executor/models/mf_models/qwen2.py | 1 + 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 9b2a5a384..b9726bc68 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -27,7 +27,7 @@ from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger -from mindspore import Tensor, JitConfig, Model +from mindspore import Tensor, JitConfig, Model, mutable from mindspore.common import dtype as msdtype from mindspore_gs.ptq import PTQ @@ -68,6 +68,7 @@ class DeepseekV3ForCausalLM(MfModelBase): if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config self.mf_model_config.return_hidden_states = True + setattr(self.mf_model_config, 'npu_mem_size', -1) self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config) @@ -100,19 +101,13 @@ class DeepseekV3ForCausalLM(MfModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False - def update_mf_kvcaches(self): - if self.mf_kvcaches_init: - return - + def get_kvcache(self): + key_cache = [] forward_context = get_forward_context() for i in range(self.mf_model_config.num_layers): k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] - mf_k_cache, _ = self.network.kvcache(i) - - mf_k_cache.set_device_address( - k_cache._data_ptr(), k_cache.shape, k_cache.dtype - ) - self.mf_kvcaches_init = True + key_cache.append(k_cache) + return mutable(key_cache), None def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: if self.mf_config.load_ckpt_format == "ckpt": diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 56f394c48..2290ae0c6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -33,7 +33,7 @@ from vllm.logger import init_logger import torch import mindspore as ms -from mindspore import Tensor +from mindspore import Tensor, mutable from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_context @@ -82,22 +82,16 @@ class MfModelBase(MsModelBase): self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - def update_mf_kvcaches(self): - if self.mf_kvcaches_init: - return - + def get_kvcache(self): + key_cache = [] + value_cache = [] forward_context = get_forward_context() for i in range(self.mf_model_config.num_layers): k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] - mf_k_cache, mf_v_cache = 
self.network.kvcache(i) - mf_k_cache.set_device_address( - k_cache._data_ptr(), k_cache.shape, k_cache.dtype - ) - mf_v_cache.set_device_address( - v_cache._data_ptr(), v_cache.shape, v_cache.dtype - ) - self.mf_kvcaches_init = True + key_cache.append(k_cache) + value_cache.append(v_cache) + return mutable(key_cache), mutable(value_cache) def forward( @@ -109,7 +103,7 @@ class MfModelBase(MsModelBase): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches() + key_cache, value_cache = self.get_kvcache() seq_lens = attn_metadata.seq_lens max_query_len = attn_metadata.max_query_len @@ -141,6 +135,8 @@ class MfModelBase(MsModelBase): model_inputs["position_ids"] = position_ids model_inputs["q_seq_lens"] = q_seq_lens model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache if is_prefill: self.network.phase = "prefill" diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 78247b8e1..f73be661e 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -55,6 +55,7 @@ class Qwen2ForCausalLM(MfModelBase): # qwen qkv concat will support in next version self.mf_model_config.qkv_concat = False + setattr(self.mf_model_config, 'npu_mem_size', -1) self.mf_config.model.model_config.qkv_concat = False # Initial network self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) -- Gitee From 7e0aea1b089fb2d98f5487d2b87b28a6aac0747a Mon Sep 17 00:00:00 2001 From: huandong Date: Tue, 1 Apr 2025 23:55:40 +0800 Subject: [PATCH 50/82] del profile_run and del prefill warmup --- vllm_mindspore/__init__.py | 2 -- vllm_mindspore/worker/model_runner.py | 7 ------- vllm_mindspore/worker/worker.py | 9 --------- 3 files changed, 18 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index de11500c1..e6f25e44b 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -141,7 +141,6 @@ Worker.init_device = wrapper_worker_init_device(Worker.init_device) from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, - profile_run, _dummy_run, _get_supported_attention_backends ) @@ -149,7 +148,6 @@ from vllm_mindspore.worker.model_runner import ( vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size ) -vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run import vllm.worker.multi_step_model_runner diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 767bde8d2..561fd2021 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -40,13 +40,6 @@ def _get_cuda_graph_pad_size( return -1 -def profile_run(self) -> None: - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - self._dummy_run(max_num_batched_tokens, max_num_seqs) - - def _dummy_run(self, max_num_batched_tokens: int, max_num_seqs: int = 1) -> None: diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index f05eef3aa..3ef4717b0 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -73,15 +73,6 @@ def _warm_up_model(self) -> None: 
# cache_engine is a list with length equal to the size of pipeline-parallel, and only pp=1 is supported. kv_cache = self.cache_engine[0].gpu_cache - # warmup for prefill - if self.vllm_config.scheduler_config.is_multi_step: - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], True) - self.model_runner._base_model_runner.execute_model(model_input, kv_cache, None) - else: - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], True) - self.model_runner.execute_model(model_input, kv_cache, None) - torch.cuda.synchronize() - # warmup for decode if self.vllm_config.scheduler_config.is_multi_step: model_input = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], False) -- Gitee From 723919592c6e7f132ca3f6f0a7f356bc1d4f54ad Mon Sep 17 00:00:00 2001 From: tronzhang Date: Wed, 2 Apr 2025 10:52:37 +0800 Subject: [PATCH 51/82] delete redundancy code and env --- vllm_mindspore/__init__.py | 23 ++---- vllm_mindspore/distributed/parallel_state.py | 22 ----- vllm_mindspore/utils.py | 10 --- vllm_mindspore/worker/cache_engine.py | 86 +------------------- 4 files changed, 12 insertions(+), 129 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index de11500c1..dca390326 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -79,6 +79,7 @@ vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture + # To patching the get_model_architecture, should import it first. from vllm.model_executor.model_loader import get_model_architecture @@ -106,17 +107,13 @@ from vllm_mindspore.worker.cache_engine import ( ms_allocate_kv_cache, ms_swap_in, ms_swap_out, - cache_engine_init, - get_cache_block_size, ) import vllm.worker.cache_engine vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache -vllm.worker.cache_engine.CacheEngine.__init__ = cache_engine_init vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out -vllm.worker.cache_engine.CacheEngine.get_cache_block_size = get_cache_block_size from vllm_mindspore.model_executor.model_loader.weight_utils import ( safetensors_weights_iterator, @@ -126,12 +123,10 @@ vllm.model_executor.model_loader.loader.safetensors_weights_iterator = ( safetensors_weights_iterator ) -from vllm_mindspore.worker.worker import ( - _warm_up_model -) +from vllm_mindspore.worker.worker import _warm_up_model from vllm_mindspore.worker.profile import ( wrapper_worker_init, - wrapper_worker_init_device + wrapper_worker_init_device, ) from vllm.worker.worker import Worker @@ -143,7 +138,7 @@ from vllm_mindspore.worker.model_runner import ( _get_cuda_graph_pad_size, profile_run, _dummy_run, - _get_supported_attention_backends + _get_supported_attention_backends, ) vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( @@ -153,23 +148,23 @@ vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run import vllm.worker.multi_step_model_runner -vllm.worker.multi_step_model_runner._get_supported_attention_backends = _get_supported_attention_backends + +vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( + 
_get_supported_attention_backends +) from vllm_mindspore.distributed.parallel_state import ( - all_reduce_for_GroupCoordinator, init_model_parallel_group, init_group_coordinator, ) -vllm.distributed.parallel_state.GroupCoordinator.all_reduce = ( - all_reduce_for_GroupCoordinator -) vllm.distributed.parallel_state.init_model_parallel_group = init_model_parallel_group vllm.distributed.parallel_state.GroupCoordinator.__init__ = init_group_coordinator from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, ) + # To patching the get_mp_context, should import it first. from vllm.executor.multiproc_worker_utils import get_mp_context diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py index d599660cd..42b10d699 100644 --- a/vllm_mindspore/distributed/parallel_state.py +++ b/vllm_mindspore/distributed/parallel_state.py @@ -45,28 +45,6 @@ def init_model_parallel_group( ) -def all_reduce_for_GroupCoordinator(self, input_: torch.Tensor) -> torch.Tensor: - """ - User-facing all-reduce function before we actually call the - all-reduce operation. - - We need this because Dynamo does not support passing an arbitrary - object (`self` in this case) to a custom op. We need to pass the - group name as a string, and then look up the group coordinator from - the group name, dispatch the all-reduce operation to the group - coordinator. - - In addition, PyTorch custom ops do not support mutation or returning - a new tensor in the same op. So we always make the all-reduce operation - out-of-place. - """ - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return input_ - - torch.distributed.all_reduce(input_, group=self.device_group) - return input_ - def init_group_coordinator( self, group_ranks: List[List[int]], diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index ac44272f1..0273fb873 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -235,21 +235,11 @@ def check_ready(): mindformers_default_env = { "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", - "MS_ALLOC_CONF": "enable_vmm:False", } env_setup(mindformers_default_env) else: - env_setup({"MS_ALLOC_CONF": "enable_vmm:True", }) logger.info("Run with native model backend!") -def is_use_mla(model_config): - if not is_mindformers_model_backend(): - return False - - return hasattr(model_config.hf_text_config, "model_type") and ( - model_config.hf_text_config.model_type in ("deepseek_v3",) - ) - def convert_np_to_ms_dtype(value): """convert_np_to_ms_dtype""" diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 9c16ffde4..dfd0ef10e 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -19,14 +19,11 @@ from typing import List -from vllm.logger import init_logger -from vllm import envs -from vllm.platforms import current_platform - -from vllm_mindspore.utils import MsKVCache, get_valid_dtype, is_use_mla, get_dtype_size - import mindspore as ms from mindspore import mutable +from vllm.logger import init_logger +from vllm_mindspore.utils import MsKVCache, get_valid_dtype + logger = init_logger(__name__) @@ -75,80 +72,3 @@ def ms_swap_out(self, src_to_dst: ms.Tensor) -> None: self.attn_backend.swap_blocks( self.gpu_cache[i], self.cpu_cache[i], src_to_dst, True ) - - -def cache_engine_init( - self, - cache_config, - model_config, - parallel_config, - device_config, -) -> None: - - from vllm.utils 
import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType - from vllm.attention import get_attn_backend - - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - self.device_config = device_config - - self.head_size = model_config.get_head_size() - # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention - ) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - self.num_gpu_blocks = cache_config.num_gpu_blocks - if self.num_gpu_blocks: - self.num_gpu_blocks //= parallel_config.pipeline_parallel_size - self.num_cpu_blocks = cache_config.num_cpu_blocks - if self.num_cpu_blocks: - self.num_cpu_blocks //= parallel_config.pipeline_parallel_size - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # Get attention backend. - self.attn_backend = get_attn_backend( - self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=is_use_mla(model_config), - ) - - # Initialize the cache. - self.gpu_cache = self._allocate_kv_cache( - self.num_gpu_blocks, self.device_config.device_type - ) - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") - - -def get_cache_block_size( - cache_config: "CacheConfig", - model_config: "ModelConfig", - parallel_config: "ParallelConfig", -) -> int: - from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType - - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention - ) - - key_cache_block = cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block if not is_use_mla(model_config) else 0 - total = num_attention_layers * (key_cache_block + value_cache_block) - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - dtype_size = get_dtype_size(dtype) - return dtype_size * total -- Gitee From 5ba7646a767d7e554af2f49c682eb3a8c53640b7 Mon Sep 17 00:00:00 2001 From: twc Date: Wed, 2 Apr 2025 12:18:38 +0800 Subject: [PATCH 52/82] deepseek and qwen weight load optimizer --- vllm_mindspore/config.py | 2 +- .../models/mf_models/deepseek_v3.py | 41 +- ...lism.py => deepseekv3_weight_processor.py} | 630 +++++++++--------- .../model_executor/models/mf_models/qwen2.py | 10 +- ...rallelism.py => qwen2_weight_processor.py} | 168 ++--- ...del_parallelism.py => weight_processor.py} | 86 ++- 6 files changed, 478 insertions(+), 459 deletions(-) rename vllm_mindspore/model_executor/models/mf_models/{deepseekv3_infer_parallelism.py => deepseekv3_weight_processor.py} (68%) rename vllm_mindspore/model_executor/models/mf_models/{qwen2_infer_parallelism.py => qwen2_weight_processor.py} (62%) rename vllm_mindspore/model_executor/models/mf_models/{model_parallelism.py => weight_processor.py} (44%) diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 5fd633977..0f20ca17d 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -102,7 +102,7 @@ def vllm_config_post_init(self): current_platform.check_and_update_config(self) if self.model_config and self.model_config.use_mla: - logger.info("For 
MindSpore, MLA supports chunked prefill and prefix, " + logger.info("For MindSpore, MLA supports chunked prefill and prefix cache, " "so keep them enable.") if not self.instance_id: diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index f29d176ce..49f9fc91c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -23,15 +23,17 @@ from collections import OrderedDict import numpy as np from vllm.config import VllmConfig -from vllm.config import get_current_vllm_config +from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger from mindspore import Tensor, JitConfig, Model, mutable from mindspore.common import dtype as msdtype +from mindspore.nn.utils import no_init_parameters from mindspore_gs.ptq import PTQ -from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, GPTQQuantConfig +from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, \ + GPTQQuantConfig from mindspore_gs.common import BackendTarget from mindformers.trainer.utils import transform_and_load_checkpoint @@ -46,10 +48,9 @@ from research.deepseek3.deepseek3 import ( from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_MLA -from vllm_mindspore.model_executor.models.mf_models.deepseekv3_infer_parallelism import DeepseekInferParallelism +from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask - logger = init_logger(__name__) @@ -70,10 +71,12 @@ class DeepseekV3ForCausalLM(MfModelBase): self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config) # Initital network - self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) + with no_init_parameters(): # Delay initialization + self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, + "quant_method"): ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) if ptq is not None: ptq.apply(self.network) @@ -117,8 +120,8 @@ class DeepseekV3ForCausalLM(MfModelBase): self.mf_config, model, self.network, infer_data, do_predict=True ) else: - model_parallelism = DeepseekInferParallelism(self.mf_config, self.network, self.is_quant) - model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) + weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() return None @@ -139,11 +142,11 @@ class DeepseekV3ForCausalLM(MfModelBase): act_quant_granularity=QuantGranularity.PER_TENSOR, weight_quant_granularity=QuantGranularity.PER_CHANNEL) ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - 
outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) elif quant_type.lower() == 'awq-a16w4': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, @@ -173,11 +176,11 @@ class DeepseekV3ForCausalLM(MfModelBase): act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, opname_blacklist=['lm_head', 'lkv2kv']) w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) elif quant_type.lower() == 'a16w8': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, @@ -199,4 +202,4 @@ class DeepseekV3ForCausalLM(MfModelBase): # pylint: disable=protected-access ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden"] ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) - return ptq \ No newline at end of file + return ptq diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py similarity index 68% rename from vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py rename to vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index a9e316239..441345133 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -17,7 +17,6 @@ transform huggingface model to mindspore safetensor. """ import os -import time import json import gc import numpy as np @@ -25,13 +24,14 @@ from tqdm import tqdm import mindspore as ms from mindspore import dtype -from vllm_mindspore.model_executor.models.mf_models.model_parallelism import BaseModelParallelism +from mindspore.communication.management import get_rank +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor from vllm_mindspore.utils import convert_np_to_ms_dtype -class DeepseekInferParallelism(BaseModelParallelism): +class DeepseekV3WeightProcessor(BaseWeightProcessor): r""" - Provide DeepseekV3/R1 Quant Model infer parameter convert and parallelism. + Provide DeepseekV3/R1 Model weight load and shards. Args: config (DeepseekV3/R1Config): The config of DeepseekV3/R1 model. network (InferenceDeepseekV3ForCausalLM): The network of DeepseekV3/R1. 
@@ -115,21 +115,21 @@ class DeepseekInferParallelism(BaseModelParallelism): ffn_concat = self.config.model.model_config.ffn_concat num_router_experts = self.config.moe_config.expert_num - parameter_dict = {} # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), - name=router_dense_ms_name, requires_grad=False) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) # e_score_correction_bias e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, - hf_weight_map) - parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.Tensor(e_score_correction_bias_ms_param, ms.float32), + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), name=e_score_correction_bias_ms_name, requires_grad=False) w1_list = [] @@ -151,15 +151,15 @@ class DeepseekInferParallelism(BaseModelParallelism): for index in range(0, num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -167,15 +167,14 @@ class DeepseekInferParallelism(BaseModelParallelism): w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - # is_split_param=True, split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -184,9 +183,9 @@ class DeepseekInferParallelism(BaseModelParallelism): w2_scale_list.append(w2_scale_ms_param) w3_scale_list.append(w3_scale_ms_param) - w1_ms_stack_param = np.stack(w1_list, axis=0).transpose(0, 2, 1) - 
w2_ms_stack_param = np.stack(w2_list, axis=0).transpose(0, 2, 1) - w3_ms_stack_param = np.stack(w3_list, axis=0).transpose(0, 2, 1) + w1_ms_stack_param = np.stack(w1_list, axis=0) + w2_ms_stack_param = np.stack(w2_list, axis=0) + w3_ms_stack_param = np.stack(w3_list, axis=0) w1_scale_ms_stack_param = np.stack(w1_scale_list, axis=0) w2_scale_ms_stack_param = np.stack(w2_scale_list, axis=0) @@ -195,48 +194,54 @@ class DeepseekInferParallelism(BaseModelParallelism): if ffn_concat: # w_gate_hidden w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2), - dtype=ms.int8) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=1) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) # w_scale_gate_hidden - w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.matmul.weight_scale" - w_scale_gate_hidden_param = ms.Tensor( - np.concatenate([w1_scale_ms_stack_param, w3_scale_ms_stack_param], axis=1), dtype=ms.bfloat16) - parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, - name=w_scale_gate_hidden_name, - requires_grad=False) + w_scale_gate_hidden_name = \ + f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.matmul.weight_scale" + + w_scale_gate_hidden_np = np.concatenate([w1_scale_ms_stack_param, w3_scale_ms_stack_param], axis=1) + w_scale_gate_hidden_param = ms.from_numpy(w_scale_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) else: # w1 w3 - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_stack_param, ms.int8), name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_stack_param, ms.int8), name=w3_ms_name, - requires_grad=False) + self.parameter_dict[w1_ms_name] = ms.Parameter( + ms.from_numpy(w1_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter( + ms.from_numpy(w3_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) # w1_scale w3_scale - parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_stack_param, ms.bfloat16), - name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_stack_param, ms.bfloat16), - name=w3_ms_name, - requires_grad=False) - - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_stack_param, ms.int8), name=w2_ms_name, - requires_grad=False) - - parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_stack_param, ms.bfloat16), - name=w2_scale_ms_name, - requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_stack_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + 
ms.from_numpy(w3_scale_ms_stack_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter( + ms.from_numpy(w2_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) + + self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_stack_param).astype(ms.bfloat16), + name=w2_scale_ms_name, + requires_grad=False) def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat - parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, @@ -272,84 +277,80 @@ class DeepseekInferParallelism(BaseModelParallelism): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.int8) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) - - w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.matmul.weight_scale" - w_scale_gate_hidden_param = ms.Tensor( - np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0), - dtype=ms.bfloat16) - - parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, - name=w_scale_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + + w_scale_gate_hidden_name = \ + f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_np = np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0) + w_scale_gate_hidden_param = ms.from_numpy(w_scale_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.int8), - name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.int8), - name=w3_ms_name, - requires_grad=False) - - parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_param, ms.bfloat16), - name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_param, ms.bfloat16), - name=w3_ms_name, - requires_grad=False) - - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.int8), - name=w2_ms_name, - requires_grad=False) + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + 
ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) - parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_param, ms.bfloat16), - name=w2_ms_name, - requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) def infer_quant_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat - parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + is_split_param=True, + split_axis=1) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) # shape:[7168,1] w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - # is_split_param=True, - # split_axis=0) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -357,75 +358,70 @@ class DeepseekInferParallelism(BaseModelParallelism): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), - dtype=ms.int8) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(dtype=ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.matmul.weight_scale" - w_scale_gate_hidden_param = 
ms.Tensor( - np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0), - dtype=ms.bfloat16) - parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, - name=w_scale_gate_hidden_name, - requires_grad=False) + w_scale_gate_hidden_param = ms.from_numpy( + np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0)).astype(dtype=ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.int8), - name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.int8), - name=w3_ms_name, - requires_grad=False) - - parameter_dict[w1_scale_ms_name] = ms.Parameter(ms.Tensor(w1_scale_ms_param, ms.bfloat16), - name=w1_scale_ms_name, - requires_grad=False) - parameter_dict[w3_scale_ms_name] = ms.Parameter(ms.Tensor(w3_scale_ms_param, ms.bfloat16), - name=w3_scale_ms_name, - requires_grad=False) - - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.int8), - name=w2_ms_name, - requires_grad=False) + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + name=w1_scale_ms_name, + requires_grad=False) + self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + name=w3_scale_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) - parameter_dict[w2_scale_ms_name] = ms.Parameter(ms.Tensor(w2_scale_ms_param, ms.bfloat16), - name=w2_ms_name, - requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): """convert weight not in model""" - parameter_dict = {} embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.quant_convert_weight_name(embed_tokens_hf_name) np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), - name=embed_tokens_ms_name, - requires_grad=False) + self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.quant_convert_weight_name(norm_hf_name) np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, - requires_grad=False) + self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=norm_ms_name, + requires_grad=False) lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) if not self.config.parallel_config.vocab_emb_dp: np_data, _ = 
self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, - requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=lm_head_ms_name, + requires_grad=False) def quant_special_attention_weight(self, layer_id, src_hf_dir, hf_weight_map, name, is_trans_rope_weigh=False, is_split_param=False): @@ -435,19 +431,19 @@ class DeepseekInferParallelism(BaseModelParallelism): # o_proj->wo # input_scale, input_zp no split - parameter_dict = {} input_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_scale" input_scale_ms_name = self.quant_convert_weight_name(input_scale_hf_name) input_scale_ms_param, _ = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[input_scale_ms_name] = ms.Parameter(ms.Tensor(input_scale_ms_param, ms.bfloat16), - name=input_scale_ms_name, requires_grad=False) + self.parameter_dict[input_scale_ms_name] = ms.Parameter( + ms.from_numpy(input_scale_ms_param).astype(ms.bfloat16), + name=input_scale_ms_name, requires_grad=False) input_zp_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_offset" input_zp_ms_name = self.quant_convert_weight_name(input_zp_hf_name) input_zp_ms_param, _ = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[input_zp_ms_name] = ms.Parameter(ms.Tensor(input_zp_ms_param, ms.int8), - name=input_zp_ms_name, - requires_grad=False) + self.parameter_dict[input_zp_ms_name] = ms.Parameter(ms.from_numpy(input_zp_ms_param).astype(ms.int8), + name=input_zp_ms_name, + requires_grad=False) if not is_trans_rope_weigh: quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".quant_bias" @@ -495,18 +491,16 @@ class DeepseekInferParallelism(BaseModelParallelism): quant_bias_ms_param = self.split_weight_by_rank(quant_bias_ms_param, split_axis=0) dequant_scale_ms_param = self.split_weight_by_rank(dequant_scale_ms_param, split_axis=0) - parameter_dict[quant_bias_ms_name] = ms.Parameter(ms.Tensor(quant_bias_ms_param, ms.int32), - name=quant_bias_ms_name, requires_grad=False) - parameter_dict[dequant_scale_ms_name] = ms.Parameter(ms.Tensor(dequant_scale_ms_param, ms.float32), - name=dequant_scale_ms_name, requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[quant_bias_ms_name] = ms.Parameter( + ms.from_numpy(quant_bias_ms_param).astype(ms.int32), + name=quant_bias_ms_name, requires_grad=False) + self.parameter_dict[dequant_scale_ms_name] = ms.Parameter( + ms.from_numpy(dequant_scale_ms_param).astype(ms.float32), + name=dequant_scale_ms_name, + requires_grad=False) def infer_quant_bias_weight(self, src_hf_dir, layer_id, hf_weight_map): # quant_op.beta - parameter_dict = {} q2l_proj_bias_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" q2l_proj_bias_ms_name = self.quant_convert_weight_name(q2l_proj_bias_hf_name) q2l_proj_bias_ms_param, _ = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) @@ -518,23 +512,21 @@ class DeepseekInferParallelism(BaseModelParallelism): l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_bias_ms_param, ms.bfloat16), - name=q2l_proj_bias_ms_name, requires_grad=False) - parameter_dict[kv2l_bias_ms_name] = ms.Parameter(ms.Tensor(kv2l_bias_ms_param, ms.bfloat16), - name=kv2l_bias_ms_name, - requires_grad=False) - parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_bias_ms_param, ms.bfloat16), - name=l2q_proj_bias_ms_name, - requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_bias_ms_param).astype(ms.bfloat16), + name=q2l_proj_bias_ms_name, + requires_grad=False) + self.parameter_dict[kv2l_bias_ms_name] = ms.Parameter( + ms.from_numpy(kv2l_bias_ms_param).astype(ms.bfloat16), + name=kv2l_bias_ms_name, + requires_grad=False) + self.parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_bias_ms_param).astype(ms.bfloat16), + name=l2q_proj_bias_ms_name, + requires_grad=False) def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process attention weight""" - start_time = time.time() - parameter_dict = {} num_heads = self.config.model.model_config.num_heads kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim @@ -548,9 +540,10 @@ class DeepseekInferParallelism(BaseModelParallelism): q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q2l_proj_ms_param, ms.int8), - name=q2l_proj_ms_name, - requires_grad=False) 
+ self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_ms_param).astype(ms.int8), + name=q2l_proj_ms_name, + requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") # kv_a_proj_with_mqa->kv2l @@ -559,8 +552,9 @@ class DeepseekInferParallelism(BaseModelParallelism): kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) - parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.int8), name=kv2l_ms_name, - requires_grad=False) + self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.int8), + name=kv2l_ms_name, + requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", is_trans_rope_weigh=True) @@ -568,9 +562,9 @@ class DeepseekInferParallelism(BaseModelParallelism): lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), - name=lq_norm_ms_name, - requires_grad=False) + self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), + name=lq_norm_ms_name, + requires_grad=False) # q_b_proj->l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" @@ -580,9 +574,10 @@ class DeepseekInferParallelism(BaseModelParallelism): l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) - parameter_dict[l2q_proj_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_ms_param, ms.int8), - name=l2q_proj_ms_name, - requires_grad=False) + self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_ms_param).astype(ms.int8), + name=l2q_proj_ms_name, + requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_b_proj", is_trans_rope_weigh=True, is_split_param=True) @@ -590,9 +585,10 @@ class DeepseekInferParallelism(BaseModelParallelism): lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.quant_convert_weight_name(lkv_norm_hf_name) lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), - name=lkv_norm_ms_name, - requires_grad=False) + self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( + ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) # kv_b_proj->lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" @@ -606,32 +602,29 @@ class DeepseekInferParallelism(BaseModelParallelism): value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") - parameter_dict[name_k_nope] = ms.Parameter(ms.Tensor(value_k_nope, ms.bfloat16), name=name_k_nope, - requires_grad=False) + self.parameter_dict[name_k_nope] = 
ms.Parameter(ms.from_numpy(value_k_nope).astype(ms.bfloat16), + name=name_k_nope, + requires_grad=False) # value_v value_v = value_v.reshape(-1, value_v.shape[-1]) value_v = self.split_weight_by_rank(value_v, split_axis=0) name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") - parameter_dict[name_v] = ms.Parameter(ms.Tensor(value_v, ms.bfloat16), name=name_v, - requires_grad=False) + self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(ms.bfloat16), + name=name_v, + requires_grad=False) # o_proj->wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.quant_convert_weight_name(wo_hf_name) wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) - parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.int8), name=wo_ms_name, - requires_grad=False) + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.int8), + name=wo_ms_name, + requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() - def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant net convert layer weight""" - print(f"..... start convert layer {layer_id} .......", flush=True) if layer_id >= 3: self.infer_quant_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) @@ -643,8 +636,6 @@ class DeepseekInferParallelism(BaseModelParallelism): self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) - print(f"..... 
end convert layer {layer_id} .......", flush=True) - def convert_weight_name(self, weight_name: str): """replace weight name""" weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') @@ -674,22 +665,22 @@ class DeepseekInferParallelism(BaseModelParallelism): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat num_router_experts = self.config.moe_config.expert_num - parameter_dict = {} # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" router_dense_ms_name = self.convert_weight_name(router_dense_hf_name) router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[router_dense_ms_name] = ms.Parameter(ms.Tensor(router_dense_ms_param, ms.bfloat16), - name=router_dense_ms_name, requires_grad=False) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) # e_score_correction_bias e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" e_score_correction_bias_ms_name = self.convert_weight_name(e_score_correction_bias_hf_name) e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, - hf_weight_map) - parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.Tensor(e_score_correction_bias_ms_param, ms.float32), + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), name=e_score_correction_bias_ms_name, requires_grad=False) w1_list = [] @@ -702,47 +693,50 @@ class DeepseekInferParallelism(BaseModelParallelism): for index in range(0, num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) w3_list.append(w3_ms_param) - w1_ms_stack_param = np.stack(w1_list, axis=0).transpose(0, 2, 1) - w2_ms_stack_param = np.stack(w2_list, axis=0).transpose(0, 2, 1) - w3_ms_stack_param = np.stack(w3_list, axis=0).transpose(0, 2, 1) + w1_ms_stack_param = np.stack(w1_list, axis=0) + w2_ms_stack_param = np.stack(w2_list, axis=0) + w3_ms_stack_param = np.stack(w3_list, axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2), - dtype=ms.bfloat16) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(dtype=ms.bfloat16) + 
self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_stack_param, ms.bfloat16), name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_stack_param, ms.bfloat16), name=w3_ms_name, - requires_grad=False) - - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_stack_param, ms.bfloat16), name=w2_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + w1_ms_stack_param = ms.from_numpy(w1_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w1_ms_name] = ms.Parameter(w1_ms_stack_param, + name=w1_ms_name, + requires_grad=False) + + w3_ms_stack_param = ms.from_numpy(w3_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w3_ms_name] = ms.Parameter(w3_ms_stack_param, + name=w3_ms_name, + requires_grad=False) + + w2_ms_stack_param = ms.from_numpy(w2_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w2_ms_name] = ms.Parameter(w2_ms_stack_param, + name=w2_ms_name, + requires_grad=False) def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat - parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) @@ -757,59 +751,60 @@ class DeepseekInferParallelism(BaseModelParallelism): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.bfloat16) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, - requires_grad=False) - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat - parameter_dict = {} w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = 
self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" - w_gate_hidden_param = ms.Tensor(np.concatenate([w1_ms_param, w3_ms_param], axis=0), dtype=ms.bfloat16) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, - requires_grad=False) - - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" @@ -822,14 +817,14 @@ class DeepseekInferParallelism(BaseModelParallelism): rope_dim = qk_rope_head_dim + qk_nope_head_dim kv_head_dim = kv_lora_rank + qk_rope_head_dim - parameter_dict = {} # q2l_proj q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) q_a_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.Tensor(q_a_proj_ms_param, ms.bfloat16), - name=q2l_proj_ms_name, - requires_grad=False) + self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( + ms.from_numpy(q_a_proj_ms_param).astype(ms.bfloat16), + name=q2l_proj_ms_name, + requires_grad=False) # kv2l kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" @@ -837,15 +832,17 @@ class DeepseekInferParallelism(BaseModelParallelism): kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) - parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.bfloat16), name=kv2l_ms_name, - requires_grad=False) + 
self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.bfloat16), + name=kv2l_ms_name, + requires_grad=False) # lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.Tensor(lq_norm_ms_param, ms.bfloat16), name=lq_norm_ms_name, - requires_grad=False) + self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), + name=lq_norm_ms_name, + requires_grad=False) # l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" @@ -855,17 +852,19 @@ class DeepseekInferParallelism(BaseModelParallelism): l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) - parameter_dict[l2q_proj_ms_name] = ms.Parameter(ms.Tensor(l2q_proj_ms_param, ms.bfloat16), - name=l2q_proj_ms_name, - requires_grad=False) + self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_ms_param).astype(ms.bfloat16), + name=l2q_proj_ms_name, + requires_grad=False) # lkv_norm lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.convert_weight_name(lkv_norm_hf_name) lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lkv_norm_ms_name] = ms.Parameter(ms.Tensor(lkv_norm_ms_param, ms.bfloat16), - name=lkv_norm_ms_name, - requires_grad=False) + self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( + ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) # lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" @@ -879,57 +878,50 @@ class DeepseekInferParallelism(BaseModelParallelism): value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") - parameter_dict[name_k_nope] = ms.Parameter(ms.Tensor(value_k_nope, ms.bfloat16), name=name_k_nope, - requires_grad=False) + self.parameter_dict[name_k_nope] = ms.Parameter(ms.from_numpy(value_k_nope).astype(ms.bfloat16), + name=name_k_nope, + requires_grad=False) # value_v value_v = value_v.reshape(-1, value_v.shape[-1]) value_v = self.split_weight_by_rank(value_v, split_axis=0) name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") - parameter_dict[name_v] = ms.Parameter(ms.Tensor(value_v, ms.bfloat16), name=name_v, - requires_grad=False) + self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(ms.bfloat16), + name=name_v, + requires_grad=False) # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) - parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[wo_ms_name] = 
ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), + name=wo_ms_name, + requires_grad=False) def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" - parameter_dict = {} # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, - src_hf_dir, - hf_weight_map) - parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), - name=attention_norm_ms_name, - requires_grad=False) + src_hf_dir, + hf_weight_map) + self.parameter_dict[attention_norm_ms_name] = ms.Parameter( + ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[ffn_norm_ms_name] = ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), - name=ffn_norm_ms_name, - requires_grad=False) - - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - - del parameter_dict - gc.collect() + self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( + ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" - print(f"..... start convert layer {layer_id} .......", flush=True) - if layer_id >= 3: self.infer_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) @@ -939,19 +931,16 @@ class DeepseekInferParallelism(BaseModelParallelism): self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) - print(f"..... 
end convert layer {layer_id} .......", flush=True) - def infer_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_quant_net_ms_convert_layer_weight""" - parameter_dict = {} no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", "kv2l", "routed_experts.router.dense", "routed_experts.router.e_score_correction_bias", "shared_experts.w_gate_hidden", "shared_experts.w2", "topk_bias"] - - for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): + + for param_name, _ in hf_weight_map.items(): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue @@ -1002,15 +991,15 @@ class DeepseekInferParallelism(BaseModelParallelism): dst_dtype = convert_np_to_ms_dtype(value) if is_int4: - parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), - name=param_name, requires_grad=False) + self.parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), + name=param_name, requires_grad=False) else: - parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), - name=param_name, requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + self.parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) - def infer_convert_and_parallelism(self, src_hf_dir): - """convert inference model weight """ + def load_safetensors_shard(self, src_hf_dir): + """deepseek load safetensors and shard """ + rank_id = get_rank() param_json_path = "" for file in os.listdir(src_hf_dir): @@ -1027,19 +1016,24 @@ class DeepseekInferParallelism(BaseModelParallelism): if not param_json_path: raise ValueError(f"Not found param_json_path in {src_hf_dir}") - print("param_json_path is {}".format(param_json_path)) quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None if not quant_method or quant_method != "gptq-pergroup": self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) - + num_layers = self.config.model.model_config.num_layers if quant_method and quant_method == "gptq-pergroup": self.infer_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) return - for layer_id in range(num_layers): + + enable_tqdm = rank_id == 0 + for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): if self.is_quant: self.infer_quant_net_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) else: self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + + ms.load_param_into_net(self.network, self.parameter_dict) + del self.parameter_dict + gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 4680b5cdb..58df668e0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -23,6 +23,7 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from mindspore import Tensor, JitConfig +from mindspore.nn.utils import no_init_parameters from mindformers.models.llama import LlamaConfig as LlamaConfig_MF from research.qwen2_5.infer.qwen2_5 import ( @@ -31,7 +32,7 @@ from research.qwen2_5.infer.qwen2_5 import ( from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, 
Fake_Attention -from vllm_mindspore.model_executor.models.mf_models.qwen2_infer_parallelism import Qwen2InferParallelism +from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -52,7 +53,8 @@ class Qwen2ForCausalLM(MfModelBase): setattr(self.mf_model_config, 'npu_mem_size', -1) self.mf_config.model.model_config.qkv_concat = False # Initial network - self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) + with no_init_parameters(): # Delay initialization + self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) self.network._jit_config_dict = JitConfig( jit_level="O0", infer_boost="on" ).jit_config_dict @@ -76,8 +78,8 @@ class Qwen2ForCausalLM(MfModelBase): self.set_flags = False def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: - model_parallelism = Qwen2InferParallelism(self.mf_config, self.network, False) - model_parallelism.infer_convert_and_parallelism(self.mf_config.load_checkpoint) + weight_processor = Qwen2WeightProcessor(self.mf_config, self.network, False) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py similarity index 62% rename from vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py rename to vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py index 6075672ab..59423eca0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2_infer_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py @@ -18,16 +18,19 @@ transform huggingface model to mindspore safetensor. """ import os import json +import gc import numpy as np +from tqdm import tqdm from safetensors import safe_open - import mindspore as ms -from vllm_mindspore.model_executor.models.mf_models.model_parallelism import BaseModelParallelism +from mindspore.communication.management import get_rank + +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor -class Qwen2InferParallelism(BaseModelParallelism): +class Qwen2WeightProcessor(BaseWeightProcessor): r""" - Provide Qwen2 Model infer parameter convert and parallelism. + Provide Qwen2 Model weight load and shards. Args: config (Qwen2Config): The config of Qwen2 model. network (InferenceQwen2ForCausalLM): The network of Qwen2. 
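The qwen2.py hunk above also wraps network construction in no_init_parameters, so parameters are declared but not materialized with random values at build time; the weight processor then fills them from the safetensors checkpoint. A condensed sketch of that pattern, with mf_model_config standing in for the config object built by the model class:

    from mindspore.nn.utils import no_init_parameters
    from research.qwen2_5.infer.qwen2_5 import ParallelQwenForCausalLM_MF  # as imported in qwen2.py above

    with no_init_parameters():  # skip parameter initialization at construction time
        network = ParallelQwenForCausalLM_MF(mf_model_config)
    # Actual values arrive afterwards via Qwen2WeightProcessor.load_safetensors_shard(...).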
@@ -39,36 +42,35 @@ class Qwen2InferParallelism(BaseModelParallelism): def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): """convert weight not in model""" - parameter_dict = {} embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) if self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) else: np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), - name=embed_tokens_ms_name, - requires_grad=False) + is_split_param=True, split_axis=0) + self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.convert_weight_name(norm_hf_name) np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[norm_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=norm_ms_name, - requires_grad=False) + self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=norm_ms_name, + requires_grad=False) lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.model.model_config.tie_word_embeddings: if not self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[lm_head_ms_name] = ms.Parameter(ms.Tensor(np_data, ms.bfloat16), name=lm_head_ms_name, - requires_grad=False) - - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=lm_head_ms_name, + requires_grad=False) def convert_weight_name(self, weight_name: str): """replace weight name""" @@ -90,156 +92,157 @@ class Qwen2InferParallelism(BaseModelParallelism): """infer process dense ffn weight""" ffn_concat = self.config.model.model_config.qkv_concat - parameter_dict = {} - w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" w_gate_hidden_param = np.concatenate((w1_ms_param, w3_ms_param), axis=0) - parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, - requires_grad=False) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, 
name=w_gate_hidden_name, + requires_grad=False) else: - parameter_dict[w1_ms_name] = ms.Parameter(ms.Tensor(w1_ms_param, ms.bfloat16), name=w1_ms_name, - requires_grad=False) - parameter_dict[w3_ms_name] = ms.Parameter(ms.Tensor(w3_ms_param, ms.bfloat16), name=w3_ms_name, - requires_grad=False) + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) - parameter_dict[w2_ms_name] = ms.Parameter(ms.Tensor(w2_ms_param, ms.bfloat16), name=w2_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" qkv_concat = self.config.model.model_config.qkv_concat - parameter_dict = {} # wq wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" wq_ms_name = self.convert_weight_name(wq_hf_name) wq_ms_param, _ = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) # wq bias wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) wq_bias_ms_param, _ = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) # wk wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" wk_ms_name = self.convert_weight_name(wk_hf_name) wk_ms_param, _ = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) # wk bias wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) wk_bias_ms_param, _ = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) # wv wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" wv_ms_name = self.convert_weight_name(wv_hf_name) wv_ms_param, _ = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + split_axis=0) # wv bias wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) wv_bias_ms_param, _ = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + is_split_param=True, + split_axis=0) if qkv_concat: w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" w_qkv_param = np.concatenate((wq_ms_param, wk_ms_param, wv_ms_param), axis=0) - w_qkv_param = ms.Tensor(w_qkv_param, dtype=ms.bfloat16) - parameter_dict[w_qkv_name] = ms.Parameter(w_qkv_param, name=w_qkv_name, requires_grad=False) + w_qkv_param = ms.from_numpy(w_qkv_param).astype(ms.bfloat16) + self.parameter_dict[w_qkv_name] = ms.Parameter(w_qkv_param, name=w_qkv_name, requires_grad=False) w_qkv_bias_name = f"model.layers.{layer_id}.attention.w_qkv.bias" w_qkv_bias_param = np.concatenate((wq_bias_ms_param, wk_bias_ms_param, wv_bias_ms_param), axis=0) - w_qkv_bias_param = ms.Tensor(w_qkv_bias_param, dtype=ms.bfloat16) - parameter_dict[w_qkv_bias_name] = 
ms.Parameter(w_qkv_bias_param, name=w_qkv_bias_name, requires_grad=False) + w_qkv_bias_param = ms.from_numpy(w_qkv_bias_param).astype(ms.bfloat16) + self.parameter_dict[w_qkv_bias_name] = ms.Parameter(w_qkv_bias_param, name=w_qkv_bias_name, + requires_grad=False) else: - parameter_dict[wq_ms_name] = ms.Parameter(ms.Tensor(wq_ms_param, ms.bfloat16), name=wq_ms_name, - requires_grad=False) - parameter_dict[wk_ms_name] = ms.Parameter(ms.Tensor(wk_ms_param, ms.bfloat16), name=wk_ms_name, - requires_grad=False) - parameter_dict[wv_ms_name] = ms.Parameter(ms.Tensor(wv_ms_param, ms.bfloat16), name=wv_ms_name, - requires_grad=False) - - parameter_dict[wq_bias_ms_name] = ms.Parameter(ms.Tensor(wq_bias_ms_param, ms.bfloat16), - name=wq_bias_ms_name, + self.parameter_dict[wq_ms_name] = ms.Parameter(ms.from_numpy(wq_ms_param).astype(ms.bfloat16), + name=wq_ms_name, requires_grad=False) - parameter_dict[wk_bias_ms_name] = ms.Parameter(ms.Tensor(wk_bias_ms_param, ms.bfloat16), - name=wk_bias_ms_name, + self.parameter_dict[wk_ms_name] = ms.Parameter(ms.from_numpy(wk_ms_param).astype(ms.bfloat16), + name=wk_ms_name, requires_grad=False) - parameter_dict[wv_bias_ms_name] = ms.Parameter(ms.Tensor(wv_bias_ms_param, ms.bfloat16), - name=wv_bias_ms_name, + self.parameter_dict[wv_ms_name] = ms.Parameter(ms.from_numpy(wv_ms_param).astype(ms.bfloat16), + name=wv_ms_name, requires_grad=False) + self.parameter_dict[wq_bias_ms_name] = ms.Parameter( + ms.from_numpy(wq_bias_ms_param).astype(ms.bfloat16), + name=wq_bias_ms_name, + requires_grad=False) + self.parameter_dict[wk_bias_ms_name] = ms.Parameter( + ms.from_numpy(wk_bias_ms_param).astype(ms.bfloat16), + name=wk_bias_ms_name, + requires_grad=False) + self.parameter_dict[wv_bias_ms_name] = ms.Parameter( + ms.from_numpy(wv_bias_ms_param).astype(ms.bfloat16), + name=wv_bias_ms_name, + requires_grad=False) + # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) - parameter_dict[wo_ms_name] = ms.Parameter(ms.Tensor(wo_ms_param, ms.bfloat16), name=wo_ms_name, - requires_grad=False) - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + split_axis=1) + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), + name=wo_ms_name, + requires_grad=False) def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" - parameter_dict = {} # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, - src_hf_dir, - hf_weight_map) - parameter_dict[attention_norm_ms_name] = ms.Parameter(ms.Tensor(attention_norm_ms_param, ms.bfloat16), - name=attention_norm_ms_name, - requires_grad=False) + src_hf_dir, + hf_weight_map) + self.parameter_dict[attention_norm_ms_name] = ms.Parameter( + ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) - parameter_dict[ffn_norm_ms_name] = 
ms.Parameter(ms.Tensor(ffn_norm_ms_param, ms.bfloat16), - name=ffn_norm_ms_name, - requires_grad=False) - - _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( + ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" - print(f"..... start convert layer {layer_id} .......", flush=True) - self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) - print(f"..... end convert layer {layer_id} .......", flush=True) - - def infer_convert_and_parallelism(self, src_hf_dir): - """convert inference model weight """ + def load_safetensors_shard(self, src_hf_dir): + """qwen load safetensors and shard """ + rank_id = get_rank() param_json_path = "" for file in os.listdir(src_hf_dir): if file.endswith('index.json'): param_json_path = os.path.join(src_hf_dir, file) break - print("param_json_path is {}".format(param_json_path)) hf_weight_map = {} if os.path.exists(param_json_path): @@ -255,5 +258,10 @@ class Qwen2InferParallelism(BaseModelParallelism): self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) num_layers = self.config.model.model_config.num_layers - for layer_id in range(num_layers): + enable_tqdm = rank_id == 0 + for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + + ms.load_param_into_net(self.network, self.parameter_dict) + del self.parameter_dict + gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py similarity index 44% rename from vllm_mindspore/model_executor/models/mf_models/model_parallelism.py rename to vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 4ad9d4495..8edc0b79b 100644 --- a/vllm_mindspore/model_executor/models/mf_models/model_parallelism.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -16,13 +16,15 @@ """ transform huggingface safetensor. """ + +import os from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size -class BaseModelParallelism: +class BaseWeightProcessor: r""" - Provide Infer model parameter convert and parallelism. + Provide model weight load and shards. Args: config (MF Config): The config of Infer model. network (InferenceModelForCausalLM): The network of infer model. 
@@ -33,53 +35,63 @@ class BaseModelParallelism: self.config = config self.network = network self.is_quant = is_quant + self.tp_group_size = get_group_size() + self.rank_id = get_rank() + self.parameter_dict = {} + self.file_handles = {} + + def get_file_handles(self, filename): + if filename not in self.file_handles: + fp = safe_open(filename, framework="np") + self.file_handles[filename] = fp + return self.file_handles[filename] + + def release_file_handles(self): + del self.file_handles def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): - tp_group_size = get_group_size() - rank_id = get_rank() safetensor_file = hf_weight_map[hf_param_name] - with safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: - qint4 = False - if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): - qint4 = True - if not is_split_param: - np_data = sf_file.get_tensor(hf_param_name) - return np_data, qint4 + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not is_split_param: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 - np_data = sf_file.get_slice(hf_param_name) - shape = np_data.get_shape() - if split_axis == 0: - split_size = shape[0] // tp_group_size - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[start:stop] - elif split_axis == 1: - split_size = shape[1] // tp_group_size - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[:, start:stop] - else: - raise ValueError("split_axis:{} is not supported.".format(split_axis)) - return split_data, qint4 + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): - tp_group_size = get_group_size() - rank_id = get_rank() shape = weight.shape if split_axis == 0: - split_size = shape[0] // tp_group_size - start = rank_id * split_size - stop = (rank_id + 1) * split_size + split_size = shape[0] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size split_data = weight[start:stop] elif split_axis == 1: - split_size = shape[1] // tp_group_size - start = rank_id * split_size - stop = (rank_id + 1) * split_size + split_size = shape[1] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size split_data = weight[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data - def infer_convert_and_parallelism(self, src_hf_dir): - """ infer convert and parallelism """ - raise NotImplementedError("infer_convert_and_parallelism method is not implemented.") + def load_safetensors_shard(self, src_hf_dir): + """ load safetensors and shards """ + raise NotImplementedError("load_safetensors_shard method is not implemented.") -- Gitee From 
478be9577a14f0317fb53a0e3d897111783c381e Mon Sep 17 00:00:00 2001 From: ccsszz <1312463385@qq.com> Date: Fri, 28 Mar 2025 21:06:21 +0800 Subject: [PATCH 53/82] support smoothquant in DeepseekInferParallelism --- .../mf_models/deepseekv3_weight_processor.py | 111 ++++++++++++++++-- 1 file changed, 101 insertions(+), 10 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 441345133..83d1d8482 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -931,8 +931,92 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) - def infer_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): - """infer_quant_net_ms_convert_layer_weight""" + def infer_smooth_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + """infer_smooth_quant_net_ms_convert_layer_weight""" + parameter_dict = {} + + no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", + "kv2l", "routed_experts.router.dense", + "routed_experts.router.e_score_correction_bias", + "topk_bias"] + for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): + if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: + continue + + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", + "shared_experts.w_gate_hidden"]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif "quant_op" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) + value = np.stack(value_list, axis=0) + elif "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + + dst_dtype = convert_np_to_ms_dtype(value) + + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + print(f"smoothquant param_not_load:{param_not_load}") + print(f"smoothquant ckpt_not_load:{ckpt_not_load}") + + def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + """infer_gptq_quant_net_ms_convert_layer_weight""" + parameter_dict = {} no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", "kv2l", "routed_experts.router.dense", @@ -940,7 +1024,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): "shared_experts.w_gate_hidden", "shared_experts.w2", "topk_bias"] - for param_name, _ in hf_weight_map.items(): + for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue @@ -991,11 +1075,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): dst_dtype = convert_np_to_ms_dtype(value) if is_int4: - self.parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), - name=param_name, requires_grad=False) + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), + name=param_name, requires_grad=False) else: - self.parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), - name=param_name, requires_grad=False) + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + print(f"gptq-quant param_not_load:{param_not_load}") + print(f"gptq-quant ckpt_not_load:{ckpt_not_load}") 
def load_safetensors_shard(self, src_hf_dir): """deepseek load safetensors and shard """ @@ -1011,7 +1098,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): elif file.endswith('_name_map.json'): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: - hf_weight_map = json.load(fp) + param_map = json.load(fp) + hf_weight_map = param_map["weight_map"] if "weight_map" in param_map else param_map break if not param_json_path: @@ -1019,12 +1107,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None - if not quant_method or quant_method != "gptq-pergroup": + if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant"): self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) num_layers = self.config.model.model_config.num_layers if quant_method and quant_method == "gptq-pergroup": - self.infer_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) + self.infer_gptq_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) + return + if quant_method and quant_method == "smoothquant": + self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) return enable_tqdm = rank_id == 0 -- Gitee From ed5dd23de47aee5bde097c9f94f240fa9549586a Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Thu, 27 Mar 2025 21:43:49 +0800 Subject: [PATCH 54/82] add jit to mf-model --- .../models/mf_models/deepseek_v3.py | 45 +++++++++---------- .../models/mf_models/mf_model_base.py | 20 +++++++++ .../model_executor/models/mf_models/qwen2.py | 37 ++++++++------- 3 files changed, 60 insertions(+), 42 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 49f9fc91c..30ea8d905 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -59,32 +59,9 @@ class DeepseekV3ForCausalLM(MfModelBase): super(DeepseekV3ForCausalLM, self).__init__( vllm_config=vllm_config, prefix=prefix ) - - self.mf_config.load_checkpoint = self.get_model_path() - - self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) - if self.mf_config.moe_config: - self.mf_model_config.moe_config = self.mf_config.moe_config - self.mf_model_config.return_hidden_states = True - setattr(self.mf_model_config, 'npu_mem_size', -1) - self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config) - # Initital network - with no_init_parameters(): # Delay initialization - self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) - - # quant - if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, - "quant_method"): - ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) - if ptq is not None: - ptq.apply(self.network) - ptq.convert(self.network) - self.network._jit_config_dict = JitConfig( - jit_level="O0", infer_boost="on" - ).jit_config_dict self.mf_kvcaches_init = False self.sampler = get_sampler() @@ -101,6 +78,28 @@ class DeepseekV3ForCausalLM(MfModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False + def _generate_model_config(self): + self.mf_config.load_checkpoint = 
self.get_model_path() + + self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) + if self.mf_config.moe_config: + self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + setattr(self.mf_model_config, 'npu_mem_size', -1) + + def _create_network(self): + # Initital network + with no_init_parameters(): # Delay initialization + network = DeepseekV3ForCausalLM_MF(self.mf_model_config) + + # quant + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) + if ptq is not None: + ptq.apply(network) + ptq.convert(network) + return network + def get_kvcache(self): key_cache = [] forward_context = get_forward_context() diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 4ba57131f..785cedbcc 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -17,7 +17,9 @@ # ============================================================================ import os +from types import MethodType from typing import Iterable, List, Optional, Set, Tuple, Union +from abc import abstractmethod import numpy as np from vllm.attention import AttentionMetadata @@ -101,6 +103,24 @@ class MfModelBase(MsModelBase): ) self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self._generate_model_config() + self.network = self._create_network() + + self.network.construct = MethodType(ms.jit(self.network.__class__.construct, + jit_level='O0', infer_boost='on'), + self.network) + self.network.lm_head.construct = MethodType(ms.jit(self.network.lm_head.__class__.construct, + jit_level='O0', infer_boost='on'), + self.network.lm_head) + + @abstractmethod + def _generate_model_config(self): + raise NotImplementedError("Function _generate_model_config should be Implemented!") + + @abstractmethod + def _create_network(self): + raise NotImplementedError("Function _create_network should be Implemented!") + def get_kvcache(self): key_cache = [] diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 58df668e0..aa0a7a14e 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -42,25 +42,6 @@ logger = init_logger(__name__) class Qwen2ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) - - self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) - if self.mf_config.moe_config: - self.mf_model_config.moe_config = self.mf_config.moe_config - self.mf_model_config.return_hidden_states = True - - # qwen qkv concat will support in next version - self.mf_model_config.qkv_concat = False - setattr(self.mf_model_config, 'npu_mem_size', -1) - self.mf_config.model.model_config.qkv_concat = False - # Initial network - with no_init_parameters(): # Delay initialization - self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) - self.network._jit_config_dict = JitConfig( - jit_level="O0", infer_boost="on" - ).jit_config_dict - - self.mf_config.load_checkpoint = self.get_model_path() - self.mf_kvcaches_init = False 
self.sampler = get_sampler() @@ -77,6 +58,24 @@ class Qwen2ForCausalLM(MfModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False + def _generate_model_config(self): + self.mf_config.load_checkpoint = self.get_model_path() + self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) + if self.mf_config.moe_config: + self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + + # qwen qkv concat will support in next version + self.mf_model_config.qkv_concat = False + setattr(self.mf_model_config, 'npu_mem_size', -1) + self.mf_config.model.model_config.qkv_concat = False + + def _create_network(self): + # Initial network + with no_init_parameters(): # Delay initialization + network = ParallelQwenForCausalLM_MF(self.mf_model_config) + return network + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: weight_processor = Qwen2WeightProcessor(self.mf_config, self.network, False) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) -- Gitee From 9bebb130f50d560a288769d0f357ea9b3fc383c0 Mon Sep 17 00:00:00 2001 From: nashturing Date: Thu, 3 Apr 2025 16:28:50 +0800 Subject: [PATCH 55/82] =?UTF-8?q?=E3=80=90master=E3=80=91=E3=80=90bugfix?= =?UTF-8?q?=E3=80=91=E6=9D=83=E9=87=8D=E5=8A=A0=E8=BD=BD=EF=BC=8C=E5=85=B1?= =?UTF-8?q?=E4=BA=AB=E4=B8=93=E5=AE=B6=E5=88=87=E5=88=86=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/mf_models/deepseekv3_weight_processor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 83d1d8482..1affec1ef 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -739,15 +739,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" -- Gitee From 10e141e3f53525c32e384ff7f5f36c1855da75bd Mon Sep 17 00:00:00 2001 From: moran Date: Sat, 22 Mar 2025 18:07:43 +0800 Subject: [PATCH 56/82] update dependent package --- .gitmodules | 4 + .jenkins/test/config/dependent_packages.yaml | 
11 +- tests/mindformers | 1 + .../config/predict_deepseek_r1_671b_w8a8.yaml | 125 +++ .../config/predict_qwen2_5_7b_instruct.yaml | 126 +++ tests/st/python/set_env.py | 58 ++ tests/st/python/test_custom.py | 60 -- tests/st/python/test_sampler.py | 771 ------------------ tests/st/python/test_vllm_deepseek_part.py | 77 ++ .../{test_demo.py => test_vllm_mf_qwen_7b.py} | 53 +- tests/st/python/test_vllm_mf_qwen_7b_mss.py | 77 ++ 11 files changed, 512 insertions(+), 851 deletions(-) create mode 100644 .gitmodules create mode 160000 tests/mindformers create mode 100644 tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml create mode 100644 tests/st/python/config/predict_qwen2_5_7b_instruct.yaml create mode 100644 tests/st/python/set_env.py delete mode 100644 tests/st/python/test_custom.py delete mode 100644 tests/st/python/test_sampler.py create mode 100644 tests/st/python/test_vllm_deepseek_part.py rename tests/st/python/{test_demo.py => test_vllm_mf_qwen_7b.py} (51%) create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_mss.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d057201a7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "tests/mindformers"] + path = tests/mindformers + url = https://gitee.com/mindspore/mindformers.git + branch = br_infer_deepseek_os diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index e632e9fd8..318f569d3 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,8 +1,11 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250307/br_infer_deepseek_os_20250307004508_4011166933d7e6230601ac1ad07bfe1a8329541d/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250403/br_infer_deepseek_os_20250403204446_a10f9cf58ea06de7cf6acbec0bde94442992955b_newest/' -mindformers: - 'https://repo.mindspore.cn/mindspore/mindformers/version/202503/20250303/br_infer_deepseek_os_20250303142905_569a4261552abe2984651bd31d675d76c5f51fb0_newest/' +mindspore_gs: + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202503/20250322/master_20250322160019_1aa0a919d27c806700b2399bf965c5f6663c10fd_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250301/master_20250301_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250328/master_20250328160020_5bff063570dd861da8fcf6540c0d2ab2d0d52458_newest/' + +vllm: + 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' diff --git a/tests/mindformers b/tests/mindformers new file mode 160000 index 000000000..ed67bae4e --- /dev/null +++ b/tests/mindformers @@ -0,0 +1 @@ +Subproject commit ed67bae4e88fa4d01c91cfbe4dfd822165c75d2f diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml new file mode 100644 index 000000000..5a5e9d60a --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml @@ -0,0 +1,125 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: "/path/to/deepseekr1/model_w8a8_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1-W8A8' + +# default parallel of device 
num = 16 for Atlas 800T A2 +parallel_config: + model_parallel: 16 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: False + ffn_concat: True + quantization_config: + quant_method: 'ptq' + weight_dtype: 'int8' + activation_dtype: 'int8' + auto_map: + AutoConfig: deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] 
+ '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml b/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml new file mode 100644 index 000000000..821e33f5d --- /dev/null +++ b/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml @@ -0,0 +1,126 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +use_parallel: False +run_mode: 'predict' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'qwen2_5_7b' + +# runner config +runner_config: + epochs: 5 + batch_size: 1 + sink_mode: True + sink_size: 2 +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 1 + model_parallel: 1 + pipeline_stage: 1 + micro_batch_num: 1 + vocab_emb_dp: False + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +model: + model_config: + type: LlamaConfig + batch_size: 1 + seq_length: 32768 + hidden_size: 3584 + num_layers: 28 + num_heads: 28 + n_kv_heads: 4 + vocab_size: 152064 + intermediate_size: 18944 + max_position_embeddings: 32768 + qkv_has_bias: True + rms_norm_eps: 1.0e-6 + theta: 1000000.0 + emb_dropout_prob: 0.0 + eos_token_id: [151645,151643] + pad_token_id: 151643 + bos_token_id: 151643 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "bfloat16" + param_init_type: "bfloat16" + use_past: True + use_flash_attention: True + block_size: 32 + num_blocks: 1024 + use_past_shard: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1.05 + max_decode_length: 512 + top_k: 20 + top_p: 0.8 + temperature: 0.7 + do_sample: True + is_dynamic: True + qkv_concat: True + auto_map: + AutoTokenizer: [qwen2_5_tokenizer.Qwen2Tokenizer, null] + + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + model_max_length: 131072 + bos_token: null + eos_token: "<|im_end|>" + unk_token: null + pad_token: "<|endoftext|>" + vocab_file: "/path/to/vocab.json" + merges_file: "/path/to/merges.txt" + chat_template: "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" + type: Qwen2Tokenizer + type: Qwen2Processor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + ascend_config: + precision_mode: "must_keep_origin_dtype" + max_call_depth: 10000 + max_device_memory: "59GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: False + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 diff --git a/tests/st/python/set_env.py b/tests/st/python/set_env.py new file mode 100644 index 000000000..f39bd0199 --- /dev/null +++ b/tests/st/python/set_env.py @@ -0,0 +1,58 @@ +import os +import sys +from typing import Dict, Optional + +mindformers_path = "/home/jenkins/mindspore/testcases/testcases/tests/mindformers" + +if mindformers_path not in sys.path: + sys.path.insert(0, mindformers_path) + +current_pythonpath = os.environ.get("PYTHONPATH", "") +if current_pythonpath: + os.environ["PYTHONPATH"] = f"{mindformers_path}:{current_pythonpath}" +else: + os.environ["PYTHONPATH"] = mindformers_path + + +class EnvVarManager: + def __init__(self): + self._original_env: Dict[str, Optional[str]] = {} + self._managed_vars: Dict[str, str] = {} + + def set_env_var(self, var_name: str, value: str) -> None: + """设置环境变量并记录原始值(如果存在)""" + if var_name not in self._original_env: + # 保存原始值,即使它不存在(保存为None) + self._original_env[var_name] = os.environ.get(var_name) + + os.environ[var_name] = value + self._managed_vars[var_name] = value + + def unset_env_var(self, var_name: str) -> None: + """取消设置之前设置的环境变量,恢复原始值""" + if var_name not in self._original_env: + raise ValueError(f"Variable {var_name} was not set by this manager") + + original_value = self._original_env[var_name] + if original_value is not None: + os.environ[var_name] = original_value + else: + if var_name in os.environ: + del os.environ[var_name] + + del self._original_env[var_name] + del 
self._managed_vars[var_name] + + def unset_all(self) -> None: + """取消设置所有由该管理器设置的环境变量""" + for var_name in list(self._managed_vars.keys()): + self.unset_env_var(var_name) + + def get_managed_vars(self) -> Dict[str, str]: + """获取当前由该管理器管理的所有环境变量 """ + return self._managed_vars.copy() + + def setup_ai_environment(self, env_vars: Dict[str, str]) -> None: + """设置AI相关的环境变量,使用传入的参数""" + for var_name, value in env_vars.items(): + self.set_env_var(var_name, value) diff --git a/tests/st/python/test_custom.py b/tests/st/python/test_custom.py deleted file mode 100644 index b7e8cc3b6..000000000 --- a/tests/st/python/test_custom.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2024 The vLLM team. -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://wwww.apache.org/licenses/LICENSE-2.0 -# -# Unless required by application law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""test case for custom op adv_step_flash""" - -import mindspore as ms -from vllm_mindspore import npu_ops -import numpy as np -from mindspore import Tensor - -# TODO refactor this case to run on ci -def testcase(): - ms.context.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend") - in_block_tables = Tensor(np.load("data/block_tables.npy")) - in_input_positions = Tensor(np.load("data/input_positions.npy")) - in_input_tokens = Tensor(np.load("data/input_tokens.npy")) - in_sampled_token_ids = Tensor(np.load("data/sampled_token_ids.npy")) - in_seq_lens_tensor = Tensor(np.load("data/seq_lens_tensor.npy")) - in_slot_mapping = Tensor(np.load("data/slot_mapping.npy")) - num_seqs = 256 - num_queries = 256 - block_size = 32 - npu_ops.adv_step_flash(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=in_input_tokens, - sampled_token_ids=in_sampled_token_ids, - input_positions=in_input_positions, - seq_lens=in_seq_lens_tensor, - slot_mapping=in_slot_mapping, - block_tables=in_block_tables) - - out_block_tables = np.load("data/o_block_tables.npy").astype(np.int32) - out_input_positions = np.load("data/o_input_positions.npy").astype(np.int32) - out_input_tokens = np.load("data/o_input_tokens.npy").astype(np.int32) - out_sampled_token_ids = np.load("data/o_sampled_token_ids.npy").astype(np.int32) - out_seq_lens_tensor = np.load("data/o_seq_lens_tensor.npy").astype(np.int32) - out_slot_mapping = np.load("data/o_slot_mapping.npy").astype(np.int32) - assert np.allclose(in_block_tables, out_block_tables) - assert np.allclose(in_input_positions, out_input_positions) - assert np.allclose(in_input_tokens, out_input_tokens) - assert np.allclose(in_sampled_token_ids, out_sampled_token_ids) - assert np.allclose(in_seq_lens_tensor, out_seq_lens_tensor) - assert np.allclose(in_slot_mapping, out_slot_mapping) - print("passed.") - -if __name__ == "__main__": - testcase() diff --git a/tests/st/python/test_sampler.py b/tests/st/python/test_sampler.py deleted file mode 100644 index d31c10161..000000000 --- a/tests/st/python/test_sampler.py +++ /dev/null @@ -1,771 +0,0 @@ -#!/usr/bin/env python3 -# 
encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2024 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -import vllm_mindspore -import itertools -import random -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple -from unittest.mock import Mock, patch -from mindspore import mint - -import pytest -import torch -from transformers import GenerationConfig, GenerationMixin - -import vllm.envs as envs - -from vllm_mindspore.model_executor.layers.sampler import Sampler -from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter, is_pin_memory_available - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = 'cuda' - - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_greedy(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = torch.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) 
-@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed_deterministic(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - first_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - second_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert first_sampler_output == second_sampler_output - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_min_tokens_penalty(seed: int, device: str): - seq_id_counter = Counter(start=random.randint(0, 100)) - set_random_seed(seed) - torch.set_default_device(device) - - def create_sampling_params(min_tokens, - eos_token_id=0, - *, - stop_token_ids: Optional[List[int]] = None, - prompt_logprobs: Optional[int] = None): - sampling_params = SamplingParams( - min_tokens=min_tokens, - max_tokens=9999, # keep higher than max of min_tokens - stop_token_ids=stop_token_ids, - # requesting prompt_logprobs changes the structure of `logits` - prompt_logprobs=prompt_logprobs, - ) - sampling_params.all_stop_token_ids.add(eos_token_id) - return sampling_params - - def create_sequence_data(num_input=3, num_generated=0): - seq_data = SequenceData.from_seqs( - random.choices(range(0, VOCAB_SIZE), k=num_input)) - if num_generated > 0: - seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), - k=num_generated) - return seq_data - - def generate_test_case(): - # generate multiple seq groups but limit total batch size - batch_size = random.randint(1, 128) - - expected_penalization = [] - sequence_metadata_list: List[SequenceGroupMetadata] = [] - # 20% chance to generate seq group metadata list with all prompts - is_prompt = random.random() < 0.2 - while batch_size > 0: - 
num_seqs = 1 if is_prompt else random.randint(1, batch_size) - - eos_token_id = random.randint(0, VOCAB_SIZE - 1) - min_tokens = random.randint(0, 50) - num_stop_tokens = random.randint(0, 8) - if num_stop_tokens > 0: - stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), - k=num_stop_tokens) - else: - stop_token_ids = None - - sampling_params = create_sampling_params( - min_tokens=min_tokens, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids) - - seq_data: Dict[int, SequenceData] = {} - seq_group_penalization: List[bool] = [] - for _ in range(num_seqs): - num_input = random.randint(1, 100) - num_generated = 0 if is_prompt else random.randint(1, 100) - seq_data[next(seq_id_counter)] = create_sequence_data( - num_input=num_input, num_generated=num_generated) - seq_group_penalization.append(num_generated < min_tokens) - - expected_penalization.extend(seq_group_penalization) - sequence_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{batch_size}", - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=sampling_params, - block_tables={}, - )) - batch_size -= num_seqs - - return { - "expected_penalization": expected_penalization, - "seq_group_metadata_list": sequence_metadata_list, - } - - # define some explicit test cases for edge case behavior - prompt_without_penalization = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(0), - block_tables={}, - ), - ] - } - - prompt_with_penalization = { - "expected_penalization": [True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ), - ] - } - - prompt_with_penalization_and_prompt_logprobs = { - "expected_penalization": [False, False, True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=3), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - ] - } - - stop_penalizing_after_min_tokens = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ) - ] - } - - stop_token_ids = [42, 99, 42, 0] # intentional duplication - prompt_combination = { - "expected_penalization": [False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_2", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=2), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_3", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params( - 0, stop_token_ids=stop_token_ids), - block_tables={}, - ) - ] - } - - stop_token_ids = [1, 999, 37, 37] # intentional duplication - decode_combination = { - "expected_penalization": [True, False, False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): 
- create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=100), - }, - sampling_params=create_sampling_params( - 2, stop_token_ids=stop_token_ids), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_2", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=20), - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=10), - }, - sampling_params=create_sampling_params( - 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), - block_tables={}, - ), - ] - } - - if seed == 0: - test_cases = [ - prompt_without_penalization, - prompt_with_penalization, - prompt_with_penalization_and_prompt_logprobs, - stop_penalizing_after_min_tokens, - prompt_combination, - decode_combination, - ] - else: - test_cases = [generate_test_case()] - - def run_test_case(*, expected_penalization: List[bool], - seq_group_metadata_list: List[SequenceGroupMetadata]): - assert expected_penalization, \ - "Invalid test case, need expected_penalization" - assert seq_group_metadata_list, \ - "Invalid test case, need seq_group_metadata_list" - - batch_size = 0 - seq_lens: List[int] = [] - sampling_params_per_row: List[SamplingParams] = [] - for sgm in seq_group_metadata_list: - sampling_params = sgm.sampling_params - - num_rows = len(sgm.seq_data) - if sgm.is_prompt: - # a prompt seq_group has only one sequence - seq_data = next(iter(sgm.seq_data.values())) - prompt_len = seq_data.get_prompt_len() - seq_lens.append(prompt_len) - - assert sgm.sampling_params is not None - if sgm.sampling_params.prompt_logprobs: - # with prompt_logprobs each token in the prompt has a row in - # logits - num_rows = prompt_len - - batch_size += num_rows - sampling_params_per_row.extend( - itertools.repeat(sampling_params, num_rows)) - - assert len( - expected_penalization - ) == batch_size, \ - ("Invalid test case, expected_penalization does not match computed" - "batch size") - - _, fake_logits, sampler = _prepare_test(batch_size) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens=seq_lens if seq_lens else None, - query_lens=seq_lens if seq_lens else [1] * batch_size, - device=device, - pin_memory=is_pin_memory_available()) - # the logits tensor is modified in-place by the sampler - _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - for logits_idx, (should_penalize, sampling_params) in enumerate( - zip(expected_penalization, sampling_params_per_row)): - - tokens_to_check = sampling_params.all_stop_token_ids - - if should_penalize: - for token_id in tokens_to_check: - assert fake_logits[logits_idx, token_id] == -float( - 'inf' - ), f"Expected token {token_id} for logits row {logits_idx}" - " to be penalized" - # no other tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == -float('inf')) == len( - tokens_to_check - ), f"Expected only {len(tokens_to_check)} to be penalized" - else: - # no tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == - -float('inf')) == 0, "No tokens should have been penalized" - - for test_case in test_cases: - run_test_case(**test_case) - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_mixed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) 
- input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - expected_tokens: List[Optional[List[int]]] = [] - seq_lens: List[int] = [] - for i in range(batch_size): - expected: Optional[List[int]] = None - sampling_type = random.randint(0, 2) - if sampling_type == 0: - sampling_params = SamplingParams(temperature=0) - expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] - elif sampling_type in (1, 2): - n = random.randint(1, 10) - sampling_params = SamplingParams( - temperature=random.random() + 0.1, - top_p=min(random.random() + 0.1, 1), - top_k=random.randint(0, 10) or -1, - n=n, - presence_penalty=random.randint(0, 1), - ) - if sampling_type == 2: - sampling_params.seed = random.randint(0, 10000) - else: - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected = list(range(i, i + n)) - - expected_tokens.append(expected) - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - generators: Dict[str, torch.Generator] = {} - - def test_sampling(): - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available(), - generators=generators) - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - for i, (sequence_output, metadata) in enumerate( - zip(sampler_output, seq_group_metadata_list)): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.seed is not None - and expected_tokens[i] is None): - # Record seeded random result to compare with results of - # second invocation - expected_tokens[i] = [ - nth_output.output_token - for nth_output in sequence_output.samples - ] - continue - - expected_tokens_item = expected_tokens[i] - assert expected_tokens_item is not None - - for n, nth_output in enumerate(sequence_output.samples): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.temperature == 0 - or metadata.sampling_params.seed is not None): - # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens_item[n] - else: - # For non-seeded random check that one of the high-logit - # tokens were chosen - assert nth_output.output_token in expected_tokens_item - - # Test batch - test_sampling() - - # Shuffle the batch and resample - target_index = list(range(batch_size)) - for list_to_shuffle in (target_index, seq_group_metadata_list, - expected_tokens, seq_lens): - random.Random(seed).shuffle(list_to_shuffle) - target_index = torch.tensor(target_index) - input_tensor.data = input_tensor.index_select(0, target_index) - fake_logits.data = fake_logits.index_select(0, target_index) - - # This time, results of seeded random samples will be compared with - # the corresponding sample in the pre-shuffled batch - test_sampling() - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_top_k_top_p(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - top_k = random.randint(100, 500) - top_p = random.random() * 0.1 - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device=device, - 
dtype=torch.float16) - fake_logits = torch.normal(0, - 5, - size=(batch_size, vocab_size), - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - - generation_model = GenerationMixin() - generation_config = GenerationConfig(top_k=top_k, - top_p=top_p, - do_sample=True) - - @dataclass - class MockConfig: - is_encoder_decoder: bool = False - - generation_model.config = MockConfig() # needed by the following method - generation_model._prepare_special_tokens(generation_config, device=device) - processors = generation_model._get_logits_processor(generation_config, - None, - None, - None, [], - device=device) - assert len(processors) == 2 # top_p and top_k - - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1, - top_k=top_k, - top_p=top_p, - ), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - sample_probs = None - - def mock_sample(probs, *args, **kwargs): - nonlocal sample_probs - sample_probs = probs - return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] - for prob in probs], None) - - # top-k and top-p is only calculated when flashinfer kernel is not available - with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", None): - sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - assert sample_probs is not None - - hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) - hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) - torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) - assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_flashinfer_fallback(seed: int, device: str): - if not envs.VLLM_USE_FLASHINFER_SAMPLER: - pytest.skip("Flashinfer sampler is disabled") - - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - def failing_flashinfer_sampling(*_args, **_kwargs): - return None, torch.zeros(batch_size, device=device, dtype=torch.int32) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - with patch( - "vllm.model_executor.layers.sampler." 
- "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): - fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert sampler_output == fallback_sampler_output - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_repetition_penalty_mixed(device: str): - - vocab_size = 8 - - def test_sampling_params(sampling_params: List[SamplingParams]): - - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - device=device, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - seed=42, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] - assert tokens1[1] == tokens2[0] - -@pytest.mark.skip(reason="Not implemented yet") -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_include_gpu_probs_tensor(device: str): - set_random_seed(42) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - sampler.include_gpu_probs_tensor = True - sampler.should_modify_greedy_probs_inplace = False - - sampling_params = SamplingParams(temperature=0) - - mock_inplace = Mock() - with patch( - "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", - mock_inplace): - - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - mock_inplace.assert_not_called() - - assert sampler_output.sampled_token_probs is not None - assert sampler_output.logprobs is not None - assert sampler_output.sampled_token_ids is not None diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py new file mode 100644 index 000000000..72885cf66 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -0,0 +1,77 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "vLLM_MODEL_MEMORY_USE_GB": "40", + "ASCEND_TOTAL_MEMORY_GB": "60", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + # @pytest.mark.level0 + # @pytest.mark.platform_arm_ascend910b_training + # @pytest.mark.env_single + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_demo.py b/tests/st/python/test_vllm_mf_qwen_7b.py similarity index 51% rename from tests/st/python/test_demo.py rename to tests/st/python/test_vllm_mf_qwen_7b.py index d6e1fd0e8..e152e11fb 100644 --- a/tests/st/python/test_demo.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -13,44 +13,65 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -"""test demo for st.""" +"""test mf qwen.""" import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + # "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "vLLM_MODEL_MEMORY_USE_GB": "50", + "ASCEND_TOTAL_MEMORY_GB": "64", + # "MS_ENABLE_LCCL": "off", + # "HCCL_OP_EXPANSION_MODE": "AIV", + # "ASCEND_RT_VISIBLE_DEVICES": "0,1", + # "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams -class TestDemo: +class TestMfQwen: """ - Test Demo for ST. + Test Qwen. """ - @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - def test_aaa(self): + def test_mf_qwen(self): """ - test case aaa + test case qwen2.5 7B """ - # pylint: disable=W0611 - import vllm_mindspore - from vllm import LLM, SamplingParams # Sample prompts. prompts = [ - "I am", - "Today is", - "Llama is" + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", ] # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.0, top_p=0.95) + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/Llama-2-7b-hf") + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", gpu_memory_utilization=0.9) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] # Print the outputs. - for output in outputs: + for i, output in enumerate(outputs): prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert len(outputs) == 3 + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py new file mode 100644 index 000000000..a96c69d62 --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -0,0 +1,77 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen mss.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "vLLM_MODEL_MEMORY_USE_GB": "20", + "ASCEND_TOTAL_MEMORY_GB": "29", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestMfQwen_mss: + """ + Test qwen. + """ + # @pytest.mark.level0 + # @pytest.mark.platform_arm_ascend910b_training + # @pytest.mark.env_single + def test_mf_qwen_7b_mss(self): + """ + test case qwen_7b_mss + """ + + # Sample prompts. + prompts = [ + "I love Beijing, because", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", max_model_len=8192, max_num_batched_tokens=8192, + block_size=32, gpu_memory_utilization=0.9, num_scheduler_steps=8, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=[' it is a city with a long history. Which'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() -- Gitee From 65b2f17bc6597c29a646f1b87aa426b7401f6bb7 Mon Sep 17 00:00:00 2001 From: moran Date: Mon, 7 Apr 2025 11:05:11 +0800 Subject: [PATCH 57/82] add 2 tp test --- .jenkins/test/config/dependent_packages.yaml | 2 +- tests/st/python/test_vllm_deepseek_part.py | 9 +++++---- tests/st/python/test_vllm_mf_qwen_7b.py | 13 +++++++------ tests/st/python/test_vllm_mf_qwen_7b_mss.py | 9 +++++---- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 318f569d3..eb243d6fa 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -5,7 +5,7 @@ mindspore_gs: 'https://repo.mindspore.cn/mindspore/golden-stick/version/202503/20250322/master_20250322160019_1aa0a919d27c806700b2399bf965c5f6663c10fd_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250328/master_20250328160020_5bff063570dd861da8fcf6540c0d2ab2d0d52458_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250403/master_20250403171706_61451a9e1a5909cfa7877f72b1286bc0a843a067_newest/' vllm: 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 72885cf66..a0caa3161 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -44,9 +44,9 @@ class TestDeepSeek: Test Deepseek. 
""" - # @pytest.mark.level0 - # @pytest.mark.platform_arm_ascend910b_training - # @pytest.mark.env_single + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single def test_deepseek_r1(self): """ test case deepseek r1 w8a8 @@ -61,7 +61,8 @@ class TestDeepSeek: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py index e152e11fb..e8c71690f 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -21,14 +21,14 @@ env_manager = set_env.EnvVarManager() # def env env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", - # "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", "vLLM_MODEL_MEMORY_USE_GB": "50", "ASCEND_TOTAL_MEMORY_GB": "64", - # "MS_ENABLE_LCCL": "off", - # "HCCL_OP_EXPANSION_MODE": "AIV", - # "ASCEND_RT_VISIBLE_DEVICES": "0,1", - # "MS_ALLOC_CONF": "enable_vmm:True", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", @@ -61,7 +61,8 @@ class TestMfQwen: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", gpu_memory_utilization=0.9) + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py index a96c69d62..7983d7a88 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -43,9 +43,9 @@ class TestMfQwen_mss: """ Test qwen. """ - # @pytest.mark.level0 - # @pytest.mark.platform_arm_ascend910b_training - # @pytest.mark.env_single + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single def test_mf_qwen_7b_mss(self): """ test case qwen_7b_mss @@ -60,7 +60,8 @@ class TestMfQwen_mss: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", max_model_len=8192, max_num_batched_tokens=8192, + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_batched_tokens=8192, block_size=32, gpu_memory_utilization=0.9, num_scheduler_steps=8, tensor_parallel_size=2) # Generate texts from the prompts. 
The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. -- Gitee From 01c92348a6f76eb2e625045a8ffe78bba770b733 Mon Sep 17 00:00:00 2001 From: w00521005 Date: Fri, 28 Mar 2025 14:23:19 +0800 Subject: [PATCH 58/82] support mtp --- vllm_mindspore/__init__.py | 28 ++ vllm_mindspore/attention/backends/ms_attn.py | 2 + .../layers/rejection_sampler.py | 77 ++++ vllm_mindspore/model_executor/models/llama.py | 1 + .../models/mf_models/deepseek_mtp.py | 129 +++++++ .../mf_models/deepseekv3_weight_processor.py | 86 ++++- .../models/mf_models/mf_model_base.py | 29 +- .../model_executor/models/model_base.py | 5 + vllm_mindspore/model_executor/models/qwen2.py | 3 +- .../model_executor/models/registry.py | 1 + vllm_mindspore/utils.py | 8 +- vllm_mindspore/worker/spec_decode_worker.py | 364 ++++++++++++++++++ vllm_mindspore/worker/worker.py | 26 +- 13 files changed, 726 insertions(+), 33 deletions(-) create mode 100644 vllm_mindspore/model_executor/layers/rejection_sampler.py create mode 100644 vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py create mode 100644 vllm_mindspore/worker/spec_decode_worker.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 2be6a9505..f1da0cc60 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -193,6 +193,34 @@ vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args +from .utils import update_modules +from vllm_mindspore.attention.backends import ms_attn +update_modules("vllm.attention.backends.flash_attn", ms_attn) + +from vllm_mindspore.worker.spec_decode_worker import ( + spec_decode_worker_init, + _run_no_spec, + _verify_tokens, + _create_output, + _merge_outputs, +) +from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker +SpecDecodeWorker.__init__ = spec_decode_worker_init +SpecDecodeWorker._verify_tokens = _verify_tokens +SpecDecodeWorker._run_no_spec = _run_no_spec + +from vllm.model_executor.layers.spec_decode_base_sampler import SpecDecodeBaseSampler +SpecDecodeBaseSampler._create_output = _create_output + +from vllm.spec_decode.top1_proposer import Top1Proposer +Top1Proposer._merge_outputs = _merge_outputs + +from vllm_mindspore.model_executor.layers.rejection_sampler import _smallest_positive_value, _multinomial +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +RejectionSampler._smallest_positive_value = _smallest_positive_value +RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') +vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial + from .utils import check_ready check_ready() diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 568e69604..499321223 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -780,3 +780,5 @@ class MLABackend(AttentionBackend): @staticmethod def get_supported_head_sizes() -> List[int]: return [576] + +FlashAttentionMetadata = MSAttentionMetadata diff --git a/vllm_mindspore/model_executor/layers/rejection_sampler.py b/vllm_mindspore/model_executor/layers/rejection_sampler.py new file mode 100644 index 000000000..b6842cf5a --- /dev/null +++ b/vllm_mindspore/model_executor/layers/rejection_sampler.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 
Huawei Technologies Co., Ltd
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# the data type of finfo.tiny is not float but a numpy ndarray in msadapter,
+# which is not supported to be a tensor index
+
+from functools import cached_property
+from typing import Dict
+
+import torch
+import mindspore as ms
+
+from vllm.platforms import current_platform
+
+@cached_property
+def _smallest_positive_value(self) -> float:
+    """Return the smallest positive value representable by the probs dtype.
+    This value is used when constructing a distribution from which to sample
+    recovered tokens in the first rejection case.
+
+    See _get_recovered_probs for more details
+
+    Note that this isn't actually the smallest positive value representable
+    by float32, but the smallest positive normal value.
+    See https://en.wikipedia.org/wiki/Subnormal_number for more information.
+    """
+    # the value type of tiny is a numpy scalar in msadapter.
+    return float(torch.finfo(self.probs_dtype).tiny)
+
+
+# msadapter does not support 'exponential_'
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+def _multinomial(
+    probs: torch.Tensor,
+    num_samples: int,
+    k: int,
+    seeded_seqs: Dict[int, torch.Generator],
+) -> torch.Tensor:
+    # msadapter does not support tensor.exponential_
+    def exponential_(x: torch.Tensor, lambda_, generator=None):
+        random_x = ms.mint.rand(x.shape, generator=generator)  # draw uniformly distributed random numbers
+        return -torch.log(random_x) / lambda_  # inverse transform sampling
+
+    if num_samples > 1:
+        # This is equivalent to torch.repeat_interleaved (which also
+        # forces a GPU<->CPU sync).
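
The `exponential_` helper above is inverse transform sampling: if U is Uniform(0, 1), then -ln(U)/λ is distributed Exponential(λ). Dividing each probability by an independent Exp(1) draw and taking the argmax (as `_multinomial` does with `probs.div_(q).argmax(dim=1)`) is the usual exponential-race / Gumbel-max way of drawing one categorical sample per row. A minimal standalone sketch in plain PyTorch, independent of msadapter, with illustrative values only:

```python
import torch

torch.manual_seed(0)

# Inverse transform sampling: -log(U) / lam is Exponential(lam).
lam = 2.0
u = torch.rand(200_000)
samples = -torch.log(u) / lam
print(samples.mean())  # close to 1 / lam = 0.5

# Exponential race: argmax(p_i / E_i) with E_i ~ Exp(1) picks index i with probability p_i.
probs = torch.tensor([0.1, 0.3, 0.6])
e = -torch.log(torch.rand(100_000, 3))                  # Exp(1) draws
idx = (probs / e).argmax(dim=1)                         # one categorical sample per row
print(torch.bincount(idx, minlength=3) / idx.numel())   # approximately [0.1, 0.3, 0.6]
```
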
+ probs = probs[:, None, :].expand(probs.shape[0], num_samples, + probs.shape[1]).contiguous().view( + -1, probs.shape[1]) + q = torch.empty_like(probs) + if not seeded_seqs: + q = exponential_(q, 1.0) + else: + start = 0 + for idx in range(len(q) // k): + end = start + k + generator = seeded_seqs.get(idx) + # Note: generator might be None for non seeded + q[start:end] = exponential_(q[start:end], 1.0, generator=generator) + start = end + + return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm_mindspore/model_executor/models/llama.py b/vllm_mindspore/model_executor/models/llama.py index 19cb6e3a8..3a18956b9 100644 --- a/vllm_mindspore/model_executor/models/llama.py +++ b/vllm_mindspore/model_executor/models/llama.py @@ -532,6 +532,7 @@ class LlamaForCausalLM(MsModelBase, SupportsPP): attn_metadata, intermediate_tensors=None, inputs_embeds=None, + **kwargs ): if attn_metadata.num_prefill_tokens > 0: input_ids = input_ids.expand_dims(0) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py new file mode 100644 index 000000000..357259857 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +from typing import Iterable, Set, Tuple, Optional + +from vllm.config import VllmConfig +from vllm.config import get_current_vllm_config +from vllm.forward_context import get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.sampling_metadata import SamplingMetadata + +import mindspore as ms +from mindspore import Tensor, JitConfig, Model, mutable +from mindspore.nn.utils import no_init_parameters + +from research.deepseek3.deepseek3_config import ( + DeepseekV3Config as DeepseekV3Config_MF, +) +from research.deepseek3.deepseek3 import ( + DeepseekV3ForCausalLM as DeepseekV3ForCausalLM_MF, +) + +from vllm_mindspore.model_executor.layers.sampler import get_sampler +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor +from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask + +logger = init_logger(__name__) + +class DeepseekV3MTPForCausalLM(MfModelBase): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super(DeepseekV3MTPForCausalLM, self).__init__( + vllm_config=vllm_config, prefix=prefix + ) + + self.mf_config.load_checkpoint = self.get_model_path() + + self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) + if self.mf_config.moe_config: + self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + setattr(self.mf_model_config, 'npu_mem_size', -1) + + self.mf_model_config.is_mtp_model = True + self.mf_model_config.num_nextn_predict_layers = vllm_config.model_config.hf_config.num_nextn_predict_layers + if self.mf_model_config.num_nextn_predict_layers != 1: + raise NotImplementedError("Only support 1 MTP-layer now.") + + self.mf_config.model.model_config = self.mf_model_config + # Initital network + with no_init_parameters(): # Delay initialization + self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) + + self.network._jit_config_dict = JitConfig( + jit_level="O0", infer_boost="on" + ).jit_config_dict + self.mf_kvcaches_init = False + + self.sampler = get_sampler() + self.set_modules({"model": self.network}) + + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_nextn_predict_layers)] + compilation_config = get_current_vllm_config().compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_nextn_predict_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.set_flags = False + + def get_kvcache(self): + key_cache = [] + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_nextn_predict_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + key_cache.append(k_cache) + return mutable(key_cache), None + + + def update_model_inputs(self, model_inputs, **kwargs): + # ToDo: supports multi-mtpLayers with 'spec_step_idx' specifing the layer index. 
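
For reference, the MTP head above is driven through vLLM's speculative-decoding path that this patch wires up in `__init__.py` and `spec_decode_worker.py`. A hedged sketch of how it might be enabled from the offline `LLM` entry point follows; the `speculative_model` / `num_speculative_tokens` argument names are assumed from vLLM 0.7.x `EngineArgs`, and the weight path is a placeholder, so treat this as an assumption rather than a verified recipe:

```python
import vllm_mindspore  # noqa: F401  # patch vLLM before importing it
from vllm import LLM, SamplingParams

# Placeholder path: for DeepSeek-V3/R1 the MTP layer ships with the target weights,
# so the same checkpoint is assumed to be reused as the draft model here.
llm = LLM(
    model="/path/to/DeepSeek-R1-W8A8",
    trust_remote_code=True,
    tensor_parallel_size=8,
    speculative_model="/path/to/DeepSeek-R1-W8A8",  # assumed: MTP layer loaded as draft model
    num_speculative_tokens=1,                       # only 1 MTP layer is supported in this patch
)
outputs = llm.generate(["I love Beijing, because"],
                       SamplingParams(temperature=0.0, max_tokens=16))
```
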
+ if kwargs.get("spec_step_idx", 0) != 0: + raise NotImplementedError("Only support 1 MTP-layer now.") + # model_inputs["index"] = ms.Tensor(kwargs.get("spec_step_idx", 0), ms.int32) + hidden_states_shape = list(model_inputs["input_ids"].shape) + hidden_states_shape.append(self.model_config.get_hidden_size()) + model_inputs["hidden_states"] = kwargs.get("previous_hidden_states").reshape(hidden_states_shape) + return model_inputs + + + def compute_logits( + self, + hidden_states: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[Tensor]: + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.mtp_model.head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) + + return logits + + + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: + weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, False) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint, is_mtp_model=True) + self.network.set_dynamic_inputs() + return None diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 83d1d8482..5af059d4e 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -40,6 +40,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def __init__(self, config, network, is_quant): super().__init__(config, network, is_quant) + self.num_layers = self.config.model.model_config.num_layers def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -659,6 +660,25 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + + weight_name = self.convert_mtp_weight_name(weight_name) + return weight_name + + def convert_mtp_weight_name(self, weight_name: str): + layer = 0 if 'layers.' not in weight_name else int(weight_name[weight_name.find('layers.') : ].split('.')[1]) + if layer < self.num_layers: + return weight_name + mtp_prefix = f'mtp_model' + is_mtp_layer = 'tok_embeddings' not in weight_name and 'shared_head.' not in weight_name + mtp_prefix = mtp_prefix if not is_mtp_layer else f'{mtp_prefix}.layer' + is_decode_layer = "ffn" in weight_name or "attention" in weight_name or "feed_forward" in weight_name + mtp_prefix = mtp_prefix if not is_decode_layer else f'{mtp_prefix}.decode_layer' + + weight_name = weight_name.replace(f'model.layers.{layer}', mtp_prefix) + if "tok_embeddings" in weight_name: + weight_name = weight_name.replace(f'.weight', f'.embedding_weight') + if "shared_head." 
in weight_name: + weight_name = weight_name.replace(f'shared_head.', f'') return weight_name def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): @@ -688,8 +708,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_list = [] w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1.weight" + w1_ms_name = w1_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w1_ms_name) w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2.weight" + w2_ms_name = w2_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w2_ms_name) w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" + w3_ms_name = w3_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w3_ms_name) + for index in range(0, num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, @@ -713,7 +737,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" - w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=2) + w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + self.convert_mtp_weight_name(w_gate_hidden_name) + w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=1) w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(dtype=ms.bfloat16) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, @@ -739,18 +765,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" + w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + self.convert_mtp_weight_name(w_gate_hidden_name) w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, @@ -920,6 +951,25 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=ffn_norm_ms_name, requires_grad=False) + def infer_process_mtp_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + parameter_dict = {} + 
mtp_layer_names = ["embed_tokens.weight", "enorm.weight", "hnorm.weight", "eh_proj.weight", + "shared_head.norm.weight", "shared_head.head.weight"] + head_names = ["eh_proj.weight", "shared_head.head.weight"] + for prefix_name in mtp_layer_names: + hf_name = f"model.layers.{layer_id}.{prefix_name}" + ms_name = self.convert_weight_name(hf_name) + if prefix_name in head_names and not self.config.parallel_config.vocab_emb_dp: + ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) + parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), + name=ms_name, + requires_grad=False) + + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" if layer_id >= 3: @@ -931,6 +981,10 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + # convert mtp shared weights. + if layer_id >= self.num_layers: + self.infer_process_mtp_layer_weight(src_hf_dir, layer_id, hf_weight_map) + def infer_smooth_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_smooth_quant_net_ms_convert_layer_weight""" parameter_dict = {} @@ -1084,17 +1138,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): print(f"gptq-quant param_not_load:{param_not_load}") print(f"gptq-quant ckpt_not_load:{ckpt_not_load}") - def load_safetensors_shard(self, src_hf_dir): + def load_safetensors_shard(self, src_hf_dir, is_mtp_model=False): """deepseek load safetensors and shard """ rank_id = get_rank() param_json_path = "" for file in os.listdir(src_hf_dir): if file.endswith('index.json'): - param_json_path = os.path.join(src_hf_dir, file) - with open(param_json_path, "r") as fp: - hf_weight_map = json.load(fp)['weight_map'] - break + if (self.is_quant and 'quant' in file) or (is_mtp_model and 'quant' not in file): + param_json_path = os.path.join(src_hf_dir, file) + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + break elif file.endswith('_name_map.json'): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: @@ -1107,19 +1162,22 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None - if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant"): + if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant") and \ + not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) - num_layers = self.config.model.model_config.num_layers if quant_method and quant_method == "gptq-pergroup": - self.infer_gptq_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) + self.infer_gptq_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) return if quant_method and quant_method == "smoothquant": - self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, num_layers, hf_weight_map) + self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) return enable_tqdm = rank_id == 0 - for layer_id in 
tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): + mtp_layers = self.config.model.model_config.num_nextn_predict_layers + start_layer = 0 if not is_mtp_model else self.num_layers + end_layer = self.num_layers if not is_mtp_model else self.num_layers + mtp_layers + for layer_id in tqdm(range(start_layer, end_layer), desc="Weight loading", disable=not enable_tqdm): if self.is_quant: self.infer_quant_net_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) else: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 4ba57131f..697e9546c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -114,17 +114,8 @@ class MfModelBase(MsModelBase): return mutable(key_cache), mutable(value_cache) - def forward( - self, - input_ids: Tensor, - positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: + def prepare_inputs(self, input_ids, positions, attn_metadata): key_cache, value_cache = self.get_kvcache() - seq_lens = attn_metadata.seq_lens max_query_len = attn_metadata.max_query_len # When Mutli-Step is enabled with Chunked-Prefill, prefills and @@ -158,6 +149,24 @@ class MfModelBase(MsModelBase): model_inputs["key_cache"] = key_cache model_inputs["value_cache"] = value_cache + return model_inputs, is_prefill + + def update_model_inputs(self, model_inputs, **kwargs): + return model_inputs + + def forward( + self, + input_ids: Tensor, + positions: Tensor, + kv_caches: List[Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[Tensor] = None, + **kwargs + ) -> Union[Tensor, IntermediateTensors]: + model_inputs, is_prefill = self.prepare_inputs(input_ids, positions, attn_metadata) + model_inputs = self.update_model_inputs(model_inputs, **kwargs) + if is_prefill: self.network.phase = "prefill" if not self.set_flags: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index f1bb23615..d6355e429 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -129,6 +129,8 @@ class MsModelBase(): attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, + previous_hidden_states: Optional[Tensor] = None, + spec_step_idx: int = 0, ) -> Union[Tensor, IntermediateTensors]: return self.forward( input_ids, @@ -137,6 +139,8 @@ class MsModelBase(): attn_metadata, intermediate_tensors, inputs_embeds, + previous_hidden_states=previous_hidden_states, + spec_step_idx=spec_step_idx ) def forward( @@ -147,6 +151,7 @@ class MsModelBase(): attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, + **kwargs ) -> Union[Tensor, IntermediateTensors]: raise NotImplementedError diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 90ce30416..2c3c81d45 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -495,7 +495,8 @@ class Qwen2ForCausalLM(MsModelBase): kv_caches: 
List[Tuple[Tensor, Tensor]], attn_metadata: AttentionMetadata, intermediate_tensors: IntermediateTensors = None, - inputs_embeds: Tensor = None + inputs_embeds: Tensor = None, + **kwargs ) -> Union[Tensor, IntermediateTensors]: if attn_metadata.num_prefill_tokens > 0: input_ids = input_ids.expand_dims(0) diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index 09243fcb4..1a9dbe9fe 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -37,6 +37,7 @@ _MINDSPORE_MODELS = { _MINDFORMERS_MODELS = { "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), + "DeepSeekMTPModel": ("deepseek_mtp", "DeepseekV3MTPForCausalLM"), } MindSporeModelRegistry = _ModelRegistry( diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 0273fb873..42759bfeb 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -20,6 +20,7 @@ import contextlib import gc import logging import os +import sys from typing import ( TYPE_CHECKING, Callable, @@ -255,4 +256,9 @@ def convert_np_to_ms_dtype(value): value_dtype = ms.float32 else: value_dtype = ms.bfloat16 - return value_dtype \ No newline at end of file + return value_dtype + +# Replace the directly loaded module in vllm, such as 'from module import xxx' +def update_modules(name, module): + logger.info(f"replace module {name} by {module}") + sys.modules.update({name: module}) diff --git a/vllm_mindspore/worker/spec_decode_worker.py b/vllm_mindspore/worker/spec_decode_worker.py new file mode 100644 index 000000000..91a717cc9 --- /dev/null +++ b/vllm_mindspore/worker/spec_decode_worker.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# ToDo: remove when msadapter supports +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple + +import torch + +from vllm.worker.worker_base import WorkerBase +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeScorer, SpeculativeScores) +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, + CompletionSequenceGroupOutput, ExecuteModelRequest, + HiddenStates, SequenceGroupMetadata, + get_all_seq_ids_and_request_ids) +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) + +from vllm.spec_decode.util import (Timer, create_logprobs_output, + create_sequence_group_output, + get_all_num_logprobs, + get_sampled_token_logprobs, nvtx_range, + split_batch_by_proposal_len) + +# MQAScore is only supported in FLASH_ATTN and eager mode. 
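
The `update_modules` helper added to `utils.py` above works because Python resolves `from pkg.mod import name` through `sys.modules` first, so overwriting an entry makes later imports pick up the replacement; this is how `ms_attn` stands in for `vllm.attention.backends.flash_attn`. A small self-contained illustration, with demo-only module and attribute names:

```python
import sys
import types

# Build a stand-in module the same way vllm_mindspore swaps in ms_attn
# for vllm.attention.backends.flash_attn (names below are made up for the demo).
fake = types.ModuleType("demo_backend")
fake.FlashAttentionMetadata = type("MSAttentionMetadata", (), {})

# Register the stand-in under the name other code imports from.
sys.modules["demo_backend"] = fake

# Any later `from demo_backend import FlashAttentionMetadata` now resolves
# to the stand-in class instead of an original implementation.
from demo_backend import FlashAttentionMetadata
print(FlashAttentionMetadata)
```
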
+def spec_decode_worker_init( + self, + proposer_worker: ProposerWorkerBase, + scorer_worker: WorkerBase, + spec_decode_sampler: SpecDecodeBaseSampler, + disable_mqa_scorer: bool = False, + disable_logprobs: bool = False, + disable_log_stats: bool = False, + metrics_collector: Optional[AsyncMetricsCollector] = None, + disable_by_batch_size: Optional[int] = None, + allow_zero_draft_token_step: Optional[bool] = True, + enable_lm_head_weight_load: Optional[bool] = False, + num_spec_prefill_steps: int = 1, +): + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + scorer_runner = getattr(self.scorer_worker, "model_runner", None) + self.generators = scorer_runner.get_generators( + ) if scorer_runner else None + self.disable_by_batch_size = disable_by_batch_size or float("inf") + self.spec_decode_sampler = spec_decode_sampler + self._allow_zero_draft_token_step = allow_zero_draft_token_step + self._enable_lm_head_weight_load = enable_lm_head_weight_load + self._metrics = AsyncMetricsCollector( + self.spec_decode_sampler + ) if metrics_collector is None else metrics_collector + # Tracks the sequence IDs that received a bonus token ID in + # their last forward pass. Needed only if KV cache is being + # used for token generation such as in the case of MultiStepWorker. + self._seq_with_bonus_token_in_last_step: Set[int] = set() + # Tracks the currently active request ids and the sequence IDs + # corresponding to them + self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) + # Tracks if the proposer worker uses the KV cache or not. + + self.probs_dtype = self.spec_decode_sampler.probs_dtype + self.token_id_dtype = self.spec_decode_sampler.token_id_dtype + # Lazy initialization. + self.scorer: SpeculativeScorer + self.disable_mqa_scorer = False + + # Hidden states from target model to pass to proposer + # in the subsequent step. + self.previous_hidden_states: Optional[HiddenStates] = None + self._disable_logprobs = disable_logprobs + self._disable_log_stats = disable_log_stats + self._num_spec_prefill_steps = num_spec_prefill_steps + +# msadapter does not support to slice tensor with empty index, +# rewrite this method to optimize the performance(almost 2ms) +@nvtx_range("spec_decode_worker._verify_tokens") +def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + + Returns a tuple of Tensors, one for the accepted token ids and one for + the logprobs according to the scoring model. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
+ (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len( + seq_group_metadata_list, proposal_lens_list) + original_indices = spec_indices + non_spec_indices + + proposal_verifier_probs = proposal_scores.probs + bonus_token_ids = proposal_scores.token_ids[:, -1:] + proposal_probs = proposals.proposal_probs + proposal_token_ids = proposals.proposal_token_ids + if non_spec_indices: + # Get probabilities of target model, including bonus tokens. + proposal_verifier_probs = proposal_verifier_probs[spec_indices] + # Get bonus tokens from target model. + bonus_token_ids = bonus_token_ids[spec_indices] + # Get probabilities according to proposal method. + proposal_probs = proposal_probs[spec_indices] + # Get proposed tokens. + proposal_token_ids = proposal_token_ids[spec_indices] + + # Sampler arguments + sampler_extra_kwargs: Dict[str, Any] = {} + if self.generators and isinstance(self.spec_decode_sampler, + SpecDecodeStochasticBaseSampler): + sampler_extra_kwargs["seeded_seqs"] = { + idx: self.generators[sgm.request_id] + for idx, sgm in enumerate(seq_group_metadata_list) + if sgm.sampling_params.seed is not None + } + + accepted_token_ids = self.spec_decode_sampler( + target_with_bonus_probs=proposal_verifier_probs, + bonus_token_ids=bonus_token_ids, + draft_probs=proposal_probs, + draft_token_ids=proposal_token_ids, + **sampler_extra_kwargs, + ) + if non_spec_indices: + # Get non-speculative sampled tokens from target model. + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices].expand(-1, max_proposal_len + 1).clone() + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat([accepted_token_ids, non_spec_token_ids]) + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + logprobs = proposal_scores.logprobs + # B x K+1 x D + hidden_states = proposal_scores.hidden_states + if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + + # Contract hidden states based on accepted tokens + hs_size = hidden_states.shape[-1] + accepted_index = accepted_token_ids + 1 # Convert -1 to 0 + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. 
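
Earlier in `_verify_tokens`, the batch is regrouped into speculative followed by non-speculative rows and then restored to the caller's order with a single indexed assignment (`accepted_token_ids[original_indices] = accepted_token_ids.clone()`). A tiny standalone sketch of that scatter-style reordering, with made-up values:

```python
import torch

# After scoring, rows sit in [spec rows..., non-spec rows...] order;
# original_indices[i] is the position row i should occupy in the caller's batch.
processed = torch.tensor([[11], [22], [33]])
original_indices = [2, 0, 1]

restored = processed.clone()
restored[original_indices] = processed.clone()  # scatter rows back to original positions
print(restored)  # tensor([[22], [33], [11]])
```
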
+ if VLLM_INVALID_TOKEN_ID in accepted_index.tolist(): + hidden_states = hidden_states[accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, hs_size) # b x 1 x d + second_last_token_hidden_states = hidden_states[:, -2] # b x d + hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d + # Store hidden states from target model for subsequent decode step + self.previous_hidden_states = HiddenStates( + hidden_states, terminal_metadata, + second_last_token_hidden_states) + return accepted_token_ids, logprobs + + +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.spec_decode.spec_decode_worker import prepare_prefill_hidden_states + +# the 'where' ops in msadapter does not support condition-only inputs, use nonzero +@nvtx_range("spec_decode_worker._run_no_spec") +def _run_no_spec(self, execute_model_req: ExecuteModelRequest, + skip_proposer: bool) -> List[SamplerOutput]: + """Run a single generation step without any speculation. The input is + sent to the proposer and scorer model so that the KV cache is consistent + between the two. When skip_proposer is True, the proposer model is + not called, meaning that the kv-cache in proposer for requests is not + updated, so they cannot enable spec decode in the rest decoding. + """ + + sampler_output = self.scorer_worker.execute_model(execute_model_req) + assert len(sampler_output) == 1 + sampler_output = sampler_output[0] + + # Store hidden states from target model execution, BxD. + hidden_states = sampler_output.hidden_states + if hidden_states is not None: + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) + hidden_states = hidden_states[ + (sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID).nonzero(as_tuple=True)[0]] + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): + self.previous_hidden_states = HiddenStates( + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) + + if not skip_proposer: + # We prepare the prefill hidden states here so that there no + # additional complexity in worker for spec_decode vs non_spec_decode + # flow and execute_model doesn't need additional modifications. + execute_model_req.previous_hidden_states = \ + prepare_prefill_hidden_states( + sampler_output.prefill_hidden_states) + for i in range(self._num_spec_prefill_steps): + execute_model_req.spec_step_idx = i + self.proposer_worker.execute_model(execute_model_req) + + sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( + execute_model_req=execute_model_req, sampler_output=sampler_output) + if self._disable_logprobs else + [sampler_output]) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. 
+    sampler_output.sampled_token_probs = None
+    sampler_output.sampled_token_ids = None
+    sampler_output.logprobs = None
+    return sampler_output_to_return
+
+
+# The output of 'tensor.max()' is not consistent with torch.
+def _create_output(
+        self,
+        accepted: torch.Tensor,  # [batch_size, k]
+        substitute_token_ids: torch.Tensor,  # [batch_size, k]
+        draft_token_ids: torch.Tensor,  # [batch_size, k]
+        bonus_token_ids: torch.Tensor,  # [batch_size]
+) -> torch.Tensor:
+    """Format output. Returns a matrix of token ids. When
+    a token is rejected via sampling, all subsequent token ids are
+    set to -1 for the sequence.
+
+    Args:
+        accepted: A boolean tensor indicating if the corresponding
+        draft token in draft_token_ids should be accepted or not.
+        substitute_token_ids: A tensor of token_ids that can be used
+        as substitutes for the draft token ids if the proposed token
+        is rejected.
+        draft_token_ids: A tensor of token ids speculated by the
+        draft model.
+        bonus_token_ids: Token ids to use as the bonus token if
+        all the draft tokens are accepted.
+    Returns:
+        A tensor containing the accepted token ids. The shape of the
+        tensor is [batch_size, k + num_bonus_tokens]
+    """
+    # The return type of max is a tuple in msadapter.
+    batch_size, k = substitute_token_ids.shape
+    assert self._num_bonus_tokens == 1  # TODO: only 1 MTP layer is supported, to optimize performance (almost 2 ms)
+
+    # Create an extended output tensor
+    output_with_bonus_tokens = -torch.ones(
+        (batch_size, k + self._num_bonus_tokens),
+        dtype=self.token_id_dtype,
+        device=accepted.device)
+
+    # Fill in the first k columns of the output tensor using masks and data tensors.
+    output_with_bonus_tokens[:, :k] = draft_token_ids * accepted + substitute_token_ids * (~accepted)
+
+    # Fill the last column.
+    # We check output directly as accepted may have True values inconsistent with causal acceptance.
+    # Fill the recovered token ids.
+    output_with_bonus_tokens[:, -1:] = bonus_token_ids * accepted + (-1) * (~accepted)
+
+    self.num_accepted_tokens += accepted.sum()
+    self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum()
+    self.num_draft_tokens += batch_size * k
+
+    return output_with_bonus_tokens
+
+
+# msadapter does not support 'new_full', and the operator 'new_zeros' only supports a list or a tuple as an input.
+from vllm.spec_decode.util import sampler_output_to_torch
+def _merge_outputs(
+        self,
+        batch_size: int,
+        proposal_len: int,
+        maybe_sampler_output: Optional[List[SamplerOutput]],
+        proposal_lens: List[int],
+        nonzero_proposal_len_indices: List[int],
+        sampler_transposed: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """After speculations are produced, merge the speculation results with
+    the skipped sequences.
+    """
+    if maybe_sampler_output is None:
+        # If no speculative tokens, the sampler output will be None.
+        # In this case we return empty proposals.
+        proposal_tokens = torch.tensor(-1,
+                                       dtype=torch.long,
+                                       device=self._device).expand(
+                                           batch_size, proposal_len)
+        proposal_probs = torch.tensor(0,
+                                      dtype=torch.float32,
+                                      device=self._device).expand(
+                                          batch_size, proposal_len,
+                                          self._vocab_size)
+        proposal_lens_tensor = torch.tensor(0,
+                                            dtype=torch.long,
+                                            device=self._device).expand(
+                                                len(proposal_lens))
+        return proposal_tokens, proposal_probs, proposal_lens_tensor
+
+    sampler_output = maybe_sampler_output
+    proposal_tokens, proposal_probs, *_ = sampler_output_to_torch(
+        sampler_output, sampler_transposed)
+
+    # Now, reformat the output GPU tensors such that each sequence has
+    # a proposal.
the proposal can be empty, e.g. [-1, -1, -1] + + # entire_proposal_tokens = proposal_tokens.new_full( + # size=(batch_size, *proposal_tokens.shape[1:]), + # fill_value=-1, + # ) + entire_proposal_tokens = torch.full(size=(batch_size, *proposal_tokens.shape[1:]), fill_value=-1) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = proposal_probs.new_zeros(( + batch_size, + *proposal_probs.shape[1:],) + ) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = ( + entire_proposal_tokens, + entire_proposal_probs, + ) + + proposal_lens_tensor = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len + + return proposal_tokens, proposal_probs, proposal_lens_tensor diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 3ef4717b0..8ce1bc91d 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -34,7 +34,7 @@ from vllm.distributed import ( from vllm.logger import init_logger -from vllm_mindspore.utils import is_mindformers_model_backend +from vllm_mindspore.utils import get_valid_dtype from vllm.model_executor import set_random_seed from vllm.sequence import SequenceGroupMetadata from vllm.sampling_params import SamplingParams @@ -43,9 +43,9 @@ from vllm.sampling_params import SamplingParams logger = init_logger(__name__) -def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefill): +def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefill, is_mtp_model=False): bs = 1 - seq_len = model_config.max_seq_len_to_capture if is_prefill else 1 + seq_len = model_runner.scheduler_config.max_num_batched_tokens if is_prefill else 1 dummy_data = model_runner.input_registry.dummy_data_for_profiling(model_config, seq_len, model_runner.mm_registry) block_tables = [i for i in range(math.ceil(seq_len / cache_engine.block_size))] seqs = [ @@ -66,20 +66,32 @@ def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefi block_tables = model_input.attn_metadata.block_tables if block_tables is not None and block_tables.numel() <= 0: model_input.attn_metadata.block_tables = torch.zeros((1, 1), dtype=torch.int32) - return model_input + + previous_hidden_states = None if not is_mtp_model else \ + torch.ones([bs, seq_len, model_config.get_hidden_size()], dtype=get_valid_dtype(model_config.dtype)) + return model_input, previous_hidden_states def _warm_up_model(self) -> None: # cache_engine is a list with length equal to the size of pipeline-parallel, and only pp=1 is supported. 
kv_cache = self.cache_engine[0].gpu_cache + is_mtp_model = self.speculative_config is not None and self.model_config.hf_config.model_type == "deepseek_mtp" + if is_mtp_model: + # prefill mtp model + model_input, previous_hidden_states = _prepare_input_for_warmup(self.model_config, self.model_runner, + self.cache_engine[0], True, is_mtp_model) + self.model_runner.execute_model(model_input, kv_cache, None, previous_hidden_states=previous_hidden_states) # warmup for decode if self.vllm_config.scheduler_config.is_multi_step: - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, self.cache_engine[0], False) + model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, + self.cache_engine[0], False) self.model_runner._base_model_runner.execute_model(model_input, kv_cache, None) else: - model_input = _prepare_input_for_warmup(self.model_config, self.model_runner, self.cache_engine[0], False) - self.model_runner.execute_model(model_input, kv_cache, None) + model_input, previous_hidden_states = _prepare_input_for_warmup(self.model_config, self.model_runner, + self.cache_engine[0], False, is_mtp_model) + self.model_runner.execute_model(model_input, kv_cache, None, previous_hidden_states=previous_hidden_states) + torch.cuda.synchronize() # Reset the seed to ensure that the random state is not affected by -- Gitee From 54e509c322b52072b0ca40a2b9fae7397dc829d1 Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Sat, 29 Mar 2025 21:02:43 +0800 Subject: [PATCH 59/82] int4 split strategy adapter moe shared-experts --- .../mf_models/deepseekv3_weight_processor.py | 38 +++++++------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 83d1d8482..ae13ab839 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1021,7 +1021,6 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", "kv2l", "routed_experts.router.dense", "routed_experts.router.e_score_correction_bias", - "shared_experts.w_gate_hidden", "shared_experts.w2", "topk_bias"] for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): @@ -1031,24 +1030,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if any([name in param_name for name in no_need_split_layer]): value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden."]]): - if param_name.endswith(".weight"): - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo."]]): - if param_name.endswith(".weight"): - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) - else: - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", + 
"shared_experts.w_gate_hidden"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", + "shared_experts.w2"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) elif ".routed_experts.ffn.w_gate_hidden." in param_name: value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) value_list = [] @@ -1080,9 +1071,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), name=param_name, requires_grad=False) - param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - print(f"gptq-quant param_not_load:{param_not_load}") - print(f"gptq-quant ckpt_not_load:{ckpt_not_load}") + _, _ = ms.load_param_into_net(self.network, parameter_dict) def load_safetensors_shard(self, src_hf_dir): """deepseek load safetensors and shard """ @@ -1098,8 +1087,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): elif file.endswith('_name_map.json'): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: - param_map = json.load(fp) - hf_weight_map = param_map["weight_map"] if "weight_map" in param_map else param_map + hf_weight_map = json.load(fp) + if hf_weight_map.get('weight_map'): + hf_weight_map = hf_weight_map['weight_map'] break if not param_json_path: -- Gitee From b5c4892774f7545b73201a0301b2ab9d5c74163f Mon Sep 17 00:00:00 2001 From: moran Date: Tue, 8 Apr 2025 11:33:43 +0800 Subject: [PATCH 60/82] fix vllm_codecheck tool --- codecheck_toolkits/vllm_codecheck.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh index 7e5e0a286..e67c73723 100644 --- a/codecheck_toolkits/vllm_codecheck.sh +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -1,5 +1,7 @@ pip install -r requirements-lint.txt +RET_FLAG=0 + cd .. # yapf formats code automatically @@ -11,6 +13,7 @@ fi if [[ $? -ne 0 ]]; then echo "yapf run failed." + RET_FLAG=1 else echo "yapf run success." fi @@ -22,6 +25,7 @@ if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.py fi if [[ $? -ne 0 ]]; then echo "codespell check failed." + RET_FLAG=1 else echo "codespell check success." fi @@ -33,6 +37,7 @@ if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.py fi if [[ $? -ne 0 ]]; then echo "ruff check failed." + RET_FLAG=1 else echo "ruff check success." fi @@ -44,19 +49,24 @@ if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.py fi if [[ $? -ne 0 ]]; then echo "isort fixed failed." + RET_FLAG=1 else echo "isort fixed success." fi # mypy check type + +PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - mypy --follow-imports skip --python-version 3.9 "$@" + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" fi if [[ $? -ne 0 ]]; then echo "mypy check failed." + RET_FLAG=1 else echo "mypy check success." 
fi -cd - +cd - || exit $RET_FLAG -- Gitee From 5e4116fe530112724687dffbe03ee7391ff127ef Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Tue, 8 Apr 2025 14:23:20 +0800 Subject: [PATCH 61/82] [CI]update mindformers to adapater jit --- tests/mindformers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mindformers b/tests/mindformers index ed67bae4e..4b50139b4 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit ed67bae4e88fa4d01c91cfbe4dfd822165c75d2f +Subproject commit 4b50139b476981ac23e6bf7634bb6e479f9bbf16 -- Gitee From 7adbe40a70d0ba1305374ade6ae75e2b249030e2 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Tue, 8 Apr 2025 14:33:59 +0800 Subject: [PATCH 62/82] =?UTF-8?q?=E9=87=8D=E6=96=B0=E6=B7=BB=E5=8A=A0sampl?= =?UTF-8?q?er=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/st/python/test_sampler.py | 777 ++++++++++++++++++++++++++++++++ vllm_mindspore/utils.py | 6 +- 2 files changed, 780 insertions(+), 3 deletions(-) create mode 100644 tests/st/python/test_sampler.py diff --git a/tests/st/python/test_sampler.py b/tests/st/python/test_sampler.py new file mode 100644 index 000000000..9f8916ca4 --- /dev/null +++ b/tests/st/python/test_sampler.py @@ -0,0 +1,777 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import vllm_mindspore +import itertools +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from unittest.mock import Mock, patch +from mindspore import mint + +import pytest +import torch +from transformers import GenerationConfig, GenerationMixin + +import vllm.envs as envs + +from vllm_mindspore.model_executor.layers.sampler import Sampler +from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_random_seed +from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.utils import Counter, is_pin_memory_available + +class MockLogitsSampler(Sampler): + + def __init__(self, fake_logits: torch.Tensor): + super().__init__() + self.fake_logits = fake_logits + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + +def _prepare_test( + batch_size: int +) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: + input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) + fake_logits = torch.full((batch_size, VOCAB_SIZE), + 1e-2, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + return input_tensor, fake_logits, sampler + + +VOCAB_SIZE = 32000 +RANDOM_SEEDS = list(range(2)) +CUDA_DEVICES = ['cuda'] + + +def _do_sample( + batch_size: int, + input_tensor: torch.Tensor, + sampler: MockLogitsSampler, + sampling_params: SamplingParams, + device: str, +): + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_greedy(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams(temperature=0) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + expected = torch.argmax(fake_logits, dim=-1) + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == expected[i].item() + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in 
enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed_deterministic(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + first_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + second_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert first_sampler_output == second_sampler_output + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_min_tokens_penalty(seed: int, device: str): + seq_id_counter = Counter(start=random.randint(0, 100)) + set_random_seed(seed) + torch.set_default_device(device) + + def create_sampling_params(min_tokens, + eos_token_id=0, + *, + stop_token_ids: Optional[List[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams( + min_tokens=min_tokens, + max_tokens=9999, # keep higher than max of min_tokens + stop_token_ids=stop_token_ids, + # requesting prompt_logprobs changes the structure of `logits` + prompt_logprobs=prompt_logprobs, + ) + sampling_params.all_stop_token_ids.add(eos_token_id) + return sampling_params + + def create_sequence_data(num_input=3, num_generated=0): + seq_data = SequenceData.from_seqs( + random.choices(range(0, VOCAB_SIZE), k=num_input)) + if num_generated > 0: + seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), + k=num_generated) + return seq_data + + def generate_test_case(): + # generate multiple seq groups but limit total batch size + batch_size = random.randint(1, 128) + + expected_penalization = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] + # 20% chance to generate seq group metadata list with all prompts + is_prompt = random.random() < 0.2 + while batch_size > 0: + num_seqs = 1 if is_prompt else random.randint(1, batch_size) + + eos_token_id = random.randint(0, VOCAB_SIZE - 1) + min_tokens = random.randint(0, 50) + num_stop_tokens = random.randint(0, 8) + if num_stop_tokens > 0: + stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), + k=num_stop_tokens) + else: + stop_token_ids = None + + sampling_params = create_sampling_params( + min_tokens=min_tokens, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids) + + seq_data: 
Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] + for _ in range(num_seqs): + num_input = random.randint(1, 100) + num_generated = 0 if is_prompt else random.randint(1, 100) + seq_data[next(seq_id_counter)] = create_sequence_data( + num_input=num_input, num_generated=num_generated) + seq_group_penalization.append(num_generated < min_tokens) + + expected_penalization.extend(seq_group_penalization) + sequence_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{batch_size}", + is_prompt=is_prompt, + seq_data=seq_data, + sampling_params=sampling_params, + block_tables={}, + )) + batch_size -= num_seqs + + return { + "expected_penalization": expected_penalization, + "seq_group_metadata_list": sequence_metadata_list, + } + + # define some explicit test cases for edge case behavior + prompt_without_penalization = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(0), + block_tables={}, + ), + ] + } + + prompt_with_penalization = { + "expected_penalization": [True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(1), + block_tables={}, + ), + ] + } + + prompt_with_penalization_and_prompt_logprobs = { + "expected_penalization": [False, False, True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=3), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + ] + } + + stop_penalizing_after_min_tokens = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + }, + sampling_params=create_sampling_params(1), + block_tables={}, + ) + ] + } + + stop_token_ids = [42, 99, 42, 0] # intentional duplication + prompt_combination = { + "expected_penalization": [False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_2", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=2), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_3", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params( + 0, stop_token_ids=stop_token_ids), + block_tables={}, + ) + ] + } + + stop_token_ids = [1, 999, 37, 37] # intentional duplication + decode_combination = { + "expected_penalization": [True, False, False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + create_sequence_data(num_generated=100), + }, + sampling_params=create_sampling_params( + 2, stop_token_ids=stop_token_ids), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_2", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=20), + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + 
create_sequence_data(num_generated=10), + }, + sampling_params=create_sampling_params( + 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), + block_tables={}, + ), + ] + } + + if seed == 0: + test_cases = [ + prompt_without_penalization, + prompt_with_penalization, + prompt_with_penalization_and_prompt_logprobs, + stop_penalizing_after_min_tokens, + prompt_combination, + decode_combination, + ] + else: + test_cases = [generate_test_case()] + + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): + assert expected_penalization, \ + "Invalid test case, need expected_penalization" + assert seq_group_metadata_list, \ + "Invalid test case, need seq_group_metadata_list" + + batch_size = 0 + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] + for sgm in seq_group_metadata_list: + sampling_params = sgm.sampling_params + + num_rows = len(sgm.seq_data) + if sgm.is_prompt: + # a prompt seq_group has only one sequence + seq_data = next(iter(sgm.seq_data.values())) + prompt_len = seq_data.get_prompt_len() + seq_lens.append(prompt_len) + + assert sgm.sampling_params is not None + if sgm.sampling_params.prompt_logprobs: + # with prompt_logprobs each token in the prompt has a row in + # logits + num_rows = prompt_len + + batch_size += num_rows + sampling_params_per_row.extend( + itertools.repeat(sampling_params, num_rows)) + + assert len( + expected_penalization + ) == batch_size, \ + ("Invalid test case, expected_penalization does not match computed" + "batch size") + + _, fake_logits, sampler = _prepare_test(batch_size) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens=seq_lens if seq_lens else None, + query_lens=seq_lens if seq_lens else [1] * batch_size, + device=device, + pin_memory=is_pin_memory_available()) + # the logits tensor is modified in-place by the sampler + _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + for logits_idx, (should_penalize, sampling_params) in enumerate( + zip(expected_penalization, sampling_params_per_row)): + + tokens_to_check = sampling_params.all_stop_token_ids + + if should_penalize: + for token_id in tokens_to_check: + assert fake_logits[logits_idx, token_id] == -float( + 'inf' + ), f"Expected token {token_id} for logits row {logits_idx}" + " to be penalized" + # no other tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == -float('inf')) == len( + tokens_to_check + ), f"Expected only {len(tokens_to_check)} to be penalized" + else: + # no tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == + -float('inf')) == 0, "No tokens should have been penalized" + + for test_case in test_cases: + run_test_case(**test_case) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_mixed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + expected_tokens: List[Optional[List[int]]] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + expected: Optional[List[int]] = None + sampling_type = random.randint(0, 2) + if sampling_type == 0: + sampling_params = SamplingParams(temperature=0) + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] + 
elif sampling_type in (1, 2): + n = random.randint(1, 10) + sampling_params = SamplingParams( + temperature=random.random() + 0.1, + top_p=min(random.random() + 0.1, 1), + top_k=random.randint(0, 10) or -1, + n=n, + presence_penalty=random.randint(0, 1), + ) + if sampling_type == 2: + sampling_params.seed = random.randint(0, 10000) + else: + for idx in range(n): + fake_logits[i, i + idx] = 1e2 + expected = list(range(i, i + n)) + + expected_tokens.append(expected) + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + generators: Dict[str, torch.Generator] = {} + + def test_sampling(): + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available(), + generators=generators) + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + for i, (sequence_output, metadata) in enumerate( + zip(sampler_output, seq_group_metadata_list)): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation + expected_tokens[i] = [ + nth_output.output_token + for nth_output in sequence_output.samples + ] + continue + + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + + for n, nth_output in enumerate(sequence_output.samples): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): + # Ensure exact matches for greedy or random with seed + assert nth_output.output_token == expected_tokens_item[n] + else: + # For non-seeded random check that one of the high-logit + # tokens were chosen + assert nth_output.output_token in expected_tokens_item + + # Test batch + test_sampling() + + # Shuffle the batch and resample + target_index = list(range(batch_size)) + for list_to_shuffle in (target_index, seq_group_metadata_list, + expected_tokens, seq_lens): + random.Random(seed).shuffle(list_to_shuffle) + target_index = torch.tensor(target_index) + input_tensor.data = input_tensor.index_select(0, target_index) + fake_logits.data = fake_logits.index_select(0, target_index) + + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch + test_sampling() + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_top_k_top_p(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + top_k = random.randint(100, 500) + top_p = random.random() * 0.1 + vocab_size = 32000 + input_tensor = torch.rand((batch_size, 1024), + device=device, + dtype=torch.float16) + fake_logits = torch.normal(0, + 5, + size=(batch_size, vocab_size), + device=input_tensor.device, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + + generation_model = GenerationMixin() + generation_config = GenerationConfig(top_k=top_k, + top_p=top_p, + do_sample=True) + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + 
generation_model._prepare_special_tokens(generation_config, device=device) + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + device=device) + assert len(processors) == 2 # top_p and top_k + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=SamplingParams( + temperature=1, + top_k=top_k, + top_p=top_p, + ), + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + sample_probs = None + + def mock_sample(probs, *args, **kwargs): + nonlocal sample_probs + sample_probs = probs + return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] + for prob in probs], None) + + # top-k and top-p is only calculated when flashinfer kernel is not available + with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ + patch("vllm.model_executor.layers.sampler." + "flashinfer_top_k_top_p_sampling", None): + sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) + torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) + assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_flashinfer_fallback(seed: int, device: str): + if not envs.VLLM_USE_FLASHINFER_SAMPLER: + pytest.skip("Flashinfer sampler is disabled") + + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + def failing_flashinfer_sampling(*_args, **_kwargs): + return None, torch.zeros(batch_size, device=device, dtype=torch.int32) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + with patch( + "vllm.model_executor.layers.sampler." 
+ "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): + fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert sampler_output == fallback_sampler_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_include_gpu_probs_tensor(device: str): + set_random_seed(42) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + sampler.include_gpu_probs_tensor = True + sampler.should_modify_greedy_probs_inplace = False + + sampling_params = SamplingParams(temperature=0) + + mock_inplace = Mock() + with patch( + "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", + mock_inplace): + + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + mock_inplace.assert_not_called() + + assert sampler_output.sampled_token_probs is not None + assert sampler_output.logprobs is not None + assert sampler_output.sampled_token_ids is not None diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 42759bfeb..f72024a6b 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -89,8 +89,8 @@ def _create_empty_tensor(ms_type): return init_tensor -def _create_dummy_block_tables(): - return ms.ops.zeros((1, 1), dtype=ms.int32) +def _create_dummy_block_tables(dtype): + return ms.ops.zeros((1, 1), dtype=dtype) def make_tensor_with_pad( @@ -114,7 +114,7 @@ def make_tensor_with_pad( pin_memory = False if padded_x.size == 0: - tensor = _create_dummy_block_tables() + tensor = _create_dummy_block_tables(dtype) else: tensor = torch.from_numpy(padded_x) if 
pin_memory: -- Gitee From e925e6f8b0b7a67955c075998d0ced938a76db11 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Wed, 9 Apr 2025 19:54:35 +0800 Subject: [PATCH 63/82] =?UTF-8?q?=E5=90=8E=E5=A4=84=E7=90=86=E4=BC=98?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .jenkins/test/config/dependent_packages.yaml | 2 +- vllm_mindspore/model_executor/layers/sampler.py | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index eb243d6fa..c212c47d9 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -5,7 +5,7 @@ mindspore_gs: 'https://repo.mindspore.cn/mindspore/golden-stick/version/202503/20250322/master_20250322160019_1aa0a919d27c806700b2399bf965c5f6663c10fd_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250403/master_20250403171706_61451a9e1a5909cfa7877f72b1286bc0a843a067_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250410/master_20250410120007_83e7214eb2b9598179135a4e98dce3b69ba27da2_newest/' vllm: 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 836d20810..354fb0214 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -504,6 +504,7 @@ def _random_sample( # Find the maximum n value of the prompt phase requests. sample_idx = 0 results: SampleResultType = [] + random_samples = random_samples.asnumpy() for seq_group in selected_seq_groups: if not seq_group.do_sample: results.append(([], [])) @@ -596,13 +597,6 @@ def _beam_search_sample( return results -def exponential(x, lambd=1.0, *, generator=None): - if generator is not None: - raise ValueError("`generator` can not be supported.") - output = np.random.exponential(scale=lambd, size=x.shape) - return ms.Tensor(output).astype(x.dtype) - - # torch.multinomial forces a GPU<->CPU sync. # Therefore, we use an optimized implementation instead. # Note that we always sample with replacement. 
@@ -617,18 +611,17 @@ def _multinomial( probs = probs.repeat_interleave(num_samples, dim=0) q = torch.empty_like(probs) if seq_groups is None: - q = exponential(q) + q.exponential_() else: sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - q[sample_idx : sample_idx + stride] = exponential( - q[sample_idx : sample_idx + stride] - ) + q[sample_idx : sample_idx + + stride].exponential_(generator=seq_group.generator) sample_idx += stride - return probs.div(q).argmax(axis=1).view(-1, num_samples) + return probs.div_(q).argmax(dim=1).view(-1, num_samples) def _top_k_top_p_multinomial_with_flashinfer( -- Gitee From f0896e8d14473fb59ee9e4c89dce3842bacf6836 Mon Sep 17 00:00:00 2001 From: yangminghai Date: Thu, 10 Apr 2025 15:48:09 +0800 Subject: [PATCH 64/82] add commit info --- setup.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/setup.py b/setup.py index 647dfa531..10b8be5a6 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,14 @@ def get_requirements() -> List[str]: return requirements +def write_commit_id(): + ret_code = os.system("git rev-parse --abbrev-ref HEAD > ./vllm_mindspore/.commit_id " + "&& git log --abbrev-commit -1 >> ./vllm_mindspore/.commit_id") + if ret_code != 0: + sys.stdout.write("Warning: Can not get commit id information. Please make sure git is available.") + os.system("echo 'git is not available while building.' > ./vllm_mindspore/.commit_id") + + version = (Path("vllm_mindspore") / "version.txt").read_text() def _get_ascend_home_path(): @@ -184,10 +192,14 @@ class CustomBuildExt(build_ext): print(f"Moved npu_ops.so to {build_lib_dir}.") shutil.rmtree(kernel_meta_dir) + +write_commit_id() + package_data = { "": [ "*.so", "lib/*.so", + ".commit_id" ] } -- Gitee From 7062ae7400a43b06530a3add84707c6aca89fd8a Mon Sep 17 00:00:00 2001 From: w00521005 Date: Wed, 9 Apr 2025 12:45:32 +0800 Subject: [PATCH 65/82] fix mtp and add ut --- tests/st/python/test_vllm_deepseek_part.py | 41 ++++++++++++ .../models/mf_models/deepseek_mtp.py | 67 +++++++------------ .../models/mf_models/deepseek_v3.py | 2 +- .../mf_models/deepseekv3_weight_processor.py | 1 + .../models/mf_models/mf_model_base.py | 10 +-- .../model_executor/models/mf_models/qwen2.py | 2 +- 6 files changed, 75 insertions(+), 48 deletions(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index a0caa3161..7a5d7635a 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -76,3 +76,44 @@ class TestDeepSeek: # unset env env_manager.unset_all() + + +class TestDeepSeekMTP: + """ + Test DeepseekMTP. + 大模型用量化(4层),mtp模型用浮点(1层,layer 61)。 + mtp的权重加载默认从配置的num_hidden_layer开始,为了支持减层推理场景mtp权重加载,ci服务器上修改了浮点的权重map文件的layer为4。 + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_mtp(self): + """ + test case deepseek mtp with main model of r1-w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.8, tensor_parallel_size=8, + num_speculative_tokens=1) + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list = ['ugs611ాలు哒ాలు mahassisemaSTE的道德'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py index 357259857..dfe9b0d4c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -16,15 +16,13 @@ # limitations under the License. # ============================================================================ -from typing import Iterable, Set, Tuple, Optional +from typing import Iterable, Set, Tuple from vllm.config import VllmConfig from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger -from vllm.model_executor.sampling_metadata import SamplingMetadata -import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable from mindspore.nn.utils import no_init_parameters @@ -36,7 +34,7 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_MLA from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask @@ -47,7 +45,24 @@ class DeepseekV3MTPForCausalLM(MfModelBase): super(DeepseekV3MTPForCausalLM, self).__init__( vllm_config=vllm_config, prefix=prefix ) + self.mf_kvcaches_init = False + self.sampler = get_sampler() + self.set_modules({"model": self.network}) + + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_nextn_predict_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.set_flags = False + + + def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) @@ -57,33 +72,20 @@ class DeepseekV3MTPForCausalLM(MfModelBase): setattr(self.mf_model_config, 'npu_mem_size', -1) self.mf_model_config.is_mtp_model = True - self.mf_model_config.num_nextn_predict_layers = vllm_config.model_config.hf_config.num_nextn_predict_layers + self.mf_model_config.num_nextn_predict_layers = self.model_config.hf_config.num_nextn_predict_layers if self.mf_model_config.num_nextn_predict_layers != 1: raise NotImplementedError("Only support 1 MTP-layer now.") - - self.mf_config.model.model_config = self.mf_model_config - # Initital network - with no_init_parameters(): # Delay initialization - self.network = 
DeepseekV3ForCausalLM_MF(self.mf_model_config) - self.network._jit_config_dict = JitConfig( - jit_level="O0", infer_boost="on" - ).jit_config_dict - self.mf_kvcaches_init = False + self.mf_config.model.model_config = self.mf_model_config - self.sampler = get_sampler() - self.set_modules({"model": self.network}) - self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_nextn_predict_layers)] - compilation_config = get_current_vllm_config().compilation_config + def _create_network(self): + # Initital network + with no_init_parameters(): # Delay initialization + network = DeepseekV3ForCausalLM_MF(self.mf_model_config) - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - for i in range(self.mf_model_config.num_nextn_predict_layers): - compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + return network, network.mtp_model.head - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) - self.set_flags = False def get_kvcache(self): key_cache = [] @@ -105,23 +107,6 @@ class DeepseekV3MTPForCausalLM(MfModelBase): return model_inputs - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - selected_token_indices = sampling_metadata.selected_token_indices - if selected_token_indices is not None and selected_token_indices.numel() <= 0: - logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) - else: - hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.mtp_model.head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) - - return logits - - def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, False) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint, is_mtp_model=True) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 30ea8d905..c6f2b0a97 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -98,7 +98,7 @@ class DeepseekV3ForCausalLM(MfModelBase): if ptq is not None: ptq.apply(network) ptq.convert(network) - return network + return network, network.lm_head def get_kvcache(self): key_cache = [] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index db5575d7f..a3b9bb3ee 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1134,6 +1134,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): for file in os.listdir(src_hf_dir): if file.endswith('index.json'): + # mtp model do not support quantization, needs to load bf16 weight. 
if (self.is_quant and 'quant' in file) or (is_mtp_model and 'quant' not in file): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 777c843c1..cfbba55d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -104,14 +104,14 @@ class MfModelBase(MsModelBase): self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self._generate_model_config() - self.network = self._create_network() + self.network, self.lm_head = self._create_network() self.network.construct = MethodType(ms.jit(self.network.__class__.construct, jit_level='O0', infer_boost='on'), self.network) - self.network.lm_head.construct = MethodType(ms.jit(self.network.lm_head.__class__.construct, - jit_level='O0', infer_boost='on'), - self.network.lm_head) + self.lm_head.construct = MethodType(ms.jit(self.lm_head.__class__.construct, + jit_level='O0', infer_boost='on'), + self.lm_head) @abstractmethod def _generate_model_config(self): @@ -212,7 +212,7 @@ class MfModelBase(MsModelBase): dtype=self.mf_model_config.compute_dtype) else: hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.lm_head(hidden_states) + logits = self.lm_head(hidden_states) logits = logits.reshape(-1, logits.shape[-1]) return logits diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index aa0a7a14e..1288884e8 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -74,7 +74,7 @@ class Qwen2ForCausalLM(MfModelBase): # Initial network with no_init_parameters(): # Delay initialization network = ParallelQwenForCausalLM_MF(self.mf_model_config) - return network + return network, network.lm_head def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: weight_processor = Qwen2WeightProcessor(self.mf_config, self.network, False) -- Gitee From c8dd696885d05a0eb7d9019ed421b7e151f3bb25 Mon Sep 17 00:00:00 2001 From: huzhikun Date: Thu, 10 Apr 2025 16:57:26 +0800 Subject: [PATCH 66/82] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dgather=E7=AE=97?= =?UTF-8?q?=E5=AD=90=E8=B6=8A=E7=95=8C=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/models/mf_models/mf_model_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 777c843c1..c39c8c4a9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -36,6 +36,7 @@ from vllm.logger import init_logger import torch import mindspore as ms from mindspore import Tensor, mutable +from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_context @@ -223,6 +224,7 @@ class MfModelBase(MsModelBase): sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: next_tokens = self.sampler(logits, sampling_metadata) + _pynative_executor.sync() return next_tokens def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: 
-- Gitee From 22d818b199fbdcbe469290799da61834cbb54354 Mon Sep 17 00:00:00 2001 From: huandong Date: Fri, 11 Apr 2025 02:57:10 +0800 Subject: [PATCH 67/82] add MS_MEMPOOL_BLOCK_SIZE for vmm=False --- vllm_mindspore/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index f72024a6b..717416bbc 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -223,6 +223,9 @@ def check_ready(): # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): + set_context(mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") + if is_mindformers_model_backend(): logger.info("Run with Mindformers backend!") necessary_envs = ("MINDFORMERS_MODEL_CONFIG", ) -- Gitee From 0779ec075a84ded9071bd3e18087a65f7f10e34b Mon Sep 17 00:00:00 2001 From: huandong Date: Fri, 11 Apr 2025 03:03:37 +0800 Subject: [PATCH 68/82] del vLLM_MODEL_MEMORY_USE_GB\ASCEND_TOTAL_MEMORY_GB for test --- tests/st/python/test_vllm_deepseek_part.py | 2 -- tests/st/python/test_vllm_mf_qwen_7b.py | 2 -- tests/st/python/test_vllm_mf_qwen_7b_mss.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 7a5d7635a..72ba0b9a3 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -23,8 +23,6 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "vLLM_MODEL_MEMORY_USE_GB": "40", - "ASCEND_TOTAL_MEMORY_GB": "60", "MS_ENABLE_LCCL": "off", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py index e8c71690f..77044ea0c 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -23,8 +23,6 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "vLLM_MODEL_MEMORY_USE_GB": "50", - "ASCEND_TOTAL_MEMORY_GB": "64", "MS_ENABLE_LCCL": "off", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1", diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py index 7983d7a88..51ebee5f5 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -23,8 +23,6 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "vLLM_MODEL_MEMORY_USE_GB": "20", - "ASCEND_TOTAL_MEMORY_GB": "29", "MS_ENABLE_LCCL": "off", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1", -- Gitee From 46acb43595da1dc3d5434fc47f37257e173db8e2 Mon Sep 17 00:00:00 2001 From: w00521005 Date: Fri, 11 Apr 2025 10:46:11 +0800 Subject: [PATCH 69/82] fix mtp ut --- tests/st/python/test_vllm_deepseek_part.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 7a5d7635a..6334f6277 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -101,7 
+101,7 @@ class TestDeepSeekMTP: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", trust_remote_code=True, gpu_memory_utilization=0.8, tensor_parallel_size=8, num_speculative_tokens=1) # Generate texts from the prompts. The output is a list of RequestOutput objects -- Gitee From 951c518bbd1968932332a41a61279b55719eaa7b Mon Sep 17 00:00:00 2001 From: dayschan Date: Tue, 1 Apr 2025 14:32:37 +0800 Subject: [PATCH 70/82] refactor the compilation project of custom ops --- setup.py | 106 ++++++------------ vllm_mindspore/ops/CMakeLists.txt | 40 +++++++ vllm_mindspore/ops/ascendc/CMakeLists.txt | 14 +-- .../adv_step_flash.cpp} | 5 +- vllm_mindspore/ops/module/module.cpp | 6 + vllm_mindspore/ops/module/module.h | 54 +++++++++ 6 files changed, 147 insertions(+), 78 deletions(-) create mode 100644 vllm_mindspore/ops/CMakeLists.txt rename vllm_mindspore/ops/{ascendc/adv_step_flash_adapter.cpp => module/adv_step_flash.cpp} (97%) create mode 100644 vllm_mindspore/ops/module/module.cpp create mode 100644 vllm_mindspore/ops/module/module.h diff --git a/setup.py b/setup.py index 647dfa531..43275db29 100644 --- a/setup.py +++ b/setup.py @@ -26,9 +26,9 @@ from typing import List from pathlib import Path from setuptools import find_packages, setup from setuptools.command.build_ext import build_ext -from setuptools.command.install import install from setuptools import Extension import subprocess +import warnings def load_module_from_path(module_name, path): @@ -93,46 +93,50 @@ version = (Path("vllm_mindspore") / "version.txt").read_text() def _get_ascend_home_path(): return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") +def _get_ascend_env_path(check_exists=True): + env_script_path = os.path.join(_get_ascend_home_path(), "bin", "setenv.bash") + if check_exists and not os.path.exists(env_script_path): + warnings.warn(f"The file '{env_script_path}' is not found, " + "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") + return None + return env_script_path + class CustomBuildExt(build_ext): ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) - ASCENDC_OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops", "ascendc") def build_extension(self, ext): - if ext.name == "ascendc_kernels_npu": - self.build_ascendc_kernels() - elif ext.name == "npu_ops": + if ext.name == "vllm_mindspore.npu_ops": self.build_npu_ops(ext) else: raise ValueError(f"Unknown extension name: {ext.name}") - def build_ascendc_kernels(self): - kernel_so_name = "libascendc_kernels_npu.so" - print(f"Building {kernel_so_name}...") - tmp_build_dir = os.path.join(self.ASCENDC_OPS_DIR, "build") - if os.path.exists(tmp_build_dir): - print(f"Removing existing build directory: {tmp_build_dir}") - shutil.rmtree(tmp_build_dir) - os.makedirs(tmp_build_dir, exist_ok=True) + def build_npu_ops(self, ext): + # "vllm_mindspore.npu_ops" --> "npu_ops" + ext_name = ext.name.split('.')[-1] + so_name = ext_name + ".so" + print(f"Building {so_name} ...") + OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops") + BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ops") + os.makedirs(BUILD_OPS_DIR, exist_ok=True) ascend_home_path = _get_ascend_home_path() - env_script_path = os.path.join(ascend_home_path, "bin", "setenv.bash") - if not os.path.exists(env_script_path): - raise RuntimeError(f"The file '{env_script_path}' is 
not found, " - "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") + env_script_path = _get_ascend_env_path(False) + build_extension_dir = os.path.join(BUILD_OPS_DIR, "kernel_meta", ext_name) # Combine all cmake commands into one string cmake_cmd = ( f"source {env_script_path} && " - f"cmake -S {self.ASCENDC_OPS_DIR} -B {tmp_build_dir} " - f"-DRUN_MODE=npu -DCMAKE_BUILD_TYPE=Debug " - f"-DCMAKE_INSTALL_PREFIX={os.path.join(tmp_build_dir, 'install')} " - f"-DASCEND_CANN_PACKAGE_PATH={ascend_home_path} && " - f"cmake --build {tmp_build_dir} -j --verbose && " - f"cmake --install {tmp_build_dir}" + f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" + f" -DCMAKE_BUILD_TYPE=Release" + f" -DCMAKE_INSTALL_PREFIX={os.path.join(BUILD_OPS_DIR, 'install')}" + f" -DBUILD_EXTENSION_DIR={build_extension_dir}" + f" -DMS_EXTENSION_NAME={ext_name}" + f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path} && " + f"cmake --build {BUILD_OPS_DIR} -j --verbose" ) try: # Run the combined cmake command - print("Running combined CMake commands:") + print(f"Running combined CMake commands:\n{cmake_cmd}") result = subprocess.run(cmake_cmd, cwd=self.ROOT_DIR, text=True, shell=True, capture_output=True) if result.returncode != 0: print("CMake commands failed:") @@ -140,49 +144,16 @@ class CustomBuildExt(build_ext): print(result.stderr) # Print error output raise RuntimeError(f"Combined CMake commands failed with exit code {result.returncode}") except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to build {kernel_so_name}: {e}") - - # Move the generated .so file to the target directory - src_so_path = os.path.join(tmp_build_dir, "lib", kernel_so_name) - lib_dir = os.path.join(self.ROOT_DIR, self.build_lib, "vllm_mindspore", "lib") - dst_so_path = os.path.join(lib_dir, kernel_so_name) - os.makedirs(lib_dir, exist_ok=True) - if os.path.exists(dst_so_path): - os.remove(dst_so_path) - shutil.move(src_so_path, dst_so_path) - print(f"Moved {kernel_so_name} to {lib_dir}.") - # Remove the build directory after building kernels.so - shutil.rmtree(tmp_build_dir) + raise RuntimeError(f"Failed to build {so_name}: {e}") - def build_npu_ops(self, ext): - print("Building npu_ops.so ...") - try: - import mindspore as ms - except ImportError: - print("Mindspore is not found, skip building npu_ops.so") - return - try: - src = [os.path.join(self.ASCENDC_OPS_DIR, s) for s in ext.sources] - build_lib_dir = os.path.join(self.ROOT_DIR, self.build_lib, "vllm_mindspore") - ms.ops.CustomOpBuilder( - "npu_ops", - src, - backend="Ascend", - cflags=f"-I{self.ASCENDC_OPS_DIR}", - ldflags=f"-L{os.path.join(build_lib_dir, 'lib')} -lascendc_kernels_npu -Wl,-rpath,'$$ORIGIN/lib'" - ).load() - except ImportError: - pass - # Move the generated .so file to the target directory - kernel_meta_dir = os.path.join(self.ROOT_DIR, "kernel_meta") - src_so_path = os.path.join(kernel_meta_dir, "npu_ops", "npu_ops.so") - dst_so_path = os.path.join(build_lib_dir, "npu_ops.so") - os.makedirs(build_lib_dir, exist_ok=True) + # Copy the generated .so file to the target directory + src_so_path = os.path.join(build_extension_dir, so_name) + dst_so_path = self.get_ext_fullpath(ext.name) + os.makedirs(os.path.dirname(dst_so_path), exist_ok=True) if os.path.exists(dst_so_path): os.remove(dst_so_path) - shutil.move(src_so_path, build_lib_dir) - print(f"Moved npu_ops.so to {build_lib_dir}.") - shutil.rmtree(kernel_meta_dir) + shutil.copy(src_so_path, dst_so_path) + print(f"Copied {so_name} to {dst_so_path}") package_data = { "": [ @@ -197,11 +168,8 @@ def 
_get_ext_modules(): # As a temporary solution, this is controlled via an environment variable. # Once the CI environment adds support for custom operator compilation, # this should be updated to enable compilation by default. - if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": - ext_modules.append(Extension("ascendc_kernels_npu", sources=[])) - ext_modules.append(Extension("npu_ops", sources=[ - "adv_step_flash_adapter.cpp" - ])) + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on" and _get_ascend_env_path() is not None: + ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[])) # sources are specified in CMakeLists.txt return ext_modules setup( diff --git a/vllm_mindspore/ops/CMakeLists.txt b/vllm_mindspore/ops/CMakeLists.txt new file mode 100644 index 000000000..4c94b2c08 --- /dev/null +++ b/vllm_mindspore/ops/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.16) +project(Ops) + +set(MS_EXTENSION_NAME "" CACHE STRING "Extension Name") +set(BUILD_EXTENSION_DIR "" CACHE STRING "Extension directory") +if (MS_EXTENSION_NAME STREQUAL "") + message(FATAL_ERROR "MS_EXTENSION_NAME must be set. Use -DMS_EXTENSION_NAME=") +endif() +if (BUILD_EXTENSION_DIR STREQUAL "") + message(FATAL_ERROR "BUILD_EXTENSION_DIR must be set. Use -DBUILD_EXTENSION_DIR=") +endif() + +# Build ascendc kernels +add_subdirectory(ascendc) + +# Collect source files +file(GLOB SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/module/*.cpp) + +# Generate a temporary python script file to build custom ops with MindSpore's CustomOpBuilder +set(PYTHON_SCRIPT_PATH "${CMAKE_BINARY_DIR}/build_custom_with_ms.py") +file(WRITE ${PYTHON_SCRIPT_PATH} " +import mindspore as ms +src_files = '${SRC_FILES}'.split(';') +ms.ops.CustomOpBuilder( + name='${MS_EXTENSION_NAME}', + sources=src_files, + backend='Ascend', + cflags='-I${CMAKE_CURRENT_SOURCE_DIR}', + ldflags='-L${ASCENDC_TARGET_DIR} -l${ASCENDC_TARGET_NAME}', + build_dir='${BUILD_EXTENSION_DIR}' +).build() +") + +find_package(Python3 COMPONENTS Interpreter REQUIRED) +add_custom_target( + BuildCustomOp ALL + COMMAND cd ${CMAKE_BINARY_DIR} && ${Python3_EXECUTABLE} ${PYTHON_SCRIPT_PATH} + DEPENDS ${ASCENDC_TARGET_NAME} + COMMENT "Building custom operator with MindSpore" +) diff --git a/vllm_mindspore/ops/ascendc/CMakeLists.txt b/vllm_mindspore/ops/ascendc/CMakeLists.txt index ce4a8d276..d6165987c 100644 --- a/vllm_mindspore/ops/ascendc/CMakeLists.txt +++ b/vllm_mindspore/ops/ascendc/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.16) project(AscendC_Kernels) # Parameters passed from command line or default values -set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(RUN_MODE "npu") set(SOC_VERSION "Ascend910B1" CACHE STRING "system on chip type") set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug") @@ -21,11 +21,11 @@ endif() # Include Ascend CANN CMake file include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) -# Add source files -file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/adv_step_flash.c) +# Collect source files +file(GLOB ASCENDC_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.c) -# Build shared library -ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +# Create an object library +ascendc_library(ascendc_kernels_npu STATIC ${ASCENDC_KERNEL_FILES}) -# Set the output directory -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) \ No newline at end of file +set(ASCENDC_TARGET_NAME ascendc_kernels_npu PARENT_SCOPE) +set(ASCENDC_TARGET_DIR "${CMAKE_BINARY_DIR}/lib" PARENT_SCOPE) diff --git 
a/vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp b/vllm_mindspore/ops/module/adv_step_flash.cpp similarity index 97% rename from vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp rename to vllm_mindspore/ops/module/adv_step_flash.cpp index d72af3e38..803abb0a4 100644 --- a/vllm_mindspore/ops/ascendc/adv_step_flash_adapter.cpp +++ b/vllm_mindspore/ops/module/adv_step_flash.cpp @@ -4,7 +4,8 @@ #include "ms_extension.h" -#include "adv_step_flash.h" +#include "ascendc/adv_step_flash.h" +#include "module/module.h" using BaseTensor = mindspore::tensor::BaseTensor; using BaseTensorPtr = mindspore::tensor::BaseTensorPtr; @@ -91,7 +92,7 @@ void AdvStepFlashAscendC(int32_t num_seqs, int32_t num_queries, int32_t block_si seq_lens = caster.RecoveryTensorDtype(seq_lens, "seq_lens"); } -PYBIND11_MODULE(MS_EXTENSION_NAME, m) { +MS_EXTENSION_MODULE(adv_step_flash) { m.def("adv_step_flash", &AdvStepFlashAscendC, "adv_step_flash_ascendc", pybind11::arg("num_seqs"), pybind11::arg("num_queries"), pybind11::arg("block_size"), pybind11::arg("input_tokens"), pybind11::arg("sampled_token_ids"), pybind11::arg("input_positions"), pybind11::arg("seq_lens"), diff --git a/vllm_mindspore/ops/module/module.cpp b/vllm_mindspore/ops/module/module.cpp new file mode 100644 index 000000000..45ae8c067 --- /dev/null +++ b/vllm_mindspore/ops/module/module.cpp @@ -0,0 +1,6 @@ +#include "module/module.h" + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.doc() = "A custom module for operators"; + ModuleRegistry::Instance().RegisterAll(m); +} diff --git a/vllm_mindspore/ops/module/module.h b/vllm_mindspore/ops/module/module.h new file mode 100644 index 000000000..ef660e12d --- /dev/null +++ b/vllm_mindspore/ops/module/module.h @@ -0,0 +1,54 @@ +#ifndef VLLM_MINDSPORE_OPS_MODULE_MODULE_H +#define VLLM_MINDSPORE_OPS_MODULE_MODULE_H + +#include +#include +#include +#include + +// Define the type of module registration functions +using ModuleRegisterFunction = std::function; + +// Module registry class +class ModuleRegistry { + public: + // Get the singleton instance + static ModuleRegistry &Instance() { + static ModuleRegistry instance; + return instance; + } + + // Register a module function + void Register(const ModuleRegisterFunction &func) { functions_.push_back(func); } + + // Call all registered module functions + void RegisterAll(pybind11::module_ &m) { + for (const auto &func : functions_) { + func(m); + } + } + + private: + ModuleRegistry() = default; + ~ModuleRegistry() = default; + + // Disable copy and assignment + ModuleRegistry(const ModuleRegistry &) = delete; + ModuleRegistry &operator=(const ModuleRegistry &) = delete; + + // Store all registered functions + std::vector functions_; +}; + +// Define a macro to register module functions +#define MS_EXTENSION_MODULE(func) \ + static void func##_register(pybind11::module_ &); \ + namespace { \ + struct func##_registrar { \ + func##_registrar() { ModuleRegistry::Instance().Register(func##_register); } \ + }; \ + static func##_registrar registrar_instance; \ + } \ + static void func##_register(pybind11::module_ &m) + +#endif // VLLM_MINDSPORE_OPS_MODULE_MODULE_H -- Gitee From 14fa91138bda6925af0d23425850778af51f8d96 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Thu, 10 Apr 2025 23:03:42 +0800 Subject: [PATCH 71/82] modify qwen2 for v0.7.3 --- tests/st/python/test_vllm_qwen_7b.py | 74 +++++++++++++++ vllm_mindspore/attention/backends/ms_attn.py | 16 +--- vllm_mindspore/attention/layer.py | 51 +++++++---- .../models/mf_models/deepseek_mtp.py | 3 +- 
.../models/mf_models/deepseek_v3.py | 3 +- .../models/mf_models/mf_model_base.py | 43 --------- .../model_executor/models/mf_models/qwen2.py | 3 +- .../model_executor/models/model_base.py | 58 ++++++++++-- vllm_mindspore/model_executor/models/qwen2.py | 91 ++++++++++++------- 9 files changed, 226 insertions(+), 116 deletions(-) create mode 100644 tests/st/python/test_vllm_qwen_7b.py diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py new file mode 100644 index 000000000..bce75d3e1 --- /dev/null +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -0,0 +1,74 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test vllm qwen.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestQwen: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_vllm_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 499321223..558882cdf 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -23,6 +23,8 @@ from itertools import accumulate from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import os +import numpy as np + import torch from vllm.attention.backends.abstract import ( @@ -55,6 +57,7 @@ import mindspore as ms from mindspore import mutable from mindspore._c_expression import swap_cache + def advance_step_op(sampled_token_ids, model_input, seq_lens_tensor, @@ -390,19 +393,6 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): else: raise AttributeError(f"Invalid attention type {str(attn_type)}") - def keys(self): - return ["num_prefill_tokens", "num_decode_tokens", "slot_mapping", "batch_valid_length", "context_lens", "block_tables"] - - def __getitem__(self, key): - if key == "context_lens": - key = "seq_lens_tensor" - if key == "batch_valid_length": - return mutable(getattr(self, "seq_lens"), dynamic_len=True) - if key == "block_tables": - if getattr(self, key).ndim == 1: - return mutable(getattr(self, key).expand_dims(0)) - return mutable(getattr(self, key)) - return mutable(getattr(self, key)) class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 84335349b..99cdc521e 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -153,15 +153,16 @@ class Attention(nn.Cell): query: Tensor, key: Tensor, value: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: MSMetadata, - num_prefill_tokens: int, + key_cache: Tensor, + value_cache: Tensor, + num_prefill_tokens: bool, num_decode_tokens: int, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, attn_mask: Tensor, + decode_mask:Tensor, ) -> Tensor: """Attention foward, support MHA and GQA. 
@@ -175,13 +176,13 @@ class Attention(nn.Cell): block_tables: shape = [block_size, num_block] """ output = query - key_cache, value_cache = kv_cache cache_out = self.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) query = ops.depend(query, cache_out) if num_prefill_tokens > 0: output = self._run_prefill_forward(query, key, value, attn_mask, batch_valid_length, batch_valid_length) if num_decode_tokens > 0: - output = self._run_decode_forward(query, key_cache, value_cache, block_tables, context_lens) + output = self._run_decode_forward(query, key_cache, value_cache, block_tables,batch_valid_length, + decode_mask, q_seq_lens) return output def _run_prefill_forward( @@ -206,16 +207,18 @@ class Attention(nn.Cell): query = query.view(-1, self.hidden_size_per_partition) key = key.view(-1, self.kv_hidden_size_per_partition) value = value.view(-1, self.kv_hidden_size_per_partition) - _, _, _, output = self.flash_attention(query, - key, - value, - None, - None, - None, - attn_mask, - None, - actual_seq_qlen, - actual_seq_kvlen) + _, _, _, output = self.flash_attention( + query, + key, + value, + None, + None, + None, + attn_mask, + None, + actual_seq_qlen, + actual_seq_kvlen + ) output = output.view(1, -1, self.hidden_size_per_partition) return output @@ -225,7 +228,9 @@ class Attention(nn.Cell): key_cache: Tensor, value_cache: Tensor, block_tables: Tensor, - context_lens: Tensor, + batch_valid_length: Tensor, + decode_mask:Tensor, + q_seq_lens: Tensor, ) -> Tensor: """Decode with PagedAttention. @@ -236,5 +241,15 @@ class Attention(nn.Cell): block_tables: shape = [block_size, num_block] context_lens: shape = [batch_size, ] """ - output = self.paged_attention(query, key_cache, value_cache, block_tables, context_lens) + output = self.paged_attention( + query, + key_cache, + value_cache, + block_tables, + batch_valid_length, + None, + None, + decode_mask, + q_seq_lens + ) return output diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py index dfe9b0d4c..fea96442b 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -34,7 +34,8 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_MLA +from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index c6f2b0a97..af5a34284 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -46,7 +46,8 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_MLA +from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from 
vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 7ac62f49c..b784849ec 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -24,13 +24,10 @@ import numpy as np from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.config import get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.forward_context import ForwardContext, get_forward_context from vllm.sequence import IntermediateTensors from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.attention.backends.abstract import AttentionType from vllm.logger import init_logger import torch @@ -58,34 +55,6 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) -class Fake_Attention: - def __init__(self): - vllm_config = get_current_vllm_config() - block_size = vllm_config.cache_config.block_size - num_kv_heads = vllm_config.model_config.get_num_kv_heads( - vllm_config.parallel_config - ) - head_size = vllm_config.model_config.get_head_size() - num_block = 0 - self.kv_shape = [num_block, block_size, num_kv_heads, head_size] - self.kv_cache = [ - ( - torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"), - torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"), - ) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] - self.attn_type = AttentionType.DECODER - - -class Fake_MLA(Fake_Attention): - def __init__(self): - super().__init__() - vllm_config = get_current_vllm_config() - self.kv_cache = [ - (torch.zeros(self.kv_shape, dtype=ms.bfloat16, device="Ascend"),) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -123,18 +92,6 @@ class MfModelBase(MsModelBase): raise NotImplementedError("Function _create_network should be Implemented!") - def get_kvcache(self): - key_cache = [] - value_cache = [] - forward_context = get_forward_context() - for i in range(self.mf_model_config.num_layers): - k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] - v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] - key_cache.append(k_cache) - value_cache.append(v_cache) - return mutable(key_cache), mutable(value_cache) - - def prepare_inputs(self, input_ids, positions, attn_metadata): key_cache, value_cache = self.get_kvcache() seq_lens = attn_metadata.seq_lens diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 1288884e8..27711b938 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -31,7 +31,8 @@ from research.qwen2_5.infer.qwen2_5 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.model_base import Fake_Attention +from 
vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index d6355e429..7155ee0dc 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -21,16 +21,48 @@ from abc import abstractmethod from typing import Iterable, List, Optional, Set, Tuple, Union, Dict from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig +from vllm.config import VllmConfig, get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.attention.backends.abstract import AttentionType +from vllm.forward_context import get_forward_context + +import torch from mindspore import Tensor, nn, mutable from mindspore import dtype as mstype from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE +class Fake_Attention: + def __init__(self): + vllm_config = get_current_vllm_config() + block_size = vllm_config.cache_config.block_size + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + num_block = 0 + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + ( + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER + + +class Fake_MLA(Fake_Attention): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] class MsModelBase(): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -172,13 +204,13 @@ class MsModelBase(): dyn_key_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) dyn_value_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_kv_cache = mutable((dyn_key_cache, dyn_value_cache)) - dyn_kv_caches = mutable([dyn_kv_cache for _ in range(num_layers)]) + dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) + dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) dyn_num_prefill_tokens = mutable(1) dyn_num_decode_tokens = mutable(0) - dyn_context_lens = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_batch_valid_length = mutable([0, 0, 0], dynamic_len=True) + dyn_batch_valid_length = Tensor(shape=[None, ], dtype=mstype.int32) + dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) dyn_intermediate_tensors = None @@ -187,17 +219,29 @@ class MsModelBase(): self.model.set_inputs( dyn_input_ids, dyn_position_ids, - dyn_kv_caches, + dyn_key_caches, + dyn_value_caches, dyn_num_prefill_tokens, dyn_num_decode_tokens, - dyn_context_lens, dyn_batch_valid_length, + dyn_q_seq_lens, dyn_slot_mapping, dyn_block_tables, 
dyn_intermediate_tensors, dyn_inputs_embeds ) + def get_kvcache(self): + key_cache = [] + value_cache = [] + forward_context = get_forward_context() + for i in range(self.config.num_hidden_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] + key_cache.append(k_cache) + value_cache.append(v_cache) + return mutable(key_cache), mutable(value_cache) + @abstractmethod def compute_logits( self, diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 2c3c81d45..387b7cc7e 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -15,12 +15,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +from vllm.config import get_current_vllm_config from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Iterable if TYPE_CHECKING: from transformers import Qwen2Config else: Qwen2Config = None + +import numpy as np + from mindspore import Parameter, Tensor, mint, nn, jit, mutable from mindspore.common import dtype as mstype @@ -33,8 +37,6 @@ from vllm_mindspore.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm_mindspore.model_executor.layers.logits_processor import \ LogitsProcessor -from vllm.model_executor.layers.quantization import \ - QuantizationConfig from vllm_mindspore.model_executor.layers.rotary_embedding import get_rope from vllm_mindspore.model_executor.layers.sampler import (SamplerOutput, get_sampler) @@ -46,10 +48,12 @@ from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.quantization import \ + QuantizationConfig from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -170,26 +174,27 @@ class Qwen2Attention(nn.Cell): attn_type=attn_type ) self.attn_mask = mint.triu(mint.ones(size=(128, 128), dtype=mstype.bfloat16), 1) + self.hard_mask = Tensor([0], dtype=mstype.bfloat16).reshape(1, 1) @jit def construct( self, positions: Tensor, hidden_states: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: AttentionMetadata, + key_cache: Tensor, + value_cache: Tensor, num_prefill_tokens: int, num_decode_tokens: int, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, ) -> Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = mint.split(qkv, (self.q_size, self.kv_size, self.kv_size), -1) - q, k = self.rotary_emb(positions, q, k, context_lens, num_prefill_tokens) - attn_output = self.attn(q, k, v, kv_cache, num_prefill_tokens, num_decode_tokens, - slot_mapping, batch_valid_length, context_lens, block_tables, self.attn_mask) + q, k = self.rotary_emb(positions, q, k, q_seq_lens, num_prefill_tokens) + attn_output = self.attn(q, k, v, key_cache, value_cache, 
num_prefill_tokens, num_decode_tokens, + slot_mapping, batch_valid_length, q_seq_lens, block_tables, self.attn_mask, self.hard_mask) output, _ = self.o_proj(attn_output) return output @@ -249,13 +254,13 @@ class Qwen2DecoderLayer(nn.Cell): self, positions: Tensor, hidden_states: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: AttentionMetadata, + key_cache: Tensor, + value_cache: Tensor, num_prefill_tokens: int, num_decode_tokens: int, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, residual: Optional[Tensor], ) -> Tuple[Tensor, Tensor]: @@ -268,12 +273,13 @@ class Qwen2DecoderLayer(nn.Cell): hidden_states = self.self_attn( positions, hidden_states, - kv_cache, + key_cache, + value_cache, num_prefill_tokens, num_decode_tokens, slot_mapping, batch_valid_length, - context_lens, + q_seq_lens, block_tables ) @@ -335,13 +341,13 @@ class Qwen2Model(nn.Cell): self, input_ids: Optional[Tensor], positions: Tensor, - kv_caches: List[Tuple[Tensor, Tensor]], - # attn_metadata: AttentionMetadata, + key_caches: List[Tensor], + value_caches: List[Tensor], num_prefill_tokens: int, num_decode_tokens: int, slot_mapping: Tensor, - batch_valid_length: Tuple[int], - context_lens: Tensor, + batch_valid_length: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, @@ -361,12 +367,13 @@ class Qwen2Model(nn.Cell): hidden_states, residual = layer( positions, hidden_states, - kv_caches[i - self.start_layer], + key_caches[i - self.start_layer], + value_caches[i - self.start_layer], num_prefill_tokens, num_decode_tokens, slot_mapping, batch_valid_length, - context_lens, + q_seq_lens, block_tables, residual ) @@ -398,16 +405,16 @@ class Qwen2Model(nn.Cell): # the checkpoint. Skip them. 
continue if (self.quant_config is not None and - (scale_name := self.quant_config.get_cache_scale(name))): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else - loaded_weight[0]) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -484,6 +491,13 @@ class Qwen2ForCausalLM(MsModelBase): self.set_modules({"model": self.model, "lm_head": self.lm_head}) self.set_model_inputs() + self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] + compilation_config = vllm_config.compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(config.num_hidden_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] def get_input_embeddings(self, input_ids: Tensor) -> Tensor: return self.model.get_input_embeddings(input_ids) @@ -498,14 +512,27 @@ class Qwen2ForCausalLM(MsModelBase): inputs_embeds: Tensor = None, **kwargs ) -> Union[Tensor, IntermediateTensors]: + key_cache, value_cache = self.get_kvcache() if attn_metadata.num_prefill_tokens > 0: input_ids = input_ids.expand_dims(0) if attn_metadata.num_decode_tokens > 0: input_ids = input_ids.expand_dims(1) + num_prefill_tokens = mutable(attn_metadata.num_prefill_tokens) + num_decode_tokens = mutable(attn_metadata.num_decode_tokens) + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + block_tables = attn_metadata.block_tables model_output = self.model(input_ids, positions, - kv_caches, - **dict(attn_metadata), + key_cache, + value_cache, + num_prefill_tokens, + num_decode_tokens, + slot_mapping, + batch_valid_length, + q_seq_lens, + block_tables, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds) if attn_metadata.num_prefill_tokens > 0: -- Gitee From b828928ebe88aff42facf3c29a79c17b563de37c Mon Sep 17 00:00:00 2001 From: w00521005 Date: Sat, 12 Apr 2025 15:26:28 +0800 Subject: [PATCH 72/82] fix to load bf16 weight --- .../config/predict_deepseek_r1_671b.yaml | 121 ++++++++++++++++++ .../st/python/test_vllm_deepseek_bf16_part.py | 76 +++++++++++ .../mf_models/deepseekv3_weight_processor.py | 3 +- 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 tests/st/python/config/predict_deepseek_r1_671b.yaml create mode 100644 tests/st/python/test_vllm_deepseek_bf16_part.py diff --git a/tests/st/python/config/predict_deepseek_r1_671b.yaml b/tests/st/python/config/predict_deepseek_r1_671b.yaml new file mode 100644 index 000000000..112375eff --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b.yaml @@ -0,0 +1,121 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: 
"/path/to/deepseekr1/model_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1' + +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + model_parallel: 4 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: False + ffn_concat: True + auto_map: + AutoConfig: deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool 
= false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py new file mode 100644 index 000000000..c19dd14a6 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -0,0 +1,76 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1_bf16(self): + """ + test case deepseek r1 bf16 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. 
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు sic辨hara的开璞 SquaresInsp'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index a3b9bb3ee..953c4db6b 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1135,7 +1135,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): for file in os.listdir(src_hf_dir): if file.endswith('index.json'): # mtp model do not support quantization, needs to load bf16 weight. - if (self.is_quant and 'quant' in file) or (is_mtp_model and 'quant' not in file): + if not self.is_quant or (self.is_quant and 'quant' in file) or \ + (is_mtp_model and 'quant' not in file): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: hf_weight_map = json.load(fp)['weight_map'] -- Gitee From ecaf10ccd92835b600e8bd3e122921a10cee8044 Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 14 Apr 2025 16:20:58 +0800 Subject: [PATCH 73/82] lm head set dynamic input --- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 2 ++ vllm_mindspore/model_executor/models/mf_models/qwen2.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index af5a34284..38a08e115 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -123,6 +123,8 @@ class DeepseekV3ForCausalLM(MfModelBase): weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) return None def get_model_path(self): diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 27711b938..14ce94449 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -82,5 +82,6 @@ class Qwen2ForCausalLM(MfModelBase): weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() - + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) return None -- Gitee From 43fbd69647e9117d64b8d20543d771205a91c2e1 Mon Sep 17 00:00:00 2001 From: Erpim Date: Tue, 15 Apr 2025 21:13:52 +0800 Subject: 
[PATCH 74/82] fix ScheduleConfig verify_args check --- vllm_mindspore/config.py | 78 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 0f20ca17d..e702278ef 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -120,43 +120,43 @@ def _verify_args(self) -> None: "sequences. Please increase max_num_batched_tokens or " "decrease max_model_len.") - if self.max_num_batched_tokens < self.max_num_seqs: + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + if self.max_num_partial_prefills < 1: + raise ValueError( + f"max_num_partial_prefills ({self.max_num_partial_prefills}) " + "must be greater than or equal to 1.") + elif self.max_num_partial_prefills > 1: + if not self.chunked_prefill_enabled: + raise ValueError("Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1.") + + if self.long_prefill_token_threshold > self.max_model_len: raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " - "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") - - if self.num_lookahead_slots < 0: - raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0.") - - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - - if self.max_num_partial_prefills < 1: - raise ValueError( - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " - "must be greater than or equal to 1.") - elif self.max_num_partial_prefills > 1: - if not self.chunked_prefill_enabled: - raise ValueError("Chunked prefill must be enabled to set " - "max_num_partial_prefills > 1.") - - if self.long_prefill_token_threshold > self.max_model_len: - raise ValueError( - "long_prefill_token_threshold " - f"({self.long_prefill_token_threshold}) cannot be greater " - f"than the max_model_len ({self.max_model_len}).") - - if (self.max_long_partial_prefills - < 1) or (self.max_long_partial_prefills - > self.max_num_partial_prefills): - raise ValueError( - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " - "must be greater than or equal to 1 and less than or equal to " - f"max_num_partial_prefills ({self.max_num_partial_prefills}).") + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({self.max_model_len}).") + + if (self.max_long_partial_prefills + < 1) or (self.max_long_partial_prefills + > self.max_num_partial_prefills): + raise ValueError( + f"max_long_partial_prefills ({self.max_long_partial_prefills}) " + "must be greater than or equal to 1 and less than or equal to " + f"max_num_partial_prefills ({self.max_num_partial_prefills}).") -- Gitee From 7ca751ac7ca3c1002652a679215ad031f39dc9f2 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Fri, 11 Apr 2025 15:02:43 +0800 Subject: [PATCH 75/82] 
=?UTF-8?q?=E6=B7=BB=E5=8A=A0pc=E7=89=B9=E6=80=A7st?= =?UTF-8?q?=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/st/python/test_vllm_deepseek_part.py | 8 +- tests/st/python/test_vllm_mf_qwen_7b.py | 8 +- tests/st/python/test_vllm_mf_qwen_7b_mss.py | 8 +- .../test_vllm_mf_qwen_7b_prefix_caching.py | 83 +++++++++++++++++++ 4 files changed, 98 insertions(+), 9 deletions(-) create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 6334f6277..ca4596595 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -1,13 +1,15 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd # Copyright 2024 The vLLM team. -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://wwww.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by application law or agreed to in writing, software +# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py index e8c71690f..5b08b8206 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -1,13 +1,15 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd # Copyright 2024 The vLLM team. -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://wwww.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by application law or agreed to in writing, software +# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py index 7983d7a88..5e7899fbf 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -1,13 +1,15 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd # Copyright 2024 The vLLM team. -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://wwww.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by application law or agreed to in writing, software +# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py new file mode 100644 index 000000000..89ba64c0e --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""test mf qwen prefix caching.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_prefix_caching: + """ + Test qwen7b enable prefix_caching + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_prefix_caching(self): + """ + test case qwen_7b_prefix_caching + """ + + # First prompts. + prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited" + ] + #second prompts, the second prompt is a continuation of the first prompts, make sure prefix caching work. + second_prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited many places" + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, block_size=16, enable_prefix_caching=True, + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. 
+ outputs = llm.generate(prompts, sampling_params) + second_outputs = llm.generate(second_prompts, sampling_params) + except_list=[' many times and each time I have found something new'] + second_except_list=[' to visit, such as the Forbidden City, the'] + for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): + generated_text = output.outputs[i].text + print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + second_generated_text = second_output.outputs[i].text + print(f"Output2 - Prompt: {second_prompts[i]!r}, Generated text: {second_generated_text!r}") + assert second_generated_text == second_except_list[i] + + env_manager.unset_all() -- Gitee From a50083712c25793ee3714b7f928092643e053917 Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 14 Apr 2025 10:29:49 +0800 Subject: [PATCH 76/82] vllm-ms master adapter to mf dev --- tests/mindformers | 2 +- tests/st/python/test_vllm_deepseek_part.py | 8 +- .../test_vllm_mf_qwen_7b_chunk_prefill.py | 89 +++++++++++++++++++ .../python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 86 ++++++++++++++++++ .../models/mf_models/deepseek_v3.py | 15 ++++ .../models/mf_models/mf_model_base.py | 15 ++-- 6 files changed, 200 insertions(+), 15 deletions(-) create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py diff --git a/tests/mindformers b/tests/mindformers index 4b50139b4..544c40095 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 4b50139b476981ac23e6bf7634bb6e479f9bbf16 +Subproject commit 544c4009573051e0e254efab71d212bfc77fc7b2 diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 0573d7b6a..8dfa95635 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -66,13 +66,13 @@ class TestDeepSeek: # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) - except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德'] + except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] # Print the outputs. for i, output in enumerate(outputs): prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert generated_text == except_list[i] + assert generated_text in except_list # unset env env_manager.unset_all() @@ -107,13 +107,13 @@ class TestDeepSeekMTP: # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) - except_list = ['ugs611ాలు哒ాలు mahassisemaSTE的道德'] + except_list = ['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] # Print the outputs. 
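The relaxed golden check used in these DeepSeek cases, written out as a standalone helper for clarity: greedy decoding (`temperature=0.0`, `top_k=1`) is deterministic for a fixed software stack, but argmax ties can resolve differently across kernel and framework versions, so the assertion accepts any of the recorded golden strings. The helper below is an illustrative sketch, not code from the patch.

```python
def assert_golden(generated_text: str, accepted_goldens: list) -> None:
    """Pass if the generated text matches any recorded golden output."""
    assert generated_text in accepted_goldens, (
        f"unexpected output {generated_text!r}; accepted: {accepted_goldens!r}")

assert_golden("foo", ["foo", "bar"])  # passes; any other text raises AssertionError
```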
for i, output in enumerate(outputs): prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert generated_text == except_list[i] + assert generated_text in except_list # unset env env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py new file mode 100644 index 000000000..1523e46bb --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -0,0 +1,89 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill.""" +import pytest +import os +from . import set_env + +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_chunk_prefill: + """ + Test qwen. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_chunk_prefill(self): + """ + test case qwen_7b_chunk_prefill + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": " The city's blend of traditional and modern architecture, vibrant street life, and rich culinary scene " + "make it a truly unique and captivating destination. I am always eager to"}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. 
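A rough sketch of what `enable_chunked_prefill` implies for the long first prompt above: its prefill is split into pieces no larger than `max_num_batched_tokens` (capped at 32 in the `LLM()` constructed just below) and processed over several scheduler steps, so decode requests can be batched in between. The function and the token count used here are illustrative only.

```python
def prefill_chunks(num_prompt_tokens: int, max_num_batched_tokens: int = 32):
    """Return the chunk sizes a single long prefill is split into."""
    chunks = []
    remaining = num_prompt_tokens
    while remaining > 0:
        step = min(remaining, max_num_batched_tokens)
        chunks.append(step)
        remaining -= step
    return chunks

# e.g. a 100-token prompt is prefilled over four scheduler steps instead of one:
assert prefill_chunks(100) == [32, 32, 32, 4]
```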
+ llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.9, tensor_parallel_size=2, + enable_chunked_prefill=True) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py new file mode 100644 index 000000000..6292b22c6 --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -0,0 +1,86 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill, prefix cache, mss.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestMfQwen_cp_pc_mss: + """ + Test qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_cp_pc_mss(self): + """ + test case mf_qwen_7b_cp_pc_mss + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": ""}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. 
I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.9, tensor_parallel_size=2, + enable_chunked_prefill=True, enable_prefix_caching=True, num_scheduler_steps=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for _ in range(3): + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 38a08e115..34ba9e952 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -27,6 +27,7 @@ from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger +import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable from mindspore.common import dtype as msdtype from mindspore.nn.utils import no_init_parameters @@ -55,6 +56,19 @@ from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerT logger = init_logger(__name__) +def set_runtime_kernel_launch_group(): + kernel_launch_group = {'thread_num' : 2, 'kernel_group_num' : 8} + env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) + if env_kernel_launch_group is not None: + pairs = env_kernel_launch_group.split(',') + for pair in pairs: + key, val = pair.split(':') + kernel_launch_group[key] = val + thread_num = int(kernel_launch_group.get('thread_num', 2)) + kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) + ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) + + class DeepseekV3ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(DeepseekV3ForCausalLM, self).__init__( @@ -78,6 +92,7 @@ class DeepseekV3ForCausalLM(MfModelBase): self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False + set_runtime_kernel_launch_group() def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index b784849ec..fc297e658 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -36,7 +36,7 @@ from mindspore import Tensor, mutable from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig -from mindformers.core.context import build_context +from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config from 
vllm_mindspore.model_executor.models.model_base import MsModelBase @@ -55,7 +55,6 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( @@ -63,7 +62,7 @@ class MfModelBase(MsModelBase): ) self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) - build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) + build_mf_context(self.mf_config) build_parallel_config(self.mf_config) self.mf_config.model.model_config.parallel_config = ( self.mf_config.parallel_config @@ -75,13 +74,9 @@ class MfModelBase(MsModelBase): self._generate_model_config() self.network, self.lm_head = self._create_network() - - self.network.construct = MethodType(ms.jit(self.network.__class__.construct, - jit_level='O0', infer_boost='on'), - self.network) - self.lm_head.construct = MethodType(ms.jit(self.lm_head.__class__.construct, - jit_level='O0', infer_boost='on'), - self.lm_head) + affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) + if isinstance(affinity_config, dict): + ms.runtime.set_cpu_affinity(True, affinity_config) @abstractmethod def _generate_model_config(self): -- Gitee From 8c253727627ec4671b8fafd9eef9b3f20b508d04 Mon Sep 17 00:00:00 2001 From: WanYidong Date: Wed, 9 Apr 2025 00:16:47 +0800 Subject: [PATCH 77/82] update cpu communicator and share memory --- .jenkins/test/config/dependent_packages.yaml | 2 +- tests/st/python/test_shm_broadcast.py | 140 +++++++++++++++++++ vllm_mindspore/__init__.py | 8 -- vllm_mindspore/distributed/parallel_state.py | 113 --------------- 4 files changed, 141 insertions(+), 122 deletions(-) create mode 100644 tests/st/python/test_shm_broadcast.py delete mode 100644 vllm_mindspore/distributed/parallel_state.py diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index eb243d6fa..2cf8a822d 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,5 +1,5 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250403/br_infer_deepseek_os_20250403204446_a10f9cf58ea06de7cf6acbec0bde94442992955b_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250417/br_infer_deepseek_os_20250417004508_38b6db6c3039b59153d52d5e353cd01fe774dc93_newest/' mindspore_gs: 'https://repo.mindspore.cn/mindspore/golden-stick/version/202503/20250322/master_20250322160019_1aa0a919d27c806700b2399bf965c5f6663c10fd_newest/' diff --git a/tests/st/python/test_shm_broadcast.py b/tests/st/python/test_shm_broadcast.py new file mode 100644 index 000000000..cfc328810 --- /dev/null +++ b/tests/st/python/test_shm_broadcast.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""test cpu communicator and share memory""" +import pytest +import multiprocessing +import random +import time +from typing import List + +import numpy as np +import torch.distributed as dist + +import vllm_mindspore + +from vllm.distributed.device_communicators.shm_broadcast import MessageQueue +from vllm.distributed.utils import StatelessProcessGroup +from vllm.utils import get_ip, get_open_port, update_environment_variables, get_distributed_init_method + + +def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: + np.random.seed(seed) + sizes = np.random.randint(1, 10_000, n) + # on average, each array will have 5k elements + # with int64, each array will have 40kb + return [np.random.randint(1, 100, i) for i in sizes] + + +def distributed_run(fn, world_size): + number_of_processes = world_size + processes = [] + + port = get_open_port() + distributed_init_method = get_distributed_init_method("127.0.0.1", port) + + for i in range(number_of_processes): + p = multiprocessing.Process(target=fn, args=(distributed_init_method, i, world_size)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def worker_fn_wrapper(fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + def wrapped_fn(distributed_init_method, rank, world_size): + dist.init_process_group( + backend="nccl", + init_method=distributed_init_method, + rank=rank, + world_size=world_size, + ) + fn() + + return wrapped_fn + + +@worker_fn_wrapper +def worker_fn(): + + rank = dist.get_rank() + if rank == 0: + port = get_open_port() + ip = get_ip() + dist.broadcast_object_list([ip, port], src=0) + else: + recv = [None, None] + dist.broadcast_object_list(recv, src=0) + ip, port = recv + + stateless_pg = dist.new_group([0,1,2,3], backend="gloo") + + for pg in [dist.group.WORLD, stateless_pg]: + + writer_rank = 2 + broadcaster = MessageQueue.create_from_process_group( + pg, 40 * 1024, 2, writer_rank) + if rank == writer_rank: + seed = random.randint(0, 1000) + dist.broadcast_object_list([seed], writer_rank) + else: + recv = [None] + dist.broadcast_object_list(recv, writer_rank) + seed = recv[0] # type: ignore + + if pg == dist.group.WORLD: + dist.barrier() + else: + dist.barrier(group=pg) + + # in case we find a race condition + # print the seed so that we can reproduce the error + print(f"Rank {rank} got seed {seed}") + # test broadcasting with about 400MB of data + N = 10_000 + if rank == writer_rank: + arrs = get_arrays(N, seed) + for x in arrs: + broadcaster.broadcast_object(x) + time.sleep(random.random() / 1000) + else: + arrs = get_arrays(N, seed) + for x in arrs: + y = broadcaster.broadcast_object(None) + assert np.array_equal(x, y) + time.sleep(random.random() / 1000) + + if pg == dist.group.WORLD: + dist.barrier() + print("torch distributed passed the test!") + else: + dist.barrier(group=pg) + print("StatelessProcessGroup passed the test!") + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +def test_shm_broadcast(): + distributed_run(worker_fn, 4) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index f1da0cc60..eebc2cc6c 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -151,14 +151,6 @@ 
vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( _get_supported_attention_backends ) -from vllm_mindspore.distributed.parallel_state import ( - init_model_parallel_group, - init_group_coordinator, -) - -vllm.distributed.parallel_state.init_model_parallel_group = init_model_parallel_group -vllm.distributed.parallel_state.GroupCoordinator.__init__ = init_group_coordinator - from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, ) diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py deleted file mode 100644 index 42b10d699..000000000 --- a/vllm_mindspore/distributed/parallel_state.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2024 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -import pickle -from typing import List, Optional, Any, Union - -import numpy as np -import torch -import torch.distributed - -from torch.distributed import Backend - - -def init_model_parallel_group( - group_ranks: List[List[int]], - local_rank: int, - backend: str, - use_message_queue_broadcaster: bool = False, - group_name: Optional[str] = None, -) -> "GroupCoordinator": - from vllm.distributed.parallel_state import GroupCoordinator - - return GroupCoordinator( - group_ranks=group_ranks, - local_rank=local_rank, - torch_distributed_backend=backend, - use_device_communicator=True, - use_message_queue_broadcaster=False, - group_name=group_name, - ) - - -def init_group_coordinator( - self, - group_ranks: List[List[int]], - local_rank: int, - torch_distributed_backend: Union[str, Backend], - use_device_communicator: bool, - use_message_queue_broadcaster: bool = False, - group_name: Optional[str] = None, -): - from vllm.distributed.parallel_state import _get_unique_name, _register_group - from vllm.utils import resolve_obj_by_qualname - - group_name = group_name or "anonymous" - self.unique_name = _get_unique_name(group_name) - _register_group(self) - - self.rank = torch.distributed.get_rank() - self.local_rank = local_rank - self.device_group = None - self.cpu_group = None - - for ranks in group_ranks: - device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend) - # CPU not ready now, use device to communication now. 
- cpu_group = torch.distributed.new_group(ranks, backend="hccl") - if self.rank in ranks: - self.ranks = ranks - self.world_size = len(ranks) - self.rank_in_group = ranks.index(self.rank) - self.device_group = device_group - self.cpu_group = cpu_group - - assert self.cpu_group is not None - assert self.device_group is not None - - from vllm.platforms import current_platform - - # TODO: fix it for other platforms - if current_platform.is_cuda_alike(): - self.device = torch.device(f"cuda:{local_rank}") - else: - self.device = torch.device("cpu") - - self.use_device_communicator = use_device_communicator - - self.device_communicator: DeviceCommunicatorBase = None # type: ignore - if use_device_communicator and self.world_size > 1: - device_comm_cls = resolve_obj_by_qualname( - current_platform.get_device_communicator_cls()) - self.device_communicator = device_comm_cls( - cpu_group=self.cpu_group, - device=self.device, - device_group=self.device_group, - unique_name=self.unique_name, - ) - - from vllm.distributed.device_communicators.shm_broadcast import ( - MessageQueue) - self.mq_broadcaster: Optional[MessageQueue] = None - if use_message_queue_broadcaster and self.world_size > 1: - self.mq_broadcaster = MessageQueue.create_from_process_group( - self.cpu_group, 1 << 22, 6) - - from vllm.platforms import current_platform - self.use_custom_op_call = current_platform.is_cuda_alike() -- Gitee From 1a07c8398aa6ffdd49ab1ceb4be120a12ce7e50e Mon Sep 17 00:00:00 2001 From: wangpingan2 Date: Fri, 18 Apr 2025 15:03:23 +0800 Subject: [PATCH 78/82] fix load json --- .../models/mf_models/deepseekv3_weight_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 953c4db6b..97338bd9d 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1135,8 +1135,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): for file in os.listdir(src_hf_dir): if file.endswith('index.json'): # mtp model do not support quantization, needs to load bf16 weight. 
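The corrected index-file selection in the hunk just below reads more easily as a standalone predicate: the 'quant' index is loaded only for quantized checkpoints, while the plain index is loaded for bf16 checkpoints and also for MTP models, which need bf16 weights even when the main model is quantized. The sketch and the file names in the asserts are made up for illustration.

```python
def should_load_index(file: str, is_quant: bool, is_mtp_model: bool) -> bool:
    """Mirror of the fixed condition for picking a weight-map index file."""
    is_quant_file = 'quant' in file
    return (is_quant_file and is_quant) or \
           (not is_quant_file and (not is_quant or is_mtp_model))

assert should_load_index('model.safetensors.index.json', is_quant=False, is_mtp_model=False)
assert should_load_index('quant_model.safetensors.index.json', is_quant=True, is_mtp_model=False)
assert should_load_index('model.safetensors.index.json', is_quant=True, is_mtp_model=True)
assert not should_load_index('quant_model.safetensors.index.json', is_quant=False, is_mtp_model=False)
```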
- if not self.is_quant or (self.is_quant and 'quant' in file) or \ - (is_mtp_model and 'quant' not in file): + if ('quant' in file and self.is_quant) or \ + ('quant' not in file and (not self.is_quant or is_mtp_model)): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: hf_weight_map = json.load(fp)['weight_map'] -- Gitee From e8806fd75e719cdb486c1923e593efd794826941 Mon Sep 17 00:00:00 2001 From: zhang_xu_hao1230 Date: Wed, 16 Apr 2025 11:03:28 +0800 Subject: [PATCH 79/82] =?UTF-8?q?casual=5Fmask=20=E4=B8=8A=E7=A7=BB?= =?UTF-8?q?=E5=88=B0mfmodelbase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py | 2 -- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 2 -- .../model_executor/models/mf_models/mf_model_base.py | 3 ++- vllm_mindspore/model_executor/models/mf_models/qwen2.py | 2 -- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py index fea96442b..fac2bf20f 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -37,7 +37,6 @@ from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import Fake_MLA from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor -from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -59,7 +58,6 @@ class DeepseekV3MTPForCausalLM(MfModelBase): for i in range(self.mf_model_config.num_nextn_predict_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 38a08e115..03e4719be 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -50,7 +50,6 @@ from vllm_mindspore.model_executor.models.model_base import Fake_MLA from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor -from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -76,7 +75,6 @@ class DeepseekV3ForCausalLM(MfModelBase): for i in range(self.mf_model_config.num_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False def _generate_model_config(self): diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index b784849ec..0fb4b7e14 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -40,6 +40,7 @@ from mindformers.core.context import build_context from mindformers.core.parallel_config 
import build_parallel_config from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -72,8 +73,8 @@ class MfModelBase(MsModelBase): get_tensor_model_parallel_world_size() ) self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - self._generate_model_config() + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.network, self.lm_head = self._create_network() self.network.construct = MethodType(ms.jit(self.network.__class__.construct, diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 14ce94449..18a865c14 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -34,7 +34,6 @@ from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import Fake_Attention from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor -from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask logger = init_logger(__name__) @@ -56,7 +55,6 @@ class Qwen2ForCausalLM(MfModelBase): for i in range(self.mf_model_config.num_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.set_flags = False def _generate_model_config(self): -- Gitee From d0c2447baae113a001c7f802a39999866ad82099 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Wed, 16 Apr 2025 16:27:13 +0800 Subject: [PATCH 80/82] Micro-refactoring and performance optimization --- vllm_mindspore/attention/layer.py | 13 ++-- .../model_executor/layers/logits_processor.py | 5 +- .../model_executor/layers/rotary_embedding.py | 9 ++- .../model_executor/models/model_base.py | 9 +-- vllm_mindspore/model_executor/models/qwen2.py | 74 +++++++++++-------- 5 files changed, 60 insertions(+), 50 deletions(-) diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 99cdc521e..4634727b9 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -155,14 +155,13 @@ class Attention(nn.Cell): value: Tensor, key_cache: Tensor, value_cache: Tensor, - num_prefill_tokens: bool, - num_decode_tokens: int, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], q_seq_lens: Tensor, block_tables: Tensor, attn_mask: Tensor, - decode_mask:Tensor, + decode_mask: Tensor, ) -> Tensor: """Attention foward, support MHA and GQA. 
@@ -178,10 +177,10 @@ class Attention(nn.Cell): output = query cache_out = self.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) query = ops.depend(query, cache_out) - if num_prefill_tokens > 0: + if is_prefill: output = self._run_prefill_forward(query, key, value, attn_mask, batch_valid_length, batch_valid_length) - if num_decode_tokens > 0: - output = self._run_decode_forward(query, key_cache, value_cache, block_tables,batch_valid_length, + else: + output = self._run_decode_forward(query, key_cache, value_cache, block_tables, batch_valid_length, decode_mask, q_seq_lens) return output @@ -229,7 +228,7 @@ class Attention(nn.Cell): value_cache: Tensor, block_tables: Tensor, batch_valid_length: Tensor, - decode_mask:Tensor, + decode_mask: Tensor, q_seq_lens: Tensor, ) -> Tensor: """Decode with PagedAttention. diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index cc0550378..647b4ac83 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -21,8 +21,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import Optional import mindspore.nn as nn -from mindspore import Tensor -from mindspore import mint +from mindspore import Tensor, ops, mint, nn import vllm.envs as envs from vllm.config import get_current_vllm_config @@ -148,7 +147,7 @@ def _prune_hidden_states( # (warmup, profile_run) we might not have selected_token_indices, # so we skip pruning. if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select(0, sampling_metadata.selected_token_indices) + return ops.gather(hidden_states, sampling_metadata.selected_token_indices, 0) else: return hidden_states diff --git a/vllm_mindspore/model_executor/layers/rotary_embedding.py b/vllm_mindspore/model_executor/layers/rotary_embedding.py index 257db72bb..7903702a9 100644 --- a/vllm_mindspore/model_executor/layers/rotary_embedding.py +++ b/vllm_mindspore/model_executor/layers/rotary_embedding.py @@ -156,6 +156,7 @@ class InferRotaryEmbedding(CustomOp): self.freqs_cos = Tensor(freqs_cos, dtype=dtype) self.freqs_sin = Tensor(freqs_sin, dtype=dtype) self.rotary_embedding_op = ops.ApplyRotaryPosEmb(2) + self.gather = ops.Gather() def forward_native( self, @@ -163,14 +164,14 @@ class InferRotaryEmbedding(CustomOp): query: Tensor, key: Tensor, batch_valid_length: Tensor, - num_prefill_tokens: int, + is_prefill: bool, offsets: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: - if num_prefill_tokens > 0: + if is_prefill: return self.rotary_embedding_op(query, key, self.freqs_cos, self.freqs_sin, batch_valid_length) - freqs_cos = self.freqs_cos.index_select(0, positions) - freqs_sin = self.freqs_sin.index_select(0, positions) + freqs_cos = self.gather(self.freqs_cos, positions, 0) + freqs_sin = self.gather(self.freqs_sin, positions, 0) return self.rotary_embedding_op(query, key, freqs_cos, freqs_sin, batch_valid_length) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 7155ee0dc..b97d71526 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -187,7 +187,7 @@ class MsModelBase(): ) -> Union[Tensor, IntermediateTensors]: raise NotImplementedError - def set_model_inputs(self): + def set_model_inputs(self, is_prefill): dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) 
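The `is_prefill` plumbing in this patch replaces the per-call token counts with a cached phase flag: `set_model_inputs(is_prefill)` registers dynamic-shape inputs for one phase, and `forward()` (further down, in `qwen2.py`) re-registers them only when the workload flips between prefill and decode, so the graph is rebuilt on a phase change rather than on every step. A minimal sketch of that switching logic follows; everything except the `set_model_inputs` name is a made-up stand-in.

```python
class PhaseSwitchSketch:
    """Track the current phase and rebuild dynamic inputs only on a flip."""

    def __init__(self, set_model_inputs):
        self.prefill = True                    # assume the first batch is a prefill
        self.set_model_inputs = set_model_inputs
        self.set_model_inputs(self.prefill)

    def on_batch(self, is_prefill: bool):
        if is_prefill != self.prefill:         # phase flipped: re-register inputs
            self.prefill = is_prefill
            self.set_model_inputs(self.prefill)

calls = []
sketch = PhaseSwitchSketch(lambda p: calls.append(p))
for phase in [True, True, False, False, False, True]:
    sketch.on_batch(phase)
assert calls == [True, False, True]            # registered once, then on two flips
```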
dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) @@ -207,8 +207,6 @@ class MsModelBase(): dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) - dyn_num_prefill_tokens = mutable(1) - dyn_num_decode_tokens = mutable(0) dyn_batch_valid_length = Tensor(shape=[None, ], dtype=mstype.int32) dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) @@ -221,11 +219,10 @@ class MsModelBase(): dyn_position_ids, dyn_key_caches, dyn_value_caches, - dyn_num_prefill_tokens, - dyn_num_decode_tokens, + is_prefill, + dyn_slot_mapping, dyn_batch_valid_length, dyn_q_seq_lens, - dyn_slot_mapping, dyn_block_tables, dyn_intermediate_tensors, dyn_inputs_embeds diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 387b7cc7e..32d9da8d9 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -25,7 +25,7 @@ else: import numpy as np -from mindspore import Parameter, Tensor, mint, nn, jit, mutable +from mindspore import Parameter, Tensor, mint, nn, jit, ops from mindspore.common import dtype as mstype @@ -183,8 +183,7 @@ class Qwen2Attention(nn.Cell): hidden_states: Tensor, key_cache: Tensor, value_cache: Tensor, - num_prefill_tokens: int, - num_decode_tokens: int, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], q_seq_lens: Tensor, @@ -192,9 +191,9 @@ class Qwen2Attention(nn.Cell): ) -> Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = mint.split(qkv, (self.q_size, self.kv_size, self.kv_size), -1) - q, k = self.rotary_emb(positions, q, k, q_seq_lens, num_prefill_tokens) - attn_output = self.attn(q, k, v, key_cache, value_cache, num_prefill_tokens, num_decode_tokens, - slot_mapping, batch_valid_length, q_seq_lens, block_tables, self.attn_mask, self.hard_mask) + q, k = self.rotary_emb(positions, q, k, q_seq_lens, is_prefill) + attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, batch_valid_length, + q_seq_lens, block_tables, self.attn_mask, self.hard_mask) output, _ = self.o_proj(attn_output) return output @@ -256,8 +255,7 @@ class Qwen2DecoderLayer(nn.Cell): hidden_states: Tensor, key_cache: Tensor, value_cache: Tensor, - num_prefill_tokens: int, - num_decode_tokens: int, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], q_seq_lens: Tensor, @@ -275,8 +273,7 @@ class Qwen2DecoderLayer(nn.Cell): hidden_states, key_cache, value_cache, - num_prefill_tokens, - num_decode_tokens, + is_prefill, slot_mapping, batch_valid_length, q_seq_lens, @@ -341,10 +338,9 @@ class Qwen2Model(nn.Cell): self, input_ids: Optional[Tensor], positions: Tensor, - key_caches: List[Tensor], + key_caches: List[Tensor], value_caches: List[Tensor], - num_prefill_tokens: int, - num_decode_tokens: int, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tensor, q_seq_lens: Tensor, @@ -369,8 +365,7 @@ class Qwen2Model(nn.Cell): hidden_states, key_caches[i - self.start_layer], value_caches[i - self.start_layer], - num_prefill_tokens, - num_decode_tokens, + is_prefill, slot_mapping, batch_valid_length, q_seq_lens, @@ -490,7 +485,8 @@ class Qwen2ForCausalLM(MsModelBase): self.model.make_empty_intermediate_tensors) self.set_modules({"model": self.model, "lm_head": self.lm_head}) - self.set_model_inputs() + self.prefill = True + self.set_model_inputs(self.prefill) self.kv_caches = 
[Fake_Attention() for i in range(config.num_hidden_layers)] compilation_config = vllm_config.compilation_config @@ -513,12 +509,31 @@ class Qwen2ForCausalLM(MsModelBase): **kwargs ) -> Union[Tensor, IntermediateTensors]: key_cache, value_cache = self.get_kvcache() - if attn_metadata.num_prefill_tokens > 0: - input_ids = input_ids.expand_dims(0) - if attn_metadata.num_decode_tokens > 0: - input_ids = input_ids.expand_dims(1) - num_prefill_tokens = mutable(attn_metadata.num_prefill_tokens) - num_decode_tokens = mutable(attn_metadata.num_decode_tokens) + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. + if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + if is_prefill: + input_ids = ops.expand_dims(input_ids, 0) + if not self.prefill: + self.prefill = True + self.set_model_inputs(self.prefill) + else: + input_ids = ops.expand_dims(input_ids, 1) + if self.prefill: + self.prefill = False + self.set_model_inputs(self.prefill) + slot_mapping = attn_metadata.slot_mapping batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) @@ -527,18 +542,17 @@ class Qwen2ForCausalLM(MsModelBase): positions, key_cache, value_cache, - num_prefill_tokens, - num_decode_tokens, + is_prefill, slot_mapping, batch_valid_length, q_seq_lens, block_tables, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) - if attn_metadata.num_prefill_tokens > 0: - model_output = model_output.squeeze(0) - if attn_metadata.num_decode_tokens > 0: - model_output = model_output.squeeze(1) + intermediate_tensors, + inputs_embeds) + if is_prefill: + model_output = ops.squeeze(model_output, 0) + else: + model_output = ops.squeeze(model_output, 1) return model_output def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: -- Gitee From 992cd103f824ad040b035868005dda0d229080c8 Mon Sep 17 00:00:00 2001 From: ccsszz Date: Mon, 21 Apr 2025 16:33:32 +0800 Subject: [PATCH 81/82] support smooth-quant safetensors split for qkv/ffn concat --- .jenkins/test/config/dependent_packages.yaml | 2 +- ...ict_deepseek_r1_671b_w8a8_smoothquant.yaml | 125 ++++++ .../python/test_vllm_deepseek_smoothquant.py | 77 ++++ .../models/mf_models/deepseek_v3.py | 3 +- .../mf_models/deepseekv3_weight_processor.py | 424 +++++++++++++++--- .../models/mf_models/weight_processor.py | 5 + 6 files changed, 560 insertions(+), 76 deletions(-) create mode 100644 tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml create mode 100644 tests/st/python/test_vllm_deepseek_smoothquant.py diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index efb6f8e25..375119335 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -2,7 +2,7 @@ mindspore: 
'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250417/br_infer_deepseek_os_20250417004508_38b6db6c3039b59153d52d5e353cd01fe774dc93_newest/' mindspore_gs: - 'https://repo.mindspore.cn/mindspore/golden-stick/version/202503/20250322/master_20250322160019_1aa0a919d27c806700b2399bf965c5f6663c10fd_newest/' + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202504/20250424/master_20250424010019_dc3222e266c572dce1070a112aa6e12155a45370_newest/' msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250410/master_20250410120007_83e7214eb2b9598179135a4e98dce3b69ba27da2_newest/' diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml new file mode 100644 index 000000000..f8984e0fd --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml @@ -0,0 +1,125 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: "/path/to/deepseekr1/model_w8a8_smoothquant_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1-W8A8' + +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + model_parallel: 16 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: True + ffn_concat: True + quantization_config: + quant_method: 'smoothquant' + weight_dtype: 'int8' + activation_dtype: 'int8' + auto_map: + AutoConfig: deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + 
first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py new file mode 100644 index 000000000..7582e55b2 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +"""test mf deepseek r1 smoothquant.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "介绍下北京故宫", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert "博物院" in generated_text + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index ae70c3031..e0ede9464 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -215,6 +215,7 @@ class DeepseekV3ForCausalLM(MfModelBase): ptq._config.weight_symmetric = False if 'smoothquant' in quant_type.lower(): # pylint: disable=protected-access - ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden"] + ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", + "routed_experts.ffn.w3"] ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) return ptq diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 97338bd9d..642897ed4 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -25,6 +25,7 @@ from tqdm import tqdm import mindspore as ms from mindspore import dtype from mindspore.communication.management import get_rank +from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor from vllm_mindspore.utils import convert_np_to_ms_dtype @@ -450,6 +451,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".quant_bias" quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + if name == "o_proj" and get_tensor_model_parallel_rank() != 0: + quant_bias_ms_param.fill(0) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) @@ -848,14 +851,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): rope_dim = qk_rope_head_dim + qk_nope_head_dim kv_head_dim = kv_lora_rank + qk_rope_head_dim + qkv_concat = self.config.model.model_config.qkv_concat # q2l_proj q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) q_a_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( - ms.from_numpy(q_a_proj_ms_param).astype(ms.bfloat16), - name=q2l_proj_ms_name, - requires_grad=False) # kv2l kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" @@ -863,10 +863,19 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) - self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.bfloat16), - name=kv2l_ms_name, - requires_grad=False) - + if qkv_concat: + wqkv2l_weight = np.concatenate((q_a_proj_ms_param, kv2l_ms_param), 0) + wqkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l.weight" + self.parameter_dict[wqkv2l_weight_name] = ms.Parameter(ms.from_numpy(wqkv2l_weight).astype(ms.bfloat16), + name=wqkv2l_weight_name, + requires_grad=False) + else: + self.parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.from_numpy(q_a_proj_ms_param).astype(ms.bfloat16), + name=q2l_proj_ms_name, + requires_grad=False) + self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.bfloat16), + name=kv2l_ms_name, + requires_grad=False) # lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) @@ -985,84 +994,351 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if layer_id >= self.num_layers: self.infer_process_mtp_layer_weight(src_hf_dir, layer_id, hf_weight_map) + def smooth_quant_process_route_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_route_ffn_weight""" + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=2) + + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w1_quant_scale = 
f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=2) + + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=2), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=1), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=1), dtype=ms.bfloat16) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), + name=w1_scale_name, requires_grad=False) + 
parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.bfloat16), + name=w1_quant_zp, requires_grad=False) + parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.bfloat16), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + + def smooth_quant_process_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_ffn_weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + 
+ concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), + name=w1_quant_zp, requires_grad=False) + parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + + def smooth_quant_process_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict): + '''smooth_quant_process_qkv_weight''' + qkv_concat = self.config.model.model_config.qkv_concat + # q2l_proj + q2l_weight_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.weight" + q2l_weight_param, _ = self.get_safetensor_from_file(q2l_weight_name, src_hf_dir, hf_weight_map) + q2l_bias_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.quant_bias" + q2l_bias_param, _ = self.get_safetensor_from_file(q2l_bias_name, src_hf_dir, hf_weight_map) + q2l_scale_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.dequant_scale" + q2l_scale_param, _ = self.get_safetensor_from_file(q2l_scale_name, src_hf_dir, hf_weight_map) + + q2l_quant_zp = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_zp" + q2l_quant_scale = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_scale" + q2l_quant_zp_param, _ = self.get_safetensor_from_file(q2l_quant_zp, src_hf_dir, hf_weight_map) + q2l_quant_scale_param, _ = self.get_safetensor_from_file(q2l_quant_scale, src_hf_dir, hf_weight_map) + + kv2l_weight_name = f"model.layers.{layer_id}.attention.kv2l._layer.weight" + kv2l_weight_param, _ = self.get_safetensor_from_file(kv2l_weight_name, 
src_hf_dir, hf_weight_map) + kv2l_bias_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.quant_bias" + kv2l_bias_param, _ = self.get_safetensor_from_file(kv2l_bias_name, src_hf_dir, hf_weight_map) + kv2l_scale_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.dequant_scale" + kv2l_scale_param, _ = self.get_safetensor_from_file(kv2l_scale_name, src_hf_dir, hf_weight_map) + + kv2l_quant_zp = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_zp" + kv2l_quant_scale = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_scale" + kv2l_quant_zp_param, _ = self.get_safetensor_from_file(kv2l_quant_zp, src_hf_dir, hf_weight_map) + kv2l_quant_scale_param, _ = self.get_safetensor_from_file(kv2l_quant_scale, src_hf_dir, hf_weight_map) + + if qkv_concat: + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + + qkv2l_weight = np.concatenate((q2l_weight_param, kv2l_weight_param), 0) + parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.Tensor(qkv2l_weight, ms.int8), name=qkv2l_weight_name, + requires_grad=False) + qkv2l_bias = np.concatenate((q2l_bias_param, kv2l_bias_param), 0) + parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.Tensor(qkv2l_bias, ms.int32), name=qkv2l_bias_name, + requires_grad=False) + qkv2l_scale = np.concatenate((q2l_scale_param, kv2l_scale_param), 0) + parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.Tensor(qkv2l_scale, ms.float32), name=qkv2l_scale_name, + requires_grad=False) + parameter_dict[qkv2l_quant_zp_name] = ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), + name=qkv2l_quant_zp_name, requires_grad=False) + parameter_dict[qkv2l_quant_scale_name] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.bfloat16), + name=qkv2l_quant_scale_name, requires_grad=False) + else: + parameter_dict[q2l_weight_name] = ms.Parameter(ms.Tensor(q2l_weight_param, ms.int8), name=q2l_weight_name, + requires_grad=False) + parameter_dict[kv2l_weight_name] = ms.Parameter(ms.Tensor(kv2l_weight_param, ms.int8), + name=kv2l_weight_name, requires_grad=False) + parameter_dict[q2l_bias_name] = ms.Parameter(ms.Tensor(q2l_bias_param, ms.int32), name=q2l_bias_name, + requires_grad=False) + parameter_dict[kv2l_bias_name] = ms.Parameter(ms.Tensor(kv2l_bias_param, ms.int32), name=kv2l_bias_name, + requires_grad=False) + parameter_dict[q2l_scale_name] = ms.Parameter(ms.Tensor(q2l_scale_param, ms.float32), name=q2l_scale_name, + requires_grad=False) + parameter_dict[kv2l_scale_name] = ms.Parameter(ms.Tensor(kv2l_scale_param, ms.float32), + name=kv2l_scale_name, requires_grad=False) + parameter_dict[q2l_quant_zp] = ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), name=q2l_quant_zp, + requires_grad=False) + parameter_dict[kv2l_quant_zp] = ms.Parameter(ms.Tensor(kv2l_quant_zp_param, ms.int8), name=kv2l_quant_zp, + requires_grad=False) + parameter_dict[q2l_quant_scale] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.bfloat16), + name=q2l_quant_scale, requires_grad=False) + parameter_dict[kv2l_quant_scale] = ms.Parameter(ms.Tensor(kv2l_quant_scale_param, ms.bfloat16), + name=kv2l_quant_scale, requires_grad=False) + + def infer_smooth_quant_row_linear_split(self, param_name, src_hf_dir, hf_weight_map): 
+ '''infer_smooth_quant_row_linear_split''' + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif "quant_op" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + if "wo._layer.matmul.quant_bias" in param_name and get_tensor_model_parallel_rank() != 0: + value.fill(0) + return value + + def infer_smooth_quant_get_value(self, param_name, src_hf_dir, hf_weight_map, no_need_split_layer): + '''infer_smooth_quant_get_value''' + + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj."]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + return value + def infer_smooth_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): - """infer_smooth_quant_net_ms_convert_layer_weight""" + '''infer_smooth_quant_net_ms_convert_layer_weight''' parameter_dict = {} - no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", - "kv2l", "routed_experts.router.dense", + no_need_split_layer = ["tok_embeddings", "norm", "routed_experts.router.dense", "routed_experts.router.e_score_correction_bias", "topk_bias"] - for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): + for layer_id in tqdm(range(num_layers), desc="qkv/ffn params load"): + if layer_id >= 3: + self.smooth_quant_process_route_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.routed_experts.ffn") + self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.shared_experts") + + else: + self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward") + self.smooth_quant_process_qkv_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict) + + skip_layer = ["feed_forward.routed_experts.ffn.w1", "feed_forward.shared_experts.w1", "feed_forward.w1", + "feed_forward.routed_experts.ffn.w3", 
"feed_forward.shared_experts.w3", "feed_forward.w3", + "feed_forward.routed_experts.ffn.w_gate_hidden", "feed_forward.shared_experts.w_gate_hidden", + "feed_forward.w_gate_hidden", "attention.kv2l", "attention.q2l_proj", "attention.qkv2l"] + + for param_name, _ in tqdm(hf_weight_map.items(), desc="remaining params load"): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue - if any([name in param_name for name in no_need_split_layer]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", - "shared_experts.w_gate_hidden"]]): - if param_name.endswith(".weight") or "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) - elif "quant_op" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w_gate_hidden." in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) - value = np.stack(value_list, axis=0) - elif "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w2" in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - elif "lm_head" in param_name: - if not self.config.parallel_config.vocab_emb_dp: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - else: - raise ValueError(f"not found layer {param_name}, please check safetensors file.") + if any([name in param_name for name in skip_layer]): + continue + value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) dst_dtype = convert_np_to_ms_dtype(value) parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), - name=param_name, 
requires_grad=False) + name=param_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) print(f"smoothquant param_not_load:{param_not_load}") diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 8edc0b79b..9b0aab3a1 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -72,6 +72,11 @@ class BaseWeightProcessor: start = self.rank_id * split_size stop = (self.rank_id + 1) * split_size split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 -- Gitee From b7da6aba30709cea8e17ca7f759ca5db54538504 Mon Sep 17 00:00:00 2001 From: fengyixing Date: Sun, 27 Apr 2025 10:27:18 +0800 Subject: [PATCH 82/82] test pipeline --- tests/st/python/test_vllm_deepseek_part.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 8dfa95635..8cb7fe842 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -7,7 +7,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee
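
For readers reviewing these hunks without a MindSpore environment, the sketch below illustrates two operations that recur in the patch: fusing the `q2l` and `kv2l` weights into a single `qkv2l` parameter along axis 0 when `qkv_concat` is enabled, and the new `split_axis == 2` tensor-parallel slicing added to `BaseWeightProcessor.split_weight_by_rank`. It is a NumPy-only approximation; the shapes, dimensions, and the helper name are assumptions for illustration, not part of the patch.

```python
# NumPy-only sketch; shapes and the helper name are assumptions for illustration.
import numpy as np


def split_by_rank(np_data, rank_id, tp_group_size, split_axis):
    """Return this rank's shard of np_data along split_axis (mirrors the 0/1/2 axis handling)."""
    split_size = np_data.shape[split_axis] // tp_group_size
    start, stop = rank_id * split_size, (rank_id + 1) * split_size
    if split_axis == 0:
        return np_data[start:stop]
    if split_axis == 1:
        return np_data[:, start:stop]
    if split_axis == 2:  # new branch in this patch: slice the last axis of 3-D routed-expert weights
        return np_data[:, :, start:stop]
    raise ValueError("split_axis:{} is not supported.".format(split_axis))


# qkv_concat path: stack q_a_proj and kv_a_proj_with_mqa weights along axis 0,
# as the qkv2l hunks do before wrapping the result in an ms.Parameter.
q2l = np.zeros((1536, 7168), dtype=np.int8)   # assumed q_lora_rank x hidden_size
kv2l = np.zeros((576, 7168), dtype=np.int8)   # assumed (kv_lora_rank + qk_rope_head_dim) x hidden_size
qkv2l = np.concatenate((q2l, kv2l), axis=0)
assert qkv2l.shape == (2112, 7168)

# split_axis == 2 path: shard a (num_experts, in_features, out_features) expert weight across 4 ranks.
experts = np.zeros((8, 16, 32), dtype=np.float32)
shard = split_by_rank(experts, rank_id=1, tp_group_size=4, split_axis=2)
assert shard.shape == (8, 16, 8)
```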