From 5ffe553a70cc85343af4f931245391ba43ac8539 Mon Sep 17 00:00:00 2001 From: jerry Date: Fri, 23 Jan 2026 15:39:38 +0800 Subject: [PATCH] vllm_mindspore master support sparse quant algorithm (w8a8sc) --- vllm_mindspore/__init__.py | 5 ++-- vllm_mindspore/config.py | 2 +- .../quantization/golden_stick/__init__.py | 2 +- .../quantization/golden_stick/a8w8sc.py | 2 +- .../quantization/golden_stick/golden_stick.py | 4 ++-- .../layers/quantization/quant_ops.py | 2 +- .../model_loader/sparse_quant_loader.py | 2 +- vllm_mindspore/model_executor/models/qwen2.py | 23 +++---------------- .../models/sparse_quant_weight_loader.py | 2 +- vllm_mindspore/utils.py | 2 +- 10 files changed, 14 insertions(+), 32 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index bf144e79..7fc8bcb3 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -211,8 +211,7 @@ vllm.model_executor.model_loader.weight_utils.get_quantization_config = ( # Import golden_stick module to trigger sparse quantization support setup # The setup is done automatically in golden_stick/__init__.py -from vllm_mindspore.model_executor.layers.quantization.golden_stick import ( # noqa: F401 - GoldenStickConfig, ModelSlimConfig) +import vllm_mindspore.model_executor.layers.quantization.golden_stick # noqa: F401 from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, ) diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 7dd5dbf2..6ea80611 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -3,7 +3,7 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/v0.8.3/vllm/config.py # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # Copyright 2024-2025 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm_mindspore/model_executor/layers/quantization/golden_stick/__init__.py b/vllm_mindspore/model_executor/layers/quantization/golden_stick/__init__.py index 398902a4..8ab5b93c 100644 --- a/vllm_mindspore/model_executor/layers/quantization/golden_stick/__init__.py +++ b/vllm_mindspore/model_executor/layers/quantization/golden_stick/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/vllm_mindspore/model_executor/layers/quantization/golden_stick/a8w8sc.py b/vllm_mindspore/model_executor/layers/quantization/golden_stick/a8w8sc.py index 31dadacf..63f20e30 100644 --- a/vllm_mindspore/model_executor/layers/quantization/golden_stick/a8w8sc.py +++ b/vllm_mindspore/model_executor/layers/quantization/golden_stick/a8w8sc.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/quantization/golden_stick/golden_stick.py b/vllm_mindspore/model_executor/layers/quantization/golden_stick/golden_stick.py index 209f1e90..66c98551 100644 --- a/vllm_mindspore/model_executor/layers/quantization/golden_stick/golden_stick.py +++ b/vllm_mindspore/model_executor/layers/quantization/golden_stick/golden_stick.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -148,10 +148,10 @@ class GoldenStickConfig(QuantizationConfig): @staticmethod def get_config_filenames() -> list[str]: + # quant_model_description_w8a8sc.json is for sparse quant (W8A8SC) return [ "quantization_description.json", "quant_model_description.json", "quant_model_description_w8a8sc.json" - # Added for sparse quantization config ] @classmethod diff --git a/vllm_mindspore/model_executor/layers/quantization/quant_ops.py b/vllm_mindspore/model_executor/layers/quantization/quant_ops.py index 16dd5ab9..0281a6b8 100644 --- a/vllm_mindspore/model_executor/layers/quantization/quant_ops.py +++ b/vllm_mindspore/model_executor/layers/quantization/quant_ops.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/model_loader/sparse_quant_loader.py b/vllm_mindspore/model_executor/model_loader/sparse_quant_loader.py index d026001e..84825178 100644 --- a/vllm_mindspore/model_executor/model_loader/sparse_quant_loader.py +++ b/vllm_mindspore/model_executor/model_loader/sparse_quant_loader.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 31aad66e..b21801d6 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -2,7 +2,7 @@ # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py -# Copyright 2025 Huawei Technologies Co., Ltd. 
+# Copyright 2026 Huawei Technologies Co., Ltd. # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. @@ -363,24 +363,6 @@ class Qwen2Model(nn.Cell): hidden_states, residual = self.norm(hidden_states, residual) return hidden_states, residual - def _load_split_weights(self, weights: Iterable[tuple[str, Tensor]], - params_dict: dict[str, Parameter]) -> set[str]: - """Load sparse quantized weights directly without sharding. - - Weights are already partitioned by rank folders, so load them - directly without any sharding operations. This method delegates - to the common sparse quantized weight loader utility. - - Args: - weights: Iterable of (name, weight) tuples - params_dict: Dictionary of parameter names to Parameter objects - - Returns: - Set of loaded parameter names - """ - return load_split_weights(weights, params_dict, self.config, - self.quant_config) - def load_weights(self, weights: Iterable[tuple[str, Tensor]], params_dict: dict[str, Parameter]): # Check if sparse quantization is enabled via rank-level config @@ -397,7 +379,8 @@ class Qwen2Model(nn.Cell): isinstance(v, str) and v.lower() == "w8a8s" for v in sparse_config.values()) if has_sparse_quant: - return self._load_split_weights(weights, params_dict) + return load_split_weights(weights, params_dict, + self.config, self.quant_config) loaded_params: set[str] = set() stacked_params_mapping = [ diff --git a/vllm_mindspore/model_executor/models/sparse_quant_weight_loader.py b/vllm_mindspore/model_executor/models/sparse_quant_weight_loader.py index 29dd8d7e..bbaee2cb 100644 --- a/vllm_mindspore/model_executor/models/sparse_quant_weight_loader.py +++ b/vllm_mindspore/model_executor/models/sparse_quant_weight_loader.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index f8fbf042..6d6a8a61 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -3,7 +3,7 @@ # Adapted from # https://github.com/vllm-project/vllm/blob/v0.8.3/vllm/utils.py # -# Copyright 2025 Huawei Technologies Co., Ltd. +# Copyright 2026 Huawei Technologies Co., Ltd. # Copyright 2024-2025 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); -- Gitee